[TensorExpr] Make 'Tensor' a value type. (#63586)

author Mikhail Zolotukhin <mvz@fb.com>
Tue, 24 Aug 2021 07:29:22 +0000 (00:29 -0700)
committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Tue, 24 Aug 2021 07:32:13 +0000 (00:32 -0700)
diff --git a/benchmarks/cpp/tensorexpr/bench_approx.cpp b/benchmarks/cpp/tensorexpr/bench_approx.cpp

index 6e31697..92c2640 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_approx.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_approx.cpp
@@ -10,14 +10,14 @@
  using namespace torch::jit;
  using namespace torch::jit::tensorexpr;
  
-void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) {
+void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor target, int width) {
    auto loops = ln->getLoopStmtsFor(target);
    ForPtr inner, tail;
    ln->splitWithTail(loops[0], width, &inner, &tail);
    ln->vectorize(inner);
  }
  
-void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) {
+void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
    std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
    ForPtr inner, tail;
    ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
@@ -33,7 +33,7 @@ static void relu_nnc(benchmark::State& state) {
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
    auto clamp = 0;
-  torch::jit::tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i){
+  torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i){
      auto A_elem = [&]() {
        auto elem = A.load(i);
        auto min = FloatImm::make(clamp);
@@ -67,7 +67,7 @@ static void log_nnc_sleef(benchmark::State& state) {
    KernelScope ks;
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
        Compute("B", {N}, [&](const VarHandle& i) {
          return log(A.load(i));
        });
@@ -97,7 +97,7 @@ static void log_nnc_fast(benchmark::State& state) {
    KernelScope ks;
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
        Compute("B", {N}, [&](const VarHandle& i) {
          return fast_log(A.load(i));
        });
@@ -127,7 +127,7 @@ static void log_nnc_vml(benchmark::State& state) {
    KernelScope ks;
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
        Compute("B", {N}, [&](const VarHandle& i) {
          return log_vml(A.load(i));
        });
@@ -168,7 +168,7 @@ static void logit_nnc_sleef(benchmark::State& state) {
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
    auto clamp = 1e-6f;
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
      auto A_elem = [&]() {
        auto elem = A.load(i);
        auto min = FloatImm::make(clamp);
@@ -205,7 +205,7 @@ static void logit_nnc_fast(benchmark::State& state) {
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
    auto clamp = 1e-6f;
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
      auto A_elem = [&]() {
        auto elem = A.load(i);
        auto min = FloatImm::make(clamp);
@@ -242,7 +242,7 @@ static void logit_nnc_vml(benchmark::State& state) {
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
    auto clamp = 1e-6f;
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
      auto A_elem = [&]() {
        auto elem = A.load(i);
        auto min = FloatImm::make(clamp);
@@ -319,7 +319,7 @@ static void tanh_nnc_fast(benchmark::State& state) {
    KernelScope ks;
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  torch::jit::tensorexpr::Tensor* B =
+  torch::jit::tensorexpr::Tensor B =
        Compute("B", {N}, [&](const VarHandle& i) {
          return fast_tanh(A.load(i));
        });
diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp

index 872594e..85bf9d3 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp
@@ -84,7 +84,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
    VarHandle eps("eps", kFloat);
  
    using axis = const VarHandle&;
-  Tensor* output = Compute(
+  Tensor output = Compute(
        "output",
        {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
        [&](axis n, axis c, axis h, axis w) {
@@ -147,7 +147,7 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
    VarHandle eps("eps", kFloat);
  
    using axis = const VarHandle&;
-  Tensor* output = Compute(
+  Tensor output = Compute(
        "output",
        {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}},
        [&](axis n, axis c, axis h, axis w) {
diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp

index 245d5d8..50d54e5 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_compile.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp
@@ -13,24 +13,24 @@ static void BM_CompileSwish(benchmark::State& state) {
      te::KernelScope ks;
      te::VarHandle n("n", te::kInt);
      te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
-    te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
+    te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
        return te::Max::make(A.load(i), 0.f, false);
      });
-    te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return te::Min::make(relu->load(i), 6.f, false);
+    te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return te::Min::make(relu.load(i), 6.f, false);
      });
-    te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return min6->load(i) + 3.f;
+    te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return min6.load(i) + 3.f;
      });
-    te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return A.load(i) * plus3->load(i);
+    te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return A.load(i) * plus3.load(i);
      });
-    te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
-      return times->load(i) * 1.f / 6.f;
+    te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
+      return times.load(i) * 1.f / 6.f;
      });
      te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
      for (auto tensor : {relu, min6, plus3, times}) {
-      nest.computeInline(tensor->buf());
+      nest.computeInline(tensor.buf());
      }
      nest.prepareForCodegen();
      te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
@@ -43,24 +43,24 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
    te::KernelScope ks;
    te::VarHandle n("n", te::kInt);
    te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
-  te::Tensor* relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
+  te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
      return te::Max::make(A.load(i), 0.f, false);
    });
-  te::Tensor* min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return te::Min::make(relu->load(i), 6.f, false);
+  te::Tensor min6 = te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return te::Min::make(relu.load(i), 6.f, false);
    });
-  te::Tensor* plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return min6->load(i) + 3.f;
+  te::Tensor plus3 = te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return min6.load(i) + 3.f;
    });
-  te::Tensor* times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return A.load(i) * plus3->load(i);
+  te::Tensor times = te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return A.load(i) * plus3.load(i);
    });
-  te::Tensor* sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
-    return times->load(i) * 1.f / 6.f;
+  te::Tensor sixth = te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) {
+    return times.load(i) * 1.f / 6.f;
    });
    te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth});
    for (auto tensor : {relu, min6, plus3, times}) {
-    nest.computeInline(tensor->buf());
+    nest.computeInline(tensor.buf());
    }
    nest.prepareForCodegen();
    te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp

index cb9aa84..856065d 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_concat.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp
@@ -60,7 +60,7 @@ class ConcatBench : public benchmark::Fixture {
            {input_sizes_[i][0], input_sizes_[i][1]}));
      }
  
-    Tensor* output = Compute(
+    Tensor output = Compute(
          "aten_cat",
          {{output_size_[0], "M"}, {output_size_[1], "N"}},
          [&](const VarHandle& m, const VarHandle& n) {
@@ -147,7 +147,7 @@ class ConcatBench : public benchmark::Fixture {
        for_stmts[i] = for_st;
        cumulative_input_sizes += input_sizes_[i][1];
      }
-    auto output = new Tensor(output_buf, alloc<Block>(for_stmts));
+    auto output = Tensor(output_buf, alloc<Block>(for_stmts));
  
      LoopNest nest({output});
      nest.prepareForCodegen();
diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp

index 7ebaa87..8646e97 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp
@@ -44,7 +44,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
  
    te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
    te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        te::Sum(),
@@ -68,7 +68,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
  
    te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
    te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        te::Sum(),
@@ -128,7 +128,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
  
    te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
    te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        te::Sum(),
@@ -188,7 +188,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
  
    te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
    te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        te::Sum(),
@@ -256,7 +256,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
  
    te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
    te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
-  te::Tensor* CT = te::Reduce(
+  te::Tensor CT = te::Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        te::Sum(),
@@ -302,7 +302,7 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
    }
    {
      auto const& loops = loop.getLoopStmtsFor(CT);
-    loop.cacheAccesses(CT->buf(), "C_regs", loops[2]);
+    loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
    }
  
    loop.prepareForCodegen();
diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp

index 966c9e2..847b66d 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp
@@ -38,7 +38,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
    KernelScope kernel_scope;
    Placeholder a_buf("a", kFloat, {M});
    Placeholder b_buf("b", kFloat, {M});
-  Tensor* c_tensor = Compute(
+  Tensor c_tensor = Compute(
        "c", {{M, "m"}}, [&](const VarHandle& m) {
          return a_buf.load(m) + b_buf.load(m);
        });
diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp

index be5dcc8..9d35701 100644 (file)
--- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp
+++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
@@ -222,7 +222,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
    int M = A.numel();
  
    te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
        "reduce_full",
        {{1, "N"}},
        te::Sum(),
@@ -255,7 +255,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
    int M = A.numel();
  
    te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
        "reduce_full",
        {{1, "N"}},
        te::Sum(),
@@ -296,7 +296,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
    int M = A.numel();
  
    te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
        "reduce_full",
        {{1, "N"}},
        te::Sum(),
@@ -339,7 +339,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
    TORCH_CHECK(M % kChunkSize == 0);
  
    te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
-  te::Tensor* BT = te::Reduce(
+  te::Tensor BT = te::Reduce(
        "reduce_full",
        {},
        te::Sum(),
@@ -359,7 +359,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
  
    loop.reorderAxis(mo, mi);
    loops = loop.getLoopStmtsFor(BT);
-  auto bt_body = loop.getAllWritesToBuf(BT->buf())[1];
+  auto bt_body = loop.getAllWritesToBuf(BT.buf())[1];
    TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
    loop.reorderAxis(loops.at(0), loops.at(1));
  
@@ -390,7 +390,7 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
    const int kChunkSize = 8;
  
    te::Placeholder a("A", te::kFloat, {M});
-  te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
+  te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
    te::LoopNest nest({b});
  
    auto loops = nest.getLoopStmtsFor(b);
@@ -453,7 +453,7 @@ BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
    te::KernelScope ks;
    constexpr int kCacheSize = 1 << 12;
    te::Placeholder a("A", te::kFloat, {M, N});
-  te::Tensor* b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
+  te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
    te::LoopNest nest({b});
  
    auto sch = state.range(2);
@@ -560,7 +560,7 @@ BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
    te::KernelScope ks;
    constexpr int kChunkSize = 8;
    te::Placeholder a("A", te::kFloat, {M, N});
-  te::Tensor* b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat);
+  te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat);
    te::LoopNest nest({b});
  
    auto sch = state.range(2);
diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp

index d761645..2a4ce94 100644 (file)
--- a/test/cpp/tensorexpr/test_approx.cpp
+++ b/test/cpp/tensorexpr/test_approx.cpp
@@ -11,7 +11,7 @@
  using namespace torch::indexing;
  namespace te = torch::jit::tensorexpr;
  
-static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) {
+static void vectorize(te::LoopNest* ln, te::Tensor target, int width) {
    auto loops = ln->getLoopStmtsFor(target);
    te::ForPtr inner, tail;
    ln->splitWithTail(loops[0], width, &inner, &tail);
@@ -33,7 +33,7 @@ TEST(Approx, log_vml) {
    te::KernelScope ks;
    te::VarHandle N("N", te::kInt);
    te::Placeholder A("A", te::kFloat, {N});
-  te::Tensor* B = te::Compute(
+  te::Tensor B = te::Compute(
        "B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); });
  
    te::LoopNest ln({B});
diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp

index fcfa8ce..d038665 100644 (file)
--- a/test/cpp/tensorexpr/test_boundsinference.cpp
+++ b/test/cpp/tensorexpr/test_boundsinference.cpp
@@ -49,7 +49,7 @@ TEST(BoundsInference, _1) {
    KernelScope kernel_scope;
    ExprHandle n(100);
    Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
        Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
    LoopNest l({b});
    auto bounds_info = inferBounds(l.root_stmt());
@@ -60,9 +60,9 @@ TEST(BoundsInference, _1) {
    ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
    verifyConstBounds(bounds_info.at(a.data())[0], {{0, 99}});
  
-  ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-  ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-  verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}});
+  ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+  ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+  verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}});
  }
  
  TEST(BoundsInference, _2) {
@@ -74,7 +74,7 @@ TEST(BoundsInference, _2) {
    KernelScope kernel_scope;
    VarHandle n("n", kInt);
    Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
        Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
    LoopNest l({b});
    auto bounds_info = inferBounds(l.root_stmt());
@@ -85,9 +85,9 @@ TEST(BoundsInference, _2) {
    ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
    verifyConstBounds(bounds_info.at(a.data())[0], {{0, -1}});
  
-  ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-  ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-  verifyConstBounds(bounds_info.at(b->buf())[0], {{0, -1}});
+  ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+  ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+  verifyConstBounds(bounds_info.at(b.buf())[0], {{0, -1}});
  }
  
  TEST(BoundsInference, _3) {
@@ -99,7 +99,7 @@ TEST(BoundsInference, _3) {
    KernelScope kernel_scope;
    ExprHandle n(100);
    Placeholder a(BufHandle("a", {n + 10}, kFloat));
-  Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) {
+  Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) {
      return a.load(i) * a.load(i + 10);
    });
    LoopNest l({b});
@@ -111,9 +111,9 @@ TEST(BoundsInference, _3) {
    ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
    verifyConstBounds(bounds_info.at(a.data())[0], {{0, 109}});
  
-  ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-  ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-  verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 99}});
+  ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+  ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+  verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 99}});
  }
  
  TEST(BoundsInference, _4) {
@@ -129,13 +129,13 @@ TEST(BoundsInference, _4) {
    ExprHandle W(320);
    ExprHandle H(200);
    Placeholder a(BufHandle("a", {H, W}, kFloat));
-  Tensor* b = Compute(
+  Tensor b = Compute(
        "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
          return x * y;
        });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
-        return a.load(y, x) * b->load(y, x);
+        return a.load(y, x) * b.load(y, x);
        });
    LoopNest l({c});
    std::vector<ForPtr> loops = l.getLoopStmtsFor(c);
@@ -149,13 +149,13 @@ TEST(BoundsInference, _4) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{0, 199}, {0, 319}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 199}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 199}, {0, 319}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 199}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 199}, {0, 319}});
    }
    {
      // Infer bounds on the inner loop scope
@@ -166,13 +166,13 @@ TEST(BoundsInference, _4) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {0, 319}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 319}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 319}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 319}});
    }
    {
      // Infer bounds on the inner loop body's scope
@@ -183,13 +183,13 @@ TEST(BoundsInference, _4) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}});
    }
  }
  
@@ -208,7 +208,7 @@ TEST(BoundsInference, _5) {
    KernelScope kernel_scope;
    ExprHandle n(100);
    Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
        Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
    LoopNest l({b});
  
@@ -229,9 +229,9 @@ TEST(BoundsInference, _5) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{0, 95}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 95}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 95}});
    }
    {
      // Verify inferred bounds for the tail loop
@@ -242,9 +242,9 @@ TEST(BoundsInference, _5) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{96, 99}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{96, 99}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{96, 99}});
    }
  }
  
@@ -263,13 +263,13 @@ TEST(BoundsInference, _6) {
    ExprHandle CW(32);
    ExprHandle CH(20);
    Placeholder a(BufHandle("a", {H, W}, kFloat));
-  Tensor* b = Compute(
+  Tensor b = Compute(
        "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
          return x * y;
        });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) {
-        return a.load(y + 100, x + 100) * b->load(y * 2, x * 5);
+        return a.load(y + 100, x + 100) * b.load(y * 2, x * 5);
        });
    LoopNest l({c});
    std::vector<ForPtr> loops = l.getLoopStmtsFor(c);
@@ -283,13 +283,13 @@ TEST(BoundsInference, _6) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{100, 119}, {100, 131}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 38}, {0, 155}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 38}, {0, 155}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 19}, {0, 31}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 19}, {0, 31}});
    }
    {
      // Infer bounds on the inner loop scope
@@ -300,13 +300,13 @@ TEST(BoundsInference, _6) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {100, 131}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {0, 155}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {0, 155}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {0, 31}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {0, 31}});
    }
    {
      // Infer bounds on the inner loop body's scope
@@ -317,13 +317,13 @@ TEST(BoundsInference, _6) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{-1, -1}, {-1, -1}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kLoad);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kLoad);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{-1, -1}, {-1, -1}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{-1, -1}, {-1, -1}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{-1, -1}, {-1, -1}});
    }
  }
  
@@ -331,9 +331,9 @@ TEST(BoundsInference, Adjacent) {
    KernelScope kernel_scope;
    ExprHandle H(6);
    Placeholder a(BufHandle("a", {20}, kFloat));
-  Tensor* b =
+  Tensor b =
        Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); });
    LoopNest l({b, c});
    std::vector<ForPtr> loops = NodeFinder<For>::find(l.root_stmt());
@@ -348,9 +348,9 @@ TEST(BoundsInference, Adjacent) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{0, 5}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}});
    }
    {
      // Infer bounds on the inner loop scope
@@ -362,9 +362,9 @@ TEST(BoundsInference, Adjacent) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{6, 11}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}});
    }
    {
      // Infer bounds on the high level program.
@@ -377,24 +377,24 @@ TEST(BoundsInference, Adjacent) {
      ASSERT_EQ(bounds_info.at(a.data())[0].kind, kLoad);
      verifyConstBounds(bounds_info.at(a.data())[0], {{0, 11}});
  
-    ASSERT_EQ(bounds_info.at(b->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(b->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(b->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(b.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(b.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(b.buf())[0], {{0, 5}});
  
-    ASSERT_EQ(bounds_info.at(c->buf()).size(), 1);
-    ASSERT_EQ(bounds_info.at(c->buf())[0].kind, kStore);
-    verifyConstBounds(bounds_info.at(c->buf())[0], {{0, 5}});
+    ASSERT_EQ(bounds_info.at(c.buf()).size(), 1);
+    ASSERT_EQ(bounds_info.at(c.buf())[0].kind, kStore);
+    verifyConstBounds(bounds_info.at(c.buf())[0], {{0, 5}});
    }
  }
  
  TEST(BoundsInference, MultipleTopLoopLoad) {
    KernelScope kernel_scope;
    Placeholder a(BufHandle("a", {100}, kFloat));
-  Tensor* b =
+  Tensor b =
        Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); });
    LoopNest l({b, c, d});
  
@@ -418,7 +418,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
  
    // b, c, d only written.
    {
-    auto bounds = bounds_info[b->buf()];
+    auto bounds = bounds_info[b.buf()];
      ASSERT_EQ(bounds.size(), 1);
      auto bound = bounds[0];
      ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
@@ -426,7 +426,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
      verifyConstBounds(bound, {{0, 63}});
    }
    {
-    auto bounds = bounds_info[c->buf()];
+    auto bounds = bounds_info[c.buf()];
      ASSERT_EQ(bounds.size(), 1);
      auto bound = bounds[0];
      ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
@@ -434,7 +434,7 @@ TEST(BoundsInference, MultipleTopLoopLoad) {
      verifyConstBounds(bound, {{0, 31}});
    }
    {
-    auto bounds = bounds_info[d->buf()];
+    auto bounds = bounds_info[d.buf()];
      ASSERT_EQ(bounds.size(), 1);
      auto bound = bounds[0];
      ASSERT_EQ(bound.kind, TensorAccessKind::kStore);
@@ -506,24 +506,24 @@ TEST(BoundsInference, MultipleTopLoopStore) {
  TEST(BoundsInference, CacheReads) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 3);
+        return A.load(i + 30, j + 3);
        });
-  Tensor* C = Compute(
+  Tensor C = Compute(
        "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
        });
  
    LoopNest l({B, C});
    auto bounds_info_before = inferBounds(l.root_stmt());
  
    StmtPtr j_loop = l.getLoopStmtsFor(B)[1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
+  LoopNest::cacheAccesses(A.buf(), "A_local", j_loop);
  
    auto bounds_info_after = inferBounds(l.root_stmt());
  
@@ -571,7 +571,7 @@ TEST(BoundsInference, CacheReads) {
  
  TEST(BoundsInference, Flattened) {
    KernelScope kernel_scope;
-  Tensor* b = Compute(
+  Tensor b = Compute(
        "b",
        {{3, "z"}, {4, "y"}, {5, "x"}},
        [&](const VarHandle& z, const VarHandle& y, const VarHandle& x) {
@@ -585,7 +585,7 @@ TEST(BoundsInference, Flattened) {
  
    // There's only one buffer.
    ASSERT_EQ(bounds_info.size(), 1);
-  auto& TABI = bounds_info[b->buf()][0];
+  auto& TABI = bounds_info[b.buf()][0];
    ASSERT_EQ(TABI.kind, TensorAccessKind::kStore);
    // Flattened bounds should have a single dimension.
    ASSERT_EQ(TABI.start.size(), 1);
@@ -651,11 +651,11 @@ TEST(BoundsInference, GetPotentialHazards) {
  TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return (i + 1) * (j + 1);
        });
@@ -679,13 +679,13 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) {
  TEST(BoundsInference, GetPotentialHazardsLoopCall) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i, j) + 5;
+        return A.load(i, j) + 5;
        });
  
    LoopNest l({A, B});
@@ -706,7 +706,7 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) {
  TEST(BoundsInference, GetPotentialHazardsLoopSplit) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp

index 63881d0..293fbe2 100644 (file)
--- a/test/cpp/tensorexpr/test_conv.cpp
+++ b/test/cpp/tensorexpr/test_conv.cpp
@@ -30,7 +30,7 @@ TEST(Conv, DepthwiseConv2D) {
    te::Placeholder input("input", te::kFloat, {N, C, H, W});
    te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S});
    te::Placeholder bias("bias", te::kFloat, {K});
-  te::Tensor* output = te::conv2d_depthwise(
+  te::Tensor output = te::conv2d_depthwise(
        input.handle(), weight.handle(), bias.handle(), kStride, kPad, kGroups);
  
    te::LoopNest loop({output});
@@ -61,7 +61,7 @@ TEST(Conv, DepthwiseConv2DNoBias) {
  
    te::Placeholder input("input", te::kFloat, {N, C, H, W});
    te::Placeholder weight("weight", te::kFloat, {K, CperG, R, S});
-  te::Tensor* output = te::conv2d_depthwise(
+  te::Tensor output = te::conv2d_depthwise(
        input.handle(), weight.handle(), kStride, kPad, kGroups);
  
    te::LoopNest loop({output});
@@ -96,7 +96,7 @@ TEST(Conv, DepthwiseConv2DDynamicShapes) {
    te::Placeholder input("input", te::kFloat, {N_var, C_var, H_var, W_var});
    te::Placeholder weight(
        "weight", te::kFloat, {K_var, CperG_var, R_var, S_var});
-  te::Tensor* output = te::conv2d_depthwise(
+  te::Tensor output = te::conv2d_depthwise(
        input.handle(),
        weight.handle(),
        N_var,
@@ -195,7 +195,7 @@ TEST(Conv, Conv2D) {
    te::Placeholder inputB(te::BufHandle("input", {N, C, H, W}, te::kFloat));
    te::Placeholder filterB(te::BufHandle("filter", {K, C, R, S}, te::kFloat));
  
-  te::Tensor* conv = te::Reduce(
+  te::Tensor conv = te::Reduce(
        "conv",
        {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}},
        te::Sum(),
diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp

index e36e17a..ed5c070 100644 (file)
--- a/test/cpp/tensorexpr/test_cuda.cpp
+++ b/test/cpp/tensorexpr/test_cuda.cpp
@@ -34,7 +34,7 @@ static void testCudaTestVectorAdd01_impl() {
    Dtype dtype = ToDtype<ctype>();
    Placeholder a_buf("a", dtype, {num_iter, block_count, block_size});
    Placeholder b_buf("b", dtype, {num_iter, block_count, block_size});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c",
        {
            {num_iter, "n"},
@@ -99,7 +99,7 @@ TEST(Cuda, Sigmoid_CUDA) {
    const int block_size = 128;
    Dtype dtype = ToDtype<float>();
    Placeholder a_buf("a", dtype, {num_iter, block_count, block_size});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c",
        {
            {num_iter, "n"},
@@ -165,7 +165,7 @@ static void testCudaTestVectorAdd02_impl(int N, int block_size) {
    KernelScope kernel_scope;
    Placeholder a_buf("a", kFloat, {N});
    Placeholder b_buf("b", kFloat, {N});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c",
        {
            {N, "N"},
@@ -225,7 +225,7 @@ TEST(Cuda, HalfCast_CUDA) {
    KernelScope ks;
    auto half = ToDtype<at::Half>();
    Placeholder a("a", half, {4});
-  Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
      return Cast::make(kFloat, a.load(i));
    });
  
@@ -267,7 +267,7 @@ TEST(Cuda, DynamicShape2D_CUDA) {
      VarHandle n("n", kInt);
      Placeholder a(BufHandle("a", {m, n}, kFloat));
      Placeholder b(BufHandle("b", {m, n}, kFloat));
-    Tensor* c = Compute(
+    Tensor c = Compute(
          "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
            return a.load(i, j) + b.load(i, j);
          });
@@ -328,7 +328,7 @@ TEST(Cuda, TestRand01_CUDA) {
    const int num_iter = 3;
    const int block_count = 16;
    const int block_size = 128;
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c",
        {
            {num_iter, "n"},
@@ -387,7 +387,7 @@ TEST(Cuda, DynamicShapeSplit_CUDA) {
    constexpr int N = 4096;
    VarHandle n("n", kInt);
    Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b = Compute(
+  Tensor b = Compute(
        "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; });
    LoopNest l({b});
    ForPtr inner;
@@ -928,16 +928,16 @@ TEST(Cuda, HalfSupport_CUDA) {
    KernelScope ks;
    auto half = ToDtype<at::Half>();
    Placeholder a("a", half, {4});
-  Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) {
      return Cast::make(half, ExprHandle(2.0f) * a.load(i));
    });
  
-  Tensor* c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) {
-    return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b->load(i));
+  Tensor c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) {
+    return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b.load(i));
    });
  
-  Tensor* d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) {
-    return Cast::make(half, c->load(i));
+  Tensor d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) {
+    return Cast::make(half, c.load(i));
    });
  
    LoopNest l({b, c, d});
@@ -986,7 +986,7 @@ TEST(Cuda, HalfPropagation_CUDA) {
    KernelScope kernel_scope;
    auto half = ToDtype<at::Half>();
    Placeholder a("a", half, {4});
-  Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
      return Max::make(a.load(i), ExprHandle(alloc<HalfImm>(0)), true);
    });
  
@@ -1036,7 +1036,7 @@ TEST(Cuda, UnusedHalfArgument_CUDA) {
    Placeholder a("a", kFloat, {4});
    auto half = ToDtype<at::Half>();
    Placeholder b("b", half, {4});
-  Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
+  Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) {
      return Max::make(a.load(i), ExprHandle(alloc<FloatImm>(0)), true);
    });
  
@@ -1168,10 +1168,10 @@ TEST(Cuda, MaskBlockDim_CUDA) {
    int B_SIZE = 50;
    Placeholder a_buf("a", kFloat, {A_SIZE});
    Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + 10;
    });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + b_buf.load(i);
    });
  
@@ -1261,10 +1261,10 @@ TEST(Cuda, MaskThreadDim_CUDA) {
    int B_SIZE = 100;
    Placeholder a_buf("a", kFloat, {A_SIZE});
    Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + 10;
    });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i / 2) + b_buf.load(i);
    });
  
@@ -1356,10 +1356,10 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) {
    int B_SIZE = 50;
    Placeholder a_buf("a", kFloat, {A_SIZE});
    Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + 10;
    });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + b_buf.load(i);
    });
  
@@ -1450,10 +1450,10 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) {
    int B_SIZE = 50;
    Placeholder a_buf("a", kFloat, {A_SIZE});
    Placeholder b_buf("b", kFloat, {B_SIZE});
-  Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + 10;
    });
-  Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
+  Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i) + b_buf.load(i);
    });
  
@@ -1543,17 +1543,17 @@ TEST(Cuda, MaskMultiDim_CUDA) {
    int B_SIZE = 50;
    Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE});
    Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "C",
        {{OUTER_SIZE, "i"}, {A_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
          return ExprHandle(2) * a_buf.load(i, j);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "D",
        {{OUTER_SIZE, "i"}, {B_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
        });
  
    LoopNest l({c, d});
@@ -1673,17 +1673,17 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) {
    VarHandle B_SIZE("B_SIZE", kInt);
    Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE});
    Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "C",
        {{OUTER_SIZE, "i"}, {A_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
          return ExprHandle(2) * a_buf.load(i, j);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "D",
        {{OUTER_SIZE, "i"}, {B_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
        });
  
    LoopNest l({c, d});
@@ -2087,17 +2087,17 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) {
    int B_SIZE = 15;
    Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE});
    Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "C",
        {{OUTER_SIZE, "i"}, {A_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
          return ExprHandle(2) * a_buf.load(i, j);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "D",
        {{OUTER_SIZE, "i"}, {B_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
        });
  
    LoopNest l({c, d});
@@ -2218,17 +2218,17 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) {
    int B_SIZE = 15;
    Placeholder a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE});
    Placeholder b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "C",
        {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
          return ExprHandle(2) * a_buf.load(i, j);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "D",
        {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
-        return c->load(i, j * 2) + b_buf.load(i, j);
+        return c.load(i, j * 2) + b_buf.load(i, j);
        });
  
    LoopNest l({c, d});
diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp

index 24ddfbf..a170e53 100644 (file)
--- a/test/cpp/tensorexpr/test_external_calls.cpp
+++ b/test/cpp/tensorexpr/test_external_calls.cpp
@@ -31,7 +31,7 @@ TEST(ExternalCall, Conv2d_float) {
    int64_t dilation = 1;
    int64_t groups = 1;
  
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
@@ -95,7 +95,7 @@ TEST(ExternalCall, Conv2d_int) {
    int64_t dilation = 1;
    int64_t groups = 1;
  
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
@@ -153,7 +153,7 @@ TEST(ExternalCall, Conv2d_nobias_noargs) {
    Placeholder Weight("Weight", kFloat, {16, 16, 1, 1});
    BufHandle ResultBuf("Result", {1, 16, 112, 112}, kFloat);
  
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
@@ -203,7 +203,7 @@ TEST(ExternalCall, Addmm_float) {
    int64_t beta = 2;
    int64_t alpha = 2;
  
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
@@ -283,7 +283,7 @@ TEST(ExternalCall, Prepacked_Linear_float) {
        weight, bias, c10::optional<at::Scalar>(), c10::optional<at::Scalar>());
  
    Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1});
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
@@ -370,7 +370,7 @@ TEST(ExternalCall, Prepacked_Conv2d_float) {
        c10::optional<at::Scalar>());
  
    Placeholder DummyPrepacked("DummyPrepacked", kFloat, {1});
-  Tensor* Result = new Tensor(
+  Tensor Result = Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
@@ -431,7 +431,7 @@ TEST(ExternalCall, BinaryFloat) {
      Placeholder B("", kFloat, toExprHandleVec(bShape));
      BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat);
  
-    Tensor* Result = new Tensor(
+    Tensor Result = Tensor(
          ResultBuf.node(),
          ExternalCall::make(
              ResultBuf,
@@ -516,7 +516,7 @@ TEST(ExternalCall, UnaryFloat) {
      Placeholder A("A", kFloat, toExprHandleVec(aShape));
      BufHandle ResultBuf("Result", toExprHandleVec(resShape), kFloat);
  
-    Tensor* Result = new Tensor(
+    Tensor Result = Tensor(
          ResultBuf.node(),
          ExternalCall::make(
              ResultBuf, externCallName, {BufHandle(A.data())}, externCallArgs));
@@ -566,14 +566,14 @@ TEST(ExternalCall, ComputeInterop) {
    BufHandle ConvResultBuf("ConvResult", {1, 16, 32, 32}, kFloat);
    BufHandle MatmulResultBuf("MatmulResult", {1, 16, 32, 32}, kFloat);
  
-  Tensor* Input = Compute(
+  Tensor Input = Compute(
        "Input",
        {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}},
        [&](const VarHandle& n,
            const VarHandle& c,
            const VarHandle& h,
            const VarHandle& w) { return FloatImm::make(5.0f); });
-  Tensor* Weight = Compute(
+  Tensor Weight = Compute(
        "Weight",
        {{16, "n"}, {16, "c"}, {1, "kh"}, {1, "kw"}},
        [&](const VarHandle& n,
@@ -581,28 +581,28 @@ TEST(ExternalCall, ComputeInterop) {
            const VarHandle& h,
            const VarHandle& w) { return FloatImm::make(6.0f); });
  
-  Tensor* ConvResult = new Tensor(
+  Tensor ConvResult = Tensor(
        ConvResultBuf.node(),
        ExternalCall::make(
            ConvResultBuf,
            "nnc_aten_conv2d",
-          {BufHandle(Input->buf()), BufHandle(Weight->buf())},
+          {BufHandle(Input.buf()), BufHandle(Weight.buf())},
            {}));
-  Tensor* MatmulResult = new Tensor(
+  Tensor MatmulResult = Tensor(
        MatmulResultBuf.node(),
        ExternalCall::make(
            MatmulResultBuf,
            "nnc_aten_matmul",
-          {BufHandle(ConvResult->buf()), BufHandle(ConvResult->buf())},
+          {BufHandle(ConvResult.buf()), BufHandle(ConvResult.buf())},
            {}));
-  Tensor* Result = Compute(
+  Tensor Result = Compute(
        "Result",
        {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}},
        [&](const VarHandle& n,
            const VarHandle& c,
            const VarHandle& h,
            const VarHandle& w) {
-        return ConvResult->load(n, c, h, w) + MatmulResult->load(n, c, h, w);
+        return ConvResult.load(n, c, h, w) + MatmulResult.load(n, c, h, w);
        });
  
    LoopNest l({Input, Weight, ConvResult, MatmulResult, Result});
@@ -658,31 +658,31 @@ TEST(ExternalCall, Inlining) {
  
    BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat);
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
          return FloatImm::make(5.0f);
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
          return FloatImm::make(4.0f);
        });
-  Tensor* MatmulResult = new Tensor(
+  Tensor MatmulResult = Tensor(
        MatmulResultBuf.node(),
        ExternalCall::make(
            MatmulResultBuf,
            "nnc_aten_matmul",
-          {BufHandle(A->buf()), BufHandle(B->buf())},
+          {BufHandle(A.buf()), BufHandle(B.buf())},
            {}));
-  Tensor* Result = Compute(
+  Tensor Result = Compute(
        "Result",
        {{8, "i"}, {8, "j"}},
        [&](const VarHandle& i, const VarHandle& j) {
-        return MatmulResult->load(i, j) + FloatImm::make(3.0f);
+        return MatmulResult.load(i, j) + FloatImm::make(3.0f);
        });
  
    StmtPtr root_stmt = alloc<Block>(std::vector<StmtPtr>(
-      {A->stmt(), B->stmt(), MatmulResult->stmt(), Result->stmt()}));
-  LoopNest l(root_stmt, {Result->buf()});
+      {A.stmt(), B.stmt(), MatmulResult.stmt(), Result.stmt()}));
+  LoopNest l(root_stmt, {Result.buf()});
  
    // Inlining should not inline anything here since all Bufs are either
    // defined or used in ExternalCalls
diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp

index 76d9247..e11ba06 100644 (file)
--- a/test/cpp/tensorexpr/test_ir_printer.cpp
+++ b/test/cpp/tensorexpr/test_ir_printer.cpp
@@ -56,30 +56,30 @@ TEST(IRPrinter, FunctionName) {
    int M = 4;
    int N = 20;
  
-  Tensor* producer = Compute(
+  Tensor producer = Compute(
        "producer",
        {{M, "m"}, {N, "n"}},
        [&](const ExprHandle& m, const ExprHandle& n) { return m * n; });
  
-  Tensor* chunk_0 = Compute(
+  Tensor chunk_0 = Compute(
        "chunk",
        {{M, "m"}, {N / 2, "n"}},
        [&](const ExprHandle& m, const ExprHandle& n) {
-        return producer->load(m, n);
+        return producer.load(m, n);
        });
  
-  Tensor* chunk_1 = Compute(
+  Tensor chunk_1 = Compute(
        "chunk",
        {{M, "m"}, {N / 2, "n"}},
        [&](const ExprHandle& m, const ExprHandle& n) {
-        return producer->load(m, n + ExprHandle(N / 2));
+        return producer.load(m, n + ExprHandle(N / 2));
        });
  
-  Tensor* consumer = Compute(
+  Tensor consumer = Compute(
        "consumer",
        {{M, "i"}, {N / 2, "j"}},
        [&](const ExprHandle& i, const ExprHandle& j) {
-        return i * chunk_1->load(i, j);
+        return i * chunk_1.load(i, j);
        });
  
    LoopNest l({chunk_0, chunk_1, consumer});
diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp

index 8f36f54..765522e 100644 (file)
--- a/test/cpp/tensorexpr/test_kernel.cpp
+++ b/test/cpp/tensorexpr/test_kernel.cpp
@@ -1329,7 +1329,7 @@ TEST_F(Kernel, CodegenInspection) {
  #endif
  }
  
-Tensor* lowerNanToNum(
+Tensor lowerNanToNum(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType,
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp

index 75e6a06..6081403 100644 (file)
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -593,10 +593,10 @@ TEST(LLVM, VectorizerLoadStoreTest) {
    KernelScope kernel_scope;
    Placeholder a(BufHandle("A", {1}, kInt));
  
-  Tensor* c =
+  Tensor c =
        Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); });
  
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
    LoopNest l({c});
    StmtPtr s = l.root_stmt();
    ASSERT_TRUE(LoopNest::vectorize(to<For>(to<Block>(s)->front())));
@@ -616,11 +616,11 @@ TEST(LLVM, VectorizeBitCast) {
    KernelScope kernel_scope;
    Placeholder a(BufHandle("A", {128}, kInt));
  
-  Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) {
      return bitcast<float>(a.load(i));
    });
  
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
    LoopNest l({c});
    StmtPtr s = l.root_stmt();
    ASSERT_TRUE(LoopNest::vectorize(to<For>(to<Block>(s)->front())));
@@ -1217,12 +1217,12 @@ TEST(LLVM, StoreFloat) {
  TEST(LLVM, SimpleMath01) {
    KernelScope kernel_scope;
    const int N = 1024;
-  Tensor* tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) {
+  Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) {
      return cast<float>(i * i + 1);
    });
    LoopNest l({tensor});
    StmtPtr stmt = l.root_stmt();
-  Placeholder f_buf(BufHandle(tensor->buf()));
+  Placeholder f_buf(BufHandle(tensor.buf()));
    LLVMCodeGen cg(stmt, {f_buf});
  
    PaddedBuffer<float> f_v(N, "f_v");
@@ -1241,11 +1241,11 @@ TEST(LLVM, ComputeMul) {
    const int N = 1024;
    Placeholder a(BufHandle("a", {N}, kFloat));
    Placeholder b(BufHandle("b", {N}, kFloat));
-  Tensor* c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) {
+  Tensor c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) {
      return a.load(i) * b.load(i);
    });
  
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
    LoopNest l({c});
    StmtPtr s = l.root_stmt();
  
@@ -1265,12 +1265,12 @@ TEST(LLVM, BroadcastAdd) {
    const int N = 1024;
    Placeholder a(BufHandle("a", {M, N}, kFloat));
    Placeholder b(BufHandle("b", {N}, kFloat));
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
          return a.load(i, j) + b.load(j);
        });
  
-  Placeholder c_buf(BufHandle(c->buf()));
+  Placeholder c_buf(BufHandle(c.buf()));
    LoopNest l({c});
    l.prepareForCodegen();
    StmtPtr s = l.root_stmt();
@@ -1372,7 +1372,7 @@ TEST(LLVM, TensorDynamicShapeAdd) {
      VarHandle n("n", kInt);
      Placeholder a(BufHandle("a", {n}, kFloat));
      Placeholder b(BufHandle("b", {n}, kFloat));
-    Tensor* c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) {
+    Tensor c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) {
        return a.load(i) + b.load(i);
      });
      LoopNest l({c});
@@ -1396,7 +1396,7 @@ TEST(LLVM, DynamicShape2D) {
      VarHandle n("n", kInt);
      Placeholder a(BufHandle("a", {m, n}, kFloat));
      Placeholder b(BufHandle("b", {m, n}, kFloat));
-    Tensor* c = Compute(
+    Tensor c = Compute(
          "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
            return a.load(i, j) + b.load(i, j);
          });
@@ -1428,7 +1428,7 @@ TEST(LLVM, EliminatedStmt) {
    KernelScope kernel_scope;
    Placeholder a(BufHandle("a", {1}, kFloat));
  
-  Tensor* c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; });
+  Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; });
  
    LoopNest l({c});
    l.prepareForCodegen();
@@ -1452,7 +1452,7 @@ TEST(LLVM, SimpleReduction) {
    // TODO: why doesn't implicit vector<DimArg> work?
    std::vector<DimArg> axis = {DimArg(1)};
    std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
-  Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
+  Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis);
    LoopNest loop({b});
  
    loop.prepareForCodegen();
@@ -1491,7 +1491,7 @@ TEST(LLVM, RFactorReduction) {
    // TODO: why doesn't implicit vector<DimArg> work?
    std::vector<DimArg> axis = {DimArg(1)};
    std::vector<DimArg> reduce_axis = {DimArg(M), DimArg(N)};
-  Tensor* b = Reduce("sum", axis, Sum(), a, reduce_axis);
+  Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis);
    LoopNest loop({b});
  
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(b);
@@ -1502,7 +1502,7 @@ TEST(LLVM, RFactorReduction) {
    loops = loop.getLoopStmtsFor(b);
    loop_m = loops.at(2);
    loop_n = loops.at(1);
-  auto b_body = loop.getAllWritesToBuf(b->buf())[1];
+  auto b_body = loop.getAllWritesToBuf(b.buf())[1];
    ASSERT_TRUE(loop.rfactor(b_body, loop_n));
  
    loop.prepareForCodegen();
@@ -1538,13 +1538,13 @@ TEST(LLVM, RFactorVectorizedReduction) {
  
    Placeholder a("a", kFloat, {1, M, N});
  
-  Tensor* b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}});
+  Tensor b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}});
    LoopNest loopnest({b});
    std::vector<ForPtr> loops = loopnest.getLoopStmtsFor(b);
    // Reorder n and m loops
    loopnest.reorderAxis(loops.at(1), loops.at(2));
-  auto b_body = loopnest.getAllWritesToBuf(b->buf()).at(1);
-  auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b->buf());
+  auto b_body = loopnest.getAllWritesToBuf(b.buf()).at(1);
+  auto all_loops = loopnest.getAllLoopNestsWritingToBuf(b.buf());
    ASSERT_TRUE(all_loops.size() == 2 && all_loops[1].size() == 3);
    ASSERT_TRUE(loopnest.rfactor(b_body, all_loops[1][1]));
    auto distributed_loops = loopnest.distributeLoop(all_loops[1][1]);
@@ -1585,7 +1585,7 @@ TEST(LLVM, SimpleParallel) {
      KernelScope kernel_scope;
      const int M = 4;
      const int N = 6;
-    Tensor* f = Compute(
+    Tensor f = Compute(
          "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
            return cast<float>(m + n);
          });
@@ -1626,23 +1626,23 @@ TEST(LLVM, CompositeParallel) {
      KernelScope kernel_scope;
      int M = 5;
      int N = 7;
-    Tensor* t1 =
+    Tensor t1 =
          Compute("t1", {{M, "M"}}, [](const VarHandle& m) { return m + 1.f; });
-    Tensor* t2 =
+    Tensor t2 =
          Compute("t2", {{N, "N"}}, [](const VarHandle& n) { return n + 2.f; });
-    Tensor* t3 = Compute(
+    Tensor t3 = Compute(
          "t3",
          {{M, "M"}, {N, "N"}},
          [=](const VarHandle& m, const VarHandle& n) {
-          return t1->load(m) * t2->load(n);
+          return t1.load(m) * t2.load(n);
          });
-    Tensor* t4 = Compute(
+    Tensor t4 = Compute(
          "t4",
          {{M, "M"}, {N, "N"}},
          [=](const VarHandle& m, const VarHandle& n) {
-          return t3->load(m, n) + m + n;
+          return t3.load(m, n) + m + n;
          });
-    LoopNest loop_nest(std::vector<Tensor*>({t4}), {t1, t2, t3, t4});
+    LoopNest loop_nest({t4}, {t1, t2, t3, t4});
      std::vector<ForPtr> loop_list;
      {
        auto const& loops = loop_nest.getLoopStmtsFor(t1);
@@ -1695,7 +1695,7 @@ TEST(LLVM, VectorizedGEMM) {
  
    Placeholder AP(BufHandle("A", {M, K}, kFloat));
    Placeholder BP(BufHandle("B", {K, N}, kFloat));
-  Tensor* CT = Reduce(
+  Tensor CT = Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        Sum(),
@@ -1776,7 +1776,7 @@ TEST(LLVM, CallRaw) {
    VarHandle N("N", kInt);
    Placeholder a(BufHandle("a", {M, N}, kFloat));
    Placeholder b(BufHandle("b", {N}, kFloat));
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
          return a.load(i, j) + b.load(j);
        });
@@ -1793,7 +1793,7 @@ TEST(LLVM, CallRaw) {
    std::vector<float> cv(M * N_value, 0);
    std::vector<void*> args({av.data(), bv.data(), cv.data(), &N_value});
  
-  LLVMCodeGen cg(s, {a, b, BufHandle(c->buf()), N});
+  LLVMCodeGen cg(s, {a, b, BufHandle(c.buf()), N});
    cg.call_raw(args);
  
    for (int i = 0; i < M; i++) {
@@ -1802,7 +1802,7 @@ TEST(LLVM, CallRaw) {
      }
    }
  
-  SimpleIREvaluator eval(s, {a, b, BufHandle(c->buf()), N});
+  SimpleIREvaluator eval(s, {a, b, BufHandle(c.buf()), N});
    eval.call_raw(args);
  
    for (int i = 0; i < M; i++) {
@@ -1818,7 +1818,7 @@ TEST(LLVM, CustomTarget) {
    Placeholder a("a", kFloat, {M});
    Placeholder b("b", kFloat, {M});
    Placeholder c("c", kFloat, {M});
-  Tensor* d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) {
+  Tensor d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) {
      return a.load(m) * b.load(m) + c.load(m);
    });
    LoopNest nest({d});
diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp

index c80dd5f..7c3eefa 100644 (file)
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -42,13 +42,12 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) {
  
  TEST(LoopNest, ExprSimple01) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) {
          return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
        });
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
  
    LoopNest::splitWithTail(loops[0], 2);
    LoopNest::splitWithTail(loops[0], 2);
@@ -56,7 +55,7 @@ TEST(LoopNest, ExprSimple01) {
  
  TEST(LoopNest, ExprLower01) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) {
          return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
        });
@@ -73,10 +72,9 @@ TEST(LoopNest, ExprSimple02) {
    auto func = [](const ExprHandle& x, const ExprHandle& y) {
      return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
    };
-  Tensor* tensor = Compute("f", {{26, "x"}, {5, "y"}}, func);
+  Tensor tensor = Compute("f", {{26, "x"}, {5, "y"}}, func);
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
  
    LoopNest::splitWithTail(loops[0], 4);
  
@@ -168,14 +166,13 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    loops[0]->set_gpu_block_index(LoopOptions::IDX_Y);
    LoopNest::sliceHead(loops[0], 2, &head, &tail);
  
@@ -193,14 +190,13 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceTail(loops[0], 4, &head, &tail);
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -227,14 +223,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceHead(loops[0], 10, &head, &tail);
  
    ASSERT_EQ(head, loops[0]);
@@ -249,14 +244,13 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceHead(loops[0], 100, &head, &tail);
  
    ASSERT_EQ(head, loops[0]);
@@ -271,14 +265,13 @@ TEST(LoopNest, ExprSliceHead) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceHead(loops[0], 4, &head, &tail);
  
    ASSERT_NE(head, nullptr);
@@ -295,10 +288,9 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
@@ -323,14 +315,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceTail(loops[0], 10, &head, &tail);
  
    ASSERT_EQ(head, nullptr);
@@ -347,14 +338,13 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceTail(loops[0], 100, &head, &tail);
  
    ASSERT_EQ(head, nullptr);
@@ -369,14 +359,13 @@ TEST(LoopNest, ExprSliceTail) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::sliceTail(loops[0], 4, &head, &tail);
  
    ASSERT_NE(head, nullptr);
@@ -396,15 +385,14 @@ TEST(LoopNest, ExprSplitAndSlice) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{100, "x"}}, func);
+  Tensor tensor = Compute("f", {{100, "x"}}, func);
    LoopNest l({tensor});
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr inner;
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr tail;
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    // outer: [0, 4)
    // inner: [0, 21)
    // tail:  [84, 100)
@@ -450,10 +438,9 @@ TEST(LoopNest, ExprSliceAndNormalize) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{10, "x"}}, func);
+  Tensor tensor = Compute("f", {{10, "x"}}, func);
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr head;
@@ -482,11 +469,11 @@ TEST(LoopNest, ExprSliceWithVariableDimension) {
           const std::vector<std::pair<int, int>>& expected_for_ranges) {
          KernelScope kernel_scope;
          VarHandle dim("dim", kInt);
-        Tensor* tensor =
+        Tensor tensor =
              Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; });
          LoopNest l({tensor});
          std::vector<ForPtr> loops =
-            l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+            l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
  
          // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
          ForPtr head;
@@ -521,10 +508,9 @@ TEST(LoopNest, ExprSplitWithTail) {
    auto func = [](const ExprHandle& x) {
      return ExprHandle(1.0f) + cast<float>(x);
    };
-  Tensor* tensor = Compute("f", {{199, "x"}}, func);
+  Tensor tensor = Compute("f", {{199, "x"}}, func);
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
    LoopNest::splitWithTail(loops[0], 17);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
@@ -552,10 +538,9 @@ TEST(LoopNest, ExprSplitWithTailNone) {
    auto func = [](const ExprHandle& x, const ExprHandle& y) {
      return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
    };
-  Tensor* tensor = Compute("f", {{24, "x"}, {5, "y"}}, func);
+  Tensor tensor = Compute("f", {{24, "x"}, {5, "y"}}, func);
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::splitWithTail(loops[0], 4);
  
    StmtPtr stmt = l.root_stmt();
@@ -612,14 +597,13 @@ TEST(LoopNest, ExprSplitWithMask01) {
    const int N = 5;
    Placeholder a_buf("a", kFloat, {M, N});
    Placeholder b_buf("b", kFloat, {M, N});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) {
          return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f;
        });
  
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::splitWithMask(loops[1], 4);
  
    StmtPtr stmt = l.root_stmt();
@@ -648,13 +632,12 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) {
    const int M = 64;
    Placeholder a_buf("a", kFloat, {M});
    Placeholder b_buf("b", kFloat, {M});
-  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+  Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
      return a_buf.load(m) + b_buf.load(m) + 1.0f;
    });
  
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 4);
    LoopNest::splitWithMask(loops[0], 4);
  
@@ -736,14 +719,13 @@ TEST(LoopNest, TileSimple) {
    const int M = 64, N = 64;
    Placeholder a_buf("a", kFloat, {M, N});
    Placeholder b_buf("b", kFloat, {M, N});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) {
          return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f;
        });
  
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
    l.tile(loops[0], loops[1], 4, 8);
  
@@ -783,14 +765,13 @@ TEST(LoopNest, TileWithTails) {
    const int M = 64, N = 64;
    Placeholder a_buf("a", kFloat, {M, N});
    Placeholder b_buf("b", kFloat, {M, N});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) {
          return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f;
        });
  
    LoopNest l({tensor});
-  std::vector<ForPtr> loops =
-      l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
    l.tile(loops[0], loops[1], 5, 9);
  
@@ -831,7 +812,7 @@ TEST(LoopNest, TileInMiddle) {
    const int M = 8, N = 8, L = 8, K = 8;
    Placeholder a_buf("a", kFloat, {M, N, L, K});
    Placeholder b_buf("b", kFloat, {M, N, L, K});
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f",
        {{M, "m"}, {N, "n"}, {L, "l"}, {K, "k"}},
        [&](const ExprHandle& m,
@@ -843,7 +824,7 @@ TEST(LoopNest, TileInMiddle) {
  
    LoopNest nest({tensor});
    std::vector<ForPtr> loops =
-      nest.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+      nest.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
    nest.tile(loops[1], loops[2], 3, 3);
  
@@ -891,7 +872,7 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) {
    const int M = 21;
    Placeholder a_buf("a", kFloat, {M});
    Placeholder b_buf("b", kFloat, {M});
-  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+  Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
      return a_buf.load(m) + b_buf.load(m) + 1.0f;
    });
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -922,7 +903,7 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) {
    const int M = 21;
    Placeholder a_buf("a", kFloat, {M});
    Placeholder b_buf("b", kFloat, {M});
-  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+  Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
      return a_buf.load(m) + b_buf.load(m) + 1.0f;
    });
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -949,7 +930,7 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) {
    const int K = 6;
    Placeholder a_buf("a", kFloat, {M, N});
    Placeholder b_buf("b", kFloat, {N, K});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "broadcast_add",
        {{M, "m"}, {N, "n"}, {K, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -998,20 +979,20 @@ TEST(LoopNest, ScheduleFunctionCall01) {
    const int K = 6;
    Placeholder a_buf("a", kFloat, {M, N});
    Placeholder b_buf("b", kFloat, {N, K});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "broadcast_add",
        {{M, "m"}, {N, "n"}, {K, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) + b_buf.load(n, k);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "d",
        {{M, "m"}, {N, "n"}, {K, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c->load(m, n, k) + 1;
+        return c.load(m, n, k) + 1;
        });
  
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
+  LoopNest l({d}, {c, d});
    l.prepareForCodegen();
    StmtPtr stmt = l.root_stmt();
    std::ostringstream oss;
@@ -1058,22 +1039,22 @@ TEST(LoopNest, ScheduleInlineSimple) {
    Placeholder c_buf("c", kFloat, {M, N});
    Placeholder d_buf("d", kFloat, {M, K});
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) * b_buf.load(n, k);
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
+        return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k);
        });
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
    LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
  
    l1.prepareForCodegen();
    l2.prepareForCodegen();
@@ -1139,31 +1120,31 @@ void InlineFunc01Helper(const std::vector<std::string>& inline_order) {
    Placeholder c_buf("c", kFloat, {M, N});
    Placeholder d_buf("d", kFloat, {M, K});
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) * b_buf.load(n, k);
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
+        return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k);
        });
-  Tensor* z = Compute(
+  Tensor z = Compute(
        "z",
        {{M, "m3"}, {N, "n3"}, {K, "k3"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + y->load(m, n, k);
+        return x.load(m, n, k) + y.load(m, n, k);
        });
  
-  LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
+  LoopNest l({z}, {x, y, z});
    for (const std::string& order : inline_order) {
      if (order == "x") {
-      l.computeInline(x->buf());
+      l.computeInline(x.buf());
      } else if (order == "y") {
-      l.computeInline(y->buf());
+      l.computeInline(y.buf());
      } else {
        throw std::runtime_error("Invalid order: " + order);
      }
@@ -1218,7 +1199,7 @@ void InlineFunc01Helper(const std::vector<std::string>& inline_order) {
    }
  
    if (inline_order.size() == 2) {
-    Tensor* z2 = Compute(
+    Tensor z2 = Compute(
          "z",
          {{M, "m3"}, {N, "n3"}, {K, "k3"}},
          [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -1254,21 +1235,21 @@ TEST(LoopNest, ScheduleInlineRandom) {
    const int N = 5;
    const int K = 6;
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return Mod::make(Intrinsics::make(kRand, kInt), 5);
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + x->load(m, n, k);
+        return x.load(m, n, k) + x.load(m, n, k);
        });
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
  
    // would normally compare results but Rand isn't implemented in the
    // SimpleIREvaluator, even if we could seed it.
@@ -1290,22 +1271,22 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) {
    const int N = 5;
    const int K = 6;
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return m * n * k;
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + Intrinsics::make(kRand, kInt) +
+        return x.load(m, n, k) + Intrinsics::make(kRand, kInt) +
              Intrinsics::make(kRand, kInt);
        });
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
  
    // would normally compare results but Rand isn't implemented in the
    // SimpleIREvaluator, even if we could seed it.
@@ -1327,18 +1308,18 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) {
    const int N = 5;
    const int K = 6;
  
-  Tensor* x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) {
+  Tensor x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) {
      return Mod::make(Intrinsics::make(kRand, kInt), 5);
    });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m) + x->load(m);
+        return x.load(m) + x.load(m);
        });
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
  
    // would normally compare results but Rand isn't implemented in the
    // SimpleIREvaluator, even if we could seed it.
@@ -1362,17 +1343,17 @@ TEST(LoopNest, ScheduleInlineIntrinsics) {
    Placeholder a_buf("a", kFloat, {M, N});
    Placeholder b_buf("b", kFloat, {N, K});
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) * b_buf.load(n, k);
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return Intrinsics::make(kSqrt, x->load(m, n, k));
+        return Intrinsics::make(kSqrt, x.load(m, n, k));
        });
  
    PaddedBuffer<float> a_v(M, N);
@@ -1389,9 +1370,9 @@ TEST(LoopNest, ScheduleInlineIntrinsics) {
      }
    }
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
    LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
  
    l1.prepareForCodegen();
    l2.prepareForCodegen();
@@ -1421,21 +1402,21 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
    const int N = 5;
    const int K = 6;
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return Intrinsics::make(kRand, kFloat);
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return Intrinsics::make(kSqrt, x->load(m, n, k));
+        return Intrinsics::make(kSqrt, x.load(m, n, k));
        });
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
-  l1.computeInline(x->buf());
+  LoopNest l1({y}, {x, y});
+  l1.computeInline(x.buf());
  
    StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt());
  
@@ -1451,31 +1432,31 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) {
  // Split a Compute then inline it into another compute.
  TEST(LoopNest, ScheduleSplitAThenInline) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
  
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 4);
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
  }
  
  // Split a Compute then inline another Compute into it.
  TEST(LoopNest, ScheduleSplitBThenInline) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
  
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 3);
-  l.computeInline(a->buf());
+  l.computeInline(a.buf());
    l.prepareForCodegen();
    StmtPtr s = IRSimplifier::simplify(l.root_stmt());
  
@@ -1491,32 +1472,32 @@ TEST(LoopNest, ScheduleSplitBThenInline) {
  // Split a Compute twice then inline it.
  TEST(LoopNest, ScheduleSplitTwiceThenInline) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr i_inner;
  
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 4, &i_inner);
    LoopNest::splitWithMask(i_inner, 2);
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
  }
  
  // Inline a Compute, then split.
  TEST(LoopNest, ScheduleInlineThenSplit) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
  
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  l.computeInline(a->buf());
+  LoopNest l({b}, {a, b});
+  l.computeInline(a.buf());
  
    std::vector<ForPtr> loops = NodeFinder<For>::find(l.root_stmt());
    LoopNest::splitWithMask(loops.back(), 3);
@@ -1534,16 +1515,16 @@ TEST(LoopNest, ScheduleInlineThenSplit) {
  // Split a Compute, inline it, then split the result.
  TEST(LoopNest, ScheduleSplitInlineThenSplit) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
  
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
+  LoopNest l({b}, {a, b});
    auto loops = NodeFinder<For>::find(l.root_stmt());
    LoopNest::splitWithMask(loops.back(), 2);
-  l.computeInline(a->buf());
+  l.computeInline(a.buf());
  
    loops = NodeFinder<For>::find(l.root_stmt());
    LoopNest::splitWithMask(loops.front(), 2);
@@ -1561,35 +1542,35 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) {
  // Oversplit a loop that is simplified out after inlining.
  TEST(LoopNest, ScheduleSplitInlineSimplify) {
    KernelScope kernel_scope;
-  Tensor* a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) {
+  Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) {
      return ExprHandle(4) * i - ExprHandle(2) * i;
    });
-  Tensor* b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
-    return a->load(j) - ExprHandle(1);
+  Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) {
+    return a.load(j) - ExprHandle(1);
    });
  
-  LoopNest l(std::vector<Tensor*>({b}), {a, b});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({b}, {a, b});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 4);
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
  }
  
  // Inline a Compute with two consumers.
  TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
        });
  
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
-  l.computeInline(a->buf());
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
+  l.computeInline(a.buf());
    l.prepareForCodegen();
  
    StmtPtr s = IRSimplifier::simplify(l.root_stmt());
@@ -1607,20 +1588,20 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) {
  // Inline Compute A into B, then inline B into C.
  TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
        });
  
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
-  l.computeInline(a->buf());
-  l.computeInline(b->buf());
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
+  l.computeInline(a.buf());
+  l.computeInline(b.buf());
    l.prepareForCodegen();
  
    StmtPtr s = IRSimplifier::simplify(l.root_stmt());
@@ -1638,19 +1619,19 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) {
  // Inline a Compute that is both a producer and consumer.
  TEST(LoopNest, ScheduleInlineThreeMixedInner) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
        });
  
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
-  l.computeInline(b->buf());
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
+  l.computeInline(b.buf());
    l.prepareForCodegen();
  
    StmtPtr s = IRSimplifier::simplify(l.root_stmt());
@@ -1668,25 +1649,25 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) {
  // Split 3 Computes, then inline the first two into the last.
  TEST(LoopNest, ScheduleInlineThreeMixedSplit) {
    KernelScope kernel_scope;
-  Tensor* a =
+  Tensor a =
        Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; });
-  Tensor* b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
-    return a->load(j + ExprHandle(8));
+  Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) {
+    return a.load(j + ExprHandle(8));
    });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) {
-        return a->load(k) * b->load(l);
+        return a.load(k) * b.load(l);
        });
  
-  LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
+  LoopNest l({c}, {a, b, c});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 4);
-  loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 3);
-  loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
    LoopNest::splitWithMask(loops[0], 2);
  
-  ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
+  ASSERT_THROWS_WITH(l.computeInline(a.buf()), "compound indices");
  }
  
  // Check that inlining works for output tensors too
@@ -1696,21 +1677,21 @@ TEST(LoopNest, ScheduleInlineOutputTensors) {
    const int N = 5;
    const int K = 6;
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return m * n * k;
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + m;
+        return x.load(m, n, k) + m;
        });
  
    LoopNest l1({x, y});
-  l1.computeInline(x->buf());
+  l1.computeInline(x.buf());
  
    // would normally compare results but Rand isn't implemented in the
    // SimpleIREvaluator, even if we could seed it.
@@ -1736,14 +1717,14 @@ TEST(LoopNest, ScheduleFuserStyle) {
  
    Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
  
-  Tensor* b = Compute(
+  Tensor b = Compute(
        "f", {{kTotalSize, "i"}}, [&](const std::vector<VarHandle>& axes) {
          return a_buf.load(axes[0]) + 11.0f;
        });
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "g", {{kTotalSize, "i"}}, [&](const std::vector<VarHandle>& axes) {
-        return b->load(axes[0]) + 1.0f;
+        return b.load(axes[0]) + 1.0f;
        });
  
    LoopNest l({b, c});
@@ -1772,17 +1753,17 @@ TEST(LoopNest, ScheduleFuserThreeArg) {
    Placeholder c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat));
    Placeholder d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat));
  
-  Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+  Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
      return a.load(i) + b.load(i);
    });
-  Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
-    return e->load(i) + c.load(i);
+  Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+    return e.load(i) + c.load(i);
    });
-  Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
-    return f->load(i) + d.load(i);
+  Tensor g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+    return f.load(i) + d.load(i);
    });
  
-  LoopNest l(std::vector<Tensor*>({g}), {e, f, g});
+  LoopNest l({g}, {e, f, g});
    l.computeInline(l.getLoopBodyFor(e));
    l.computeInline(l.getLoopBodyFor(f));
    l.prepareForCodegen();
@@ -1807,7 +1788,7 @@ TEST(LoopNest, ScheduleDynamicShape2D) {
      VarHandle n("n", kInt);
      Placeholder a(BufHandle("a", {m, n}, kFloat));
      Placeholder b(BufHandle("b", {m, n}, kFloat));
-    Tensor* c = Compute(
+    Tensor c = Compute(
          "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) {
            return a.load(i, j) + b.load(i, j);
          });
@@ -1842,12 +1823,12 @@ TEST(LoopNest, LoopNestComputeAt_1) {
    // and the temp should be used in B.
    KernelScope kernel_scope;
    VarHandle N("N", kInt);
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; });
-  Tensor* B = Compute(
-      "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); });
-  LoopNest l(std::vector<Tensor*>({B}), {A, B});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0);
+  Tensor B = Compute(
+      "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A.load(i_b); });
+  LoopNest l({B}, {A, B});
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0);
    LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
    l.prepareForCodegen();
    StmtPtr s = l.root_stmt();
@@ -1891,16 +1872,16 @@ TEST(LoopNest, LoopNestComputeAt_2) {
    const int kW = 16, kH = 16;
    VarHandle W("W", kInt);
    VarHandle H("H", kInt);
-  Tensor* p = Compute(
+  Tensor p = Compute(
        "prod",
        {{H + 1, "py"}, {W + 1, "px"}},
        [&](const VarHandle& py, const VarHandle& px) { return px * py; });
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "cons",
        {{H, "cy"}, {W, "cx"}},
        [&](const VarHandle& y, const VarHandle& x) {
-        return p->load(y, x) + p->load(y + 1, x) + p->load(y, x + 1) +
-            p->load(y + 1, x + 1);
+        return p.load(y, x) + p.load(y + 1, x) + p.load(y, x + 1) +
+            p.load(y + 1, x + 1);
        });
  
    std::vector<int> c_ref(kW * kH, 0);
@@ -1909,12 +1890,12 @@ TEST(LoopNest, LoopNestComputeAt_2) {
        c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
      }
    }
-  LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
+  LoopNest orig_loopnest({c}, {p, c});
  
    {
      // First let's try to compute P at axis cy (the outer loop)
      LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
      LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]);
      l.prepareForCodegen();
      StmtPtr s = l.root_stmt();
@@ -1940,7 +1921,7 @@ TEST(LoopNest, LoopNestComputeAt_2) {
    {
      // Now let's try to compute P at axis cx (the inner loop)
      LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
      LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]);
      l.prepareForCodegen();
      StmtPtr s = l.root_stmt();
@@ -1979,27 +1960,25 @@ TEST(LoopNest, LoopNestComputeAt_3) {
    const int kW = 16, kH = 16;
    VarHandle W("W", kInt);
    VarHandle H("H", kInt);
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A",
        {{H + 1, "ay"}, {W + 1, "ax"}},
        [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B",
        {{H + 1, "by"}, {W + 1, "bx"}},
-      [&](const VarHandle& by, const VarHandle& bx) {
-        return A->load(by, bx);
-      });
-  Tensor* C = Compute(
+      [&](const VarHandle& by, const VarHandle& bx) { return A.load(by, bx); });
+  Tensor C = Compute(
        "C",
        {{H, "cy"}, {W, "cx"}},
        [&](const VarHandle& cy, const VarHandle& cx) {
-        return B->load(cy, cx + 1);
+        return B.load(cy, cx + 1);
        });
-  Tensor* D = Compute(
+  Tensor D = Compute(
        "D",
        {{H, "dy"}, {W, "dx"}},
        [&](const VarHandle& dy, const VarHandle& dx) {
-        return A->load(dy + 1, dx) + C->load(dy, dx);
+        return A.load(dy + 1, dx) + C.load(dy, dx);
        });
  
    std::vector<int> c_ref(kW * kH, 0);
@@ -2009,11 +1988,11 @@ TEST(LoopNest, LoopNestComputeAt_3) {
      }
    }
  
-  LoopNest orig_loopnest(std::vector<Tensor*>({D}), {A, B, C, D});
+  LoopNest orig_loopnest({D}, {A, B, C, D});
    {
      // First let's try to compute A at axis dy (the outer loop)
      LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0);
      LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
      l.prepareForCodegen();
      StmtPtr s = l.root_stmt();
@@ -2044,7 +2023,7 @@ TEST(LoopNest, LoopNestComputeAt_3) {
    {
      // Now let's try to compute A at axis dx (the inner loop)
      LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(D.buf()).at(0);
      LoopNest::computeAt(l.getLoopBodyFor(A), loops[1]);
      l.prepareForCodegen();
      StmtPtr s = l.root_stmt();
@@ -2083,15 +2062,15 @@ TEST(LoopNest, Reduce2dComputeAt) {
    VarHandle W("W", kInt);
    VarHandle H("H", kInt);
  
-  Tensor* p =
+  Tensor p =
        Compute("prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](Axis py, Axis px) {
          return px * py;
        });
-  Tensor* c = Reduce(
+  Tensor c = Reduce(
        "cons",
        {{H, "cy"}, {W, "cx"}},
        Sum(),
-      [&](Axis y, Axis x, Axis r, Axis s) { return p->load(y + r, x + s); },
+      [&](Axis y, Axis x, Axis r, Axis s) { return p.load(y + r, x + s); },
        {{2, "r"}, {2, "s"}});
  
    std::vector<int> c_ref(kW * kH, 0);
@@ -2100,7 +2079,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
        c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
      }
    }
-  LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
+  LoopNest orig_loopnest({c}, {p, c});
    checkIR(orig_loopnest.root_stmt(), R"IR(
  # CHECK: for (int py = 0; py < H + 1; py++) {
  # CHECK:   for (int px = 0; px < W + 1; px++) {
@@ -2122,7 +2101,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
    {
      // First let's try to compute P at axis cy (the outer loop)
      LoopNest l(orig_loopnest);
-    auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
      LoopNest::computeAt(l.getLoopBodyFor(p), loops[0]);
      // FIXME: Calling simplify here breaks the IR:
      // MALFORMED INPUT: could not find base node in Load - temp[...]
@@ -2159,7 +2138,7 @@ TEST(LoopNest, Reduce2dComputeAt) {
    {
      // Now let's try to compute P at axis cx (the inner loop)
      LoopNest l(orig_loopnest);
-    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+    std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
      LoopNest::computeAt(l.getLoopBodyFor(p), loops[1]);
      l.simplify();
      l.eliminateDeadStores();
@@ -2205,17 +2184,17 @@ TEST(LoopNest, DISABLED_Conv1d_NH) {
    int Pad = 1;
    Placeholder IP("input", kFloat, {H});
  
-  Tensor* A =
+  Tensor A =
        Compute("A", {{N, "np"}, {H + 2 * Pad, "hp"}}, [&](Axis n, Axis h) {
          auto cond = CompareSelect::make(h, Pad, 1, 0, kLT);
          cond = CompareSelect::make(h, H + Pad, 1, cond, kGE);
          return ifThenElse(cond, 0.f, IP.load(n, h - Pad));
        });
-  Tensor* B = Reduce(
+  Tensor B = Reduce(
        "B",
        {{N, "n"}, {H, "h"}},
        Sum(),
-      [&](Axis n, Axis h, Axis r) { return A->load(n, h + r); },
+      [&](Axis n, Axis h, Axis r) { return A.load(n, h + r); },
        {{R, "r"}});
    LoopNest l({B});
    checkIR(l.root_stmt(), R"IR(
@@ -2233,7 +2212,7 @@ TEST(LoopNest, DISABLED_Conv1d_NH) {
  # CHECK:   }
  # CHECK: }
  )IR");
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0);
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0);
    LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
    // FIXME: The current IR is totally broken.  The body of the inlined loop is:
  
@@ -2292,7 +2271,7 @@ class LoopOrderHelper : public IRVisitor {
  
  TEST(LoopNest, LoopNestReorderAxis1) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) {
          return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
        });
@@ -2303,7 +2282,7 @@ TEST(LoopNest, LoopNestReorderAxis1) {
    SimpleIREvaluator cg(stmt1, {tensor});
    cg.call({stmt1_output});
  
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[0], loops[1]);
    StmtPtr stmt2 = Stmt::clone(l.root_stmt());
  
@@ -2324,7 +2303,7 @@ TEST(LoopNest, LoopNestReorderAxis1) {
    }
  
    // Reorder them back.
-  loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[0], loops[1]);
    StmtPtr stmt3 = l.root_stmt();
  
@@ -2341,7 +2320,7 @@ TEST(LoopNest, LoopNestReorderAxis1) {
  
  TEST(LoopNest, LoopNestReorderPartialAxes) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f",
        {{2, "x"}, {3, "y"}, {4, "z"}},
        [](const VarHandle& x, const VarHandle& y, const VarHandle& z) {
@@ -2358,7 +2337,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
    SimpleIREvaluator cg(stmt1, {tensor});
    cg.call({stmt1_output});
  
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[0], loops[1]);
    ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,");
  
@@ -2372,7 +2351,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
      ASSERT_EQ(stmt1_output[i], stmt2_output[i]);
    }
  
-  loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[1], loops[2]);
    ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,");
  
@@ -2389,7 +2368,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) {
  
  TEST(LoopNest, LoopNestReorderInternalAxis) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f",
        {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}},
        [](const VarHandle& w,
@@ -2409,7 +2388,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) {
    SimpleIREvaluator cg(stmt1, {tensor});
    cg.call({stmt1_output});
  
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[2], loops[1]);
    ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,");
  
@@ -2426,7 +2405,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) {
  
  TEST(LoopNest, LoopNestReorderEnclosingAxis) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f",
        {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}},
        [](const VarHandle& w,
@@ -2445,7 +2424,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) {
    SimpleIREvaluator cg(stmt1, {tensor});
    cg.call({stmt1_output});
  
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[0], loops[3]);
    ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,");
  
@@ -2462,14 +2441,14 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) {
  
  TEST(LoopNest, LoopNestReorderSameAxis) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) {
          return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
        });
    LoopNest l({tensor});
    StmtPtr stmt1 = Stmt::clone(l.root_stmt());
  
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[1], loops[1]);
    StmtPtr stmt2 = Stmt::clone(l.root_stmt());
  
@@ -2492,7 +2471,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
  
    KernelScope kernel_scope;
  
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f",
        {{2, "x"}, {3, "y"}, {4, "z"}},
        [](const VarHandle& x, const VarHandle& y, const VarHandle& z) {
@@ -2503,7 +2482,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
  
    Placeholder extra(BufHandle("res", {6, 3}, kFloat));
  
-  auto loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
  
    VarHandle i = VarHandle(loops[0]->var());
  
@@ -2589,7 +2568,7 @@ TEST(LoopNest, LoopNestReorderExtraStatements) {
     *
     *
     */
-  loops = l.getAllLoopNestsWritingToBuf(tensor->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0);
    LoopNest::reorderAxis(loops[0], loops[2]);
    StmtPtr stmt3 = Stmt::clone(l.root_stmt());
  
@@ -2628,7 +2607,7 @@ void LoopNestReorderTestHelper(
      int index2) {
    KernelScope kernel_scope;
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "5d",
        {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
        [](const std::vector<VarHandle>&) { return -1; });
@@ -2636,7 +2615,7 @@ void LoopNestReorderTestHelper(
  
    Placeholder extra(BufHandle("extra", {5}, kInt));
  
-  auto loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+  auto loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
    int j = 0;
    for (auto l : loops) {
      // Add an increment at each layer of the loop which counts the number of
@@ -2677,7 +2656,7 @@ void LoopNestReorderTestHelper(
      ASSERT_EQ(extra1[i], expected_loops);
    }
  
-  loops = l.getAllLoopNestsWritingToBuf(c->buf()).at(0);
+  loops = l.getAllLoopNestsWritingToBuf(c.buf()).at(0);
    LoopNest::reorderAxis(loops[index1], loops[index2]);
    StmtPtr stmt2 = Stmt::clone(l.root_stmt());
  
@@ -2752,26 +2731,26 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) {
    Placeholder c_buf("c", kFloat, {M, N});
    Placeholder d_buf("d", kFloat, {M, K});
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) * b_buf.load(n, k);
        });
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "y",
        {{M, "m2"}, {N, "n2"}, {K, "k2"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
+        return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k);
        });
-  Tensor* z = Compute(
+  Tensor z = Compute(
        "z",
        {{M, "m3"}, {N, "n3"}, {K, "k3"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return x->load(m, n, k) + y->load(m, n, k);
+        return x.load(m, n, k) + y.load(m, n, k);
        });
  
-  LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
+  LoopNest l({z}, {x, y, z});
    ForPtr a = nullptr;
    ForPtr b = nullptr;
    auto fors = NodeFinder<For>::find(l.root_stmt());
@@ -2845,14 +2824,14 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) {
  
  TEST(LoopNest, OuterLoopVectorization) {
    KernelScope kernel_scope;
-  Tensor* tensor = Compute(
+  Tensor tensor = Compute(
        "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) {
          return ExprHandle(1.0f) + cast<float>(x) * x + cast<float>(y) * y;
        });
    LoopNest l({tensor});
  
    ASSERT_TRUE(
-      LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor->buf())[0][0]));
+      LoopNest::vectorize(l.getAllLoopNestsWritingToBuf(tensor.buf())[0][0]));
  
    StmtPtr root_stmt = l.root_stmt();
    BlockPtr outer_block = to<Block>(root_stmt);
@@ -2899,10 +2878,10 @@ namespace {
  std::string constantUpperBoundLoopIR(int upper_bound_val) {
    KernelScope kernel_scope;
    ExprHandle upper_bound(upper_bound_val);
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; });
    LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
    StmtPtr unrolled = nullptr;
    LoopNest::unroll(loops[0], &unrolled);
    std::ostringstream oss;
@@ -2927,12 +2906,12 @@ TEST(LoopNest, UnrollOuter) {
    KernelScope kernel_scope;
    ExprHandle outer_bound(3);
    ExprHandle inner_bound(4);
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A",
        {{outer_bound, "x"}, {inner_bound, "y"}},
        [&](const VarHandle& x, const VarHandle& y) { return x + y; });
    LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
    StmtPtr unrolled = nullptr;
    LoopNest::unroll(loops[0], &unrolled);
    checkIR(unrolled, R"IR(
@@ -2951,12 +2930,12 @@ TEST(LoopNest, UnrollInner) {
    KernelScope kernel_scope;
    ExprHandle outer_bound(3);
    ExprHandle inner_bound(4);
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A",
        {{outer_bound, "x"}, {inner_bound, "y"}},
        [&](const VarHandle& x, const VarHandle& y) { return x + y; });
    LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
    StmtPtr unrolled = nullptr;
    LoopNest::unroll(
        static_to<For>(loops[0]->body()->stmts().front()), &unrolled);
@@ -3044,10 +3023,10 @@ TEST(LoopNest, UnrollEmpty) {
  TEST(LoopNest, NoUnroll) {
    KernelScope kernel_scope;
    VarHandle upper_bound("N", kInt);
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; });
    LoopNest l({A});
-  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A->buf())[0];
+  std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(A.buf())[0];
    StmtPtr unrolled = nullptr;
    ASSERT_THROWS_WITH(
        LoopNest::unroll(loops[0], &unrolled), "non-constant loop");
@@ -3326,7 +3305,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) {
    // Create a dummy tensor to construct LoopNest.
    ExprHandle n(100);
    Placeholder a(BufHandle("a", {n}, kFloat));
-  Tensor* b =
+  Tensor b =
        Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); });
    LoopNest l({b});
  
@@ -3615,12 +3594,12 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) {
    VarHandle m("m", kInt);
    VarHandle n("n", kInt);
    Placeholder b(BufHandle("b", {m, n}, kFloat));
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
    LoopNest loop({c});
    HashProvider hasher;
    auto hash_before = hasher.hash(loop.root_stmt());
  
-  auto loops = loop.getAllLoopNestsWritingToBuf(c->buf())[1];
+  auto loops = loop.getAllLoopNestsWritingToBuf(c.buf())[1];
    ForPtr flattened = nullptr;
    ASSERT_FALSE(LoopNest::flatten(loops, &flattened));
    ASSERT_EQ(flattened, nullptr);
@@ -3673,14 +3652,14 @@ TEST(LoopNest, DetectInlineRankMismatch) {
    const int kTotalSize = 8;
  
    Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
-  Tensor* a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
+  Tensor a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) {
      return a_buf.load(i);
    });
-  Tensor* reshape = Compute(
+  Tensor reshape = Compute(
        "reshape",
        {{kTotalSize / 2, "i"}, {2, "j"}},
-      [&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); });
-  LoopNest l(std::vector<Tensor*>({reshape}), {a, reshape});
+      [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j); });
+  LoopNest l({reshape}, {a, reshape});
    ASSERT_THROWS_WITH(
        l.computeInline(l.getLoopBodyFor(a)),
        "Placeholder indexed access is inconsistent with its rank");
@@ -3689,22 +3668,22 @@ TEST(LoopNest, DetectInlineRankMismatch) {
  TEST(LoopNest, CacheReadsSimple) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 3);
+        return A.load(i + 30, j + 3);
        });
-  Tensor* C = Compute(
+  Tensor C = Compute(
        "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
        });
  
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1];
+  LoopNest::cacheAccesses(A.buf(), "A_local", j_loop);
  
    l.prepareForCodegen();
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -3757,22 +3736,22 @@ TEST(LoopNest, CacheReadsSimple) {
  TEST(LoopNest, CacheReadsOuter) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 40) + A->load(i + 31, j + 41);
+        return A.load(i + 30, j + 40) + A.load(i + 31, j + 41);
        });
-  Tensor* C = Compute(
+  Tensor C = Compute(
        "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
        });
  
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0];
-  LoopNest::cacheAccesses(A->buf(), "A_local", i_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][0];
+  LoopNest::cacheAccesses(A.buf(), "A_local", i_loop);
  
    l.prepareForCodegen();
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -3805,22 +3784,22 @@ TEST(LoopNest, CacheReadsOuter) {
  TEST(LoopNest, CacheReadsInternal) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 40) + A->load(i + 31, j + 41);
+        return A.load(i + 30, j + 40) + A.load(i + 31, j + 41);
        });
-  Tensor* C = Compute(
+  Tensor C = Compute(
        "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
        });
  
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1];
+  LoopNest::cacheAccesses(A.buf(), "A_local", j_loop);
    l.prepareForCodegen();
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
  
@@ -3852,23 +3831,23 @@ TEST(LoopNest, CacheReadsInternal) {
  TEST(LoopNest, CacheReadsInner) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
    // note im changing the offset of the first arg of the first call to A.
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 34, j + 40) + A->load(i + 30, j + 41);
+        return A.load(i + 34, j + 40) + A.load(i + 30, j + 41);
        });
-  Tensor* C = Compute(
+  Tensor C = Compute(
        "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
        });
  
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
+  LoopNest l({B, C}, {A, B, C});
    StmtPtr body = l.getLoopBodyFor(B);
-  LoopNest::cacheAccesses(A->buf(), "A_local", body);
+  LoopNest::cacheAccesses(A.buf(), "A_local", body);
    l.prepareForCodegen();
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
  
@@ -3900,22 +3879,22 @@ TEST(LoopNest, CacheReadsInner) {
  TEST(LoopNest, CacheWritesSimple) {
    KernelScope kernel_scope;
  
-  Tensor* A = Compute(
+  Tensor A = Compute(
        "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) {
          return i * j;
        });
-  Tensor* B = Compute(
+  Tensor B = Compute(
        "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 30, j + 40) + A->load(i + 31, j + 41);
+        return A.load(i + 30, j + 40) + A.load(i + 31, j + 41);
        });
-  Tensor* C = Compute(
+  Tensor C = Compute(
        "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
+        return A.load(i + 10, j + 20) + A.load(i + 30, j + 40);
        });
  
-  LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
-  StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1];
-  LoopNest::cacheAccesses(A->buf(), "A_local", a_loop);
+  LoopNest l({B, C}, {A, B, C});
+  StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A.buf())[0][1];
+  LoopNest::cacheAccesses(A.buf(), "A_local", a_loop);
  
    l.prepareForCodegen();
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -4054,7 +4033,7 @@ TEST(LoopNest, CompoundTensorSimple) {
    auto outer_for2 = For::make(x, 0, 10, inner_for2);
    BlockPtr body = Block::make({outer_for1, outer_for2});
  
-  Tensor* A = new Tensor(a_buf.node(), body);
+  Tensor A = Tensor(a_buf.node(), body);
  
    LoopNest l({A});
    l.prepareForCodegen();
@@ -4080,22 +4059,22 @@ TEST(LoopNest, InlineConstantIndex) {
    KernelScope kernel_scope;
    const int N = 10;
    Placeholder x_buf("a", kFloat, {1, N, 1});
-  Tensor* y = Compute(
+  Tensor y = Compute(
        "f",
        {{1, "m"}, {N, "n"}, {1, "o"}},
        [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) {
          return x_buf.load(m, n, o);
        });
-  Tensor* z = Compute(
+  Tensor z = Compute(
        "f",
        {{1, "m"}, {N, "n"}, {1, "o"}},
        [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) {
-        return y->load(m, n, o);
+        return y.load(m, n, o);
        });
  
-  LoopNest l(std::vector<Tensor*>({z}), {y, z});
+  LoopNest l({z}, {y, z});
    l.simplify();
-  ASSERT_TRUE(l.computeInline(y->buf()));
+  ASSERT_TRUE(l.computeInline(y.buf()));
  }
  
  TEST(LoopNest, CompoundTensorUsed) {
@@ -4115,14 +4094,14 @@ TEST(LoopNest, CompoundTensorUsed) {
    auto outer_for2 = For::make(x, 0, 10, inner_for2);
    BlockPtr body = Block::make({outer_for1, outer_for2});
  
-  Tensor* A = new Tensor(a_buf.node(), body);
-  Tensor* B = Compute(
+  Tensor A = Tensor(a_buf.node(), body);
+  Tensor B = Compute(
        "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) {
-        return A->load(i, j + 1) + A->load(i, j + 2);
+        return A.load(i, j + 1) + A.load(i, j + 2);
        });
  
-  LoopNest l(std::vector<Tensor*>({B}), {A, B});
-  ASSERT_FALSE(l.computeInline(A->buf()));
+  LoopNest l({B}, {A, B});
+  ASSERT_FALSE(l.computeInline(A.buf()));
    l.prepareForCodegen();
  
    std::vector<int> a_data(50, 0);
@@ -4707,12 +4686,10 @@ TEST(LoopNest, OptimizeConditionalsNotNormalized) {
    ASSERT_EQ(hash_before, hash_after);
  }
  
-static std::pair<std::unique_ptr<Placeholder>, Tensor*> colReduce(
-    int M,
-    int N) {
+static std::pair<std::unique_ptr<Placeholder>, Tensor> colReduce(int M, int N) {
    auto a =
        std::make_unique<Placeholder>("a", kFloat, std::vector<ExprHandle>{M, N});
-  Tensor* t = Reduce(
+  Tensor t = Reduce(
        "b",
        {{N, "n"}},
        Sum(),
@@ -4721,10 +4698,10 @@ static std::pair<std::unique_ptr<Placeholder>, Tensor*> colReduce(
    return {std::move(a), t};
  }
  
-static StmtPtr splitTailReorder(Tensor* b) {
+static StmtPtr splitTailReorder(Tensor b) {
    constexpr int kVectorWidth = 8;
    LoopNest nest({b});
-  auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0];
+  auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0];
    nest.splitWithTail(loops[0], kVectorWidth);
    // Now the loopnests will look like:
    //
@@ -4745,24 +4722,24 @@ static StmtPtr splitTailReorder(Tensor* b) {
    // Write #2: "b[n_outer * 8 + n_inner] = ReduceOp(...)"
    // Loopnest #2: {n_outer, n_inner, m};
    // We will have to reorder n_inner and m.
-  auto loopnests = nest.getAllLoopNestsWritingToBuf(b->buf());
+  auto loopnests = nest.getAllLoopNestsWritingToBuf(b.buf());
    LoopNest::reorderAxis(loopnests[1][1], loopnests[1][2]);
    nest.prepareForCodegen();
    return nest.root_stmt();
  }
  
-static StmtPtr splitMaskReorder(Tensor* b) {
+static StmtPtr splitMaskReorder(Tensor b) {
    constexpr int kVectorWidth = 8;
    LoopNest nest({b});
-  auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1];
+  auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1];
    nest.splitWithMask(loops[0], kVectorWidth);
-  loops = nest.getAllLoopNestsWritingToBuf(b->buf())[1];
+  loops = nest.getAllLoopNestsWritingToBuf(b.buf())[1];
    LoopNest::reorderAxis(loops[1], loops[2]);
    nest.prepareForCodegen();
    return nest.root_stmt();
  }
  
-static void checkColReduce(StmtPtr s, Placeholder& p, Tensor* t) {
+static void checkColReduce(StmtPtr s, Placeholder& p, Tensor t) {
    int M = immediateAs<int>(p.dim(0));
    int N = immediateAs<int>(p.dim(1));
    PaddedBuffer<float> a(M, N);
@@ -4893,14 +4870,14 @@ TEST(LoopNest, VectorizeUse) {
    KernelScope kernel_scope;
    constexpr int N = 8;
    Placeholder a("a", kFloat, {N});
-  Tensor* b = Compute(
+  Tensor b = Compute(
        "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; });
-  Tensor* c = Compute(
-      "c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; });
-  LoopNest nest(std::vector<Tensor*>({c}), {b, c});
-  auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0];
+  Tensor c = Compute(
+      "c", {{N, "n"}}, [&](const VarHandle& n) { return b.load(n) + 2.0f; });
+  LoopNest nest({c}, {b, c});
+  auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0];
    ASSERT_TRUE(LoopNest::vectorize(loops[0]));
-  loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0];
+  loops = nest.getAllLoopNestsWritingToBuf(c.buf())[0];
    ASSERT_TRUE(LoopNest::vectorize(loops[0]));
    nest.prepareForCodegen();
    // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
@@ -4939,7 +4916,7 @@ TEST(LoopNest, Int64Compute) {
  
    constexpr int64_t N = 12;
    Placeholder a("a", kLong, {N});
-  Tensor* b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) {
+  Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) {
      return a.load(n) + LongImm::make(1l);
    });
    LoopNest nest({b});
diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp

index 9503f9d..db37b66 100644 (file)
--- a/test/cpp/tensorexpr/test_memdependency.cpp
+++ b/test/cpp/tensorexpr/test_memdependency.cpp
@@ -2726,28 +2726,28 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) {
    // Can determine if 2 loops created by Compute are dependent.
    Placeholder a_buf("a", kFloat, {4, 5});
    Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "broadcast_add",
        {{4, "m"}, {5, "n"}, {6, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) + b_buf.load(n, k);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "d",
        {{4, "m"}, {5, "n"}, {6, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c->load(m, n, k) + 1;
+        return c.load(m, n, k) + 1;
        });
  
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
+  LoopNest l({d}, {c, d});
  
-  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
+  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()});
  
    l.root_stmt()->accept(&analyzer);
  
    // Sanity test: Output depends on input.
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data()));
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data()));
  
    // Second loop depends on first loop.
    auto c_loop = l.getLoopStmtsFor(c)[0];
@@ -2773,32 +2773,32 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) {
  
    Placeholder a_buf("a", kFloat, {4, 5});
    Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "broadcast_add",
        {{4, "m"}, {5, "n"}, {6, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n) + b_buf.load(n, k);
        });
-  Tensor* d = Compute(
+  Tensor d = Compute(
        "d",
        {{4, "m"}, {5, "n"}, {6, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
-        return c->load(m, n, k) + 1;
+        return c.load(m, n, k) + 1;
        });
  
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
-  l.computeInline(c->buf());
+  LoopNest l({d}, {c, d});
+  l.computeInline(c.buf());
  
-  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
+  MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d.buf()});
    l.root_stmt()->accept(&analyzer);
  
    // Sanity test: Output depends on input.
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a_buf.data()));
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a_buf.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b_buf.data()));
  
    // broadcast_add tensor should not appear in trace at all.
    for (auto& wi : analyzer.getHistory()) {
-    ASSERT_NE(wi->var(), c->buf()->base_handle());
+    ASSERT_NE(wi->var(), c.buf()->base_handle());
    }
  }
  
@@ -2810,7 +2810,7 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) {
  
    Placeholder a_buf("a", kFloat, {4, 5});
    Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "broadcast_add",
        {{4, "m"}, {5, "n"}, {6, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -2819,13 +2819,12 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) {
  
    LoopNest l({c});
  
-  MemDependencyChecker analyzer_before(
-      {a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()});
    l.root_stmt()->accept(&analyzer_before);
  
    l.splitWithTail(l.getLoopStmtsFor(c)[0], 2);
  
-  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()});
    StmtPtr stmt = IRSimplifier::simplify(l.root_stmt());
    stmt->accept(&analyzer_after);
  
@@ -2859,7 +2858,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) {
  
    Placeholder a_buf("a", kFloat, {4, 5});
    Placeholder b_buf("b", kFloat, {5, 6});
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "broadcast_add",
        {{4, "m"}, {5, "n"}, {6, "k"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -2868,14 +2867,13 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) {
  
    LoopNest l({c});
  
-  MemDependencyChecker analyzer_before(
-      {a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_before({a_buf.data(), b_buf.data()}, {c.buf()});
    l.root_stmt()->accept(&analyzer_before);
  
    auto loops = l.getLoopStmtsFor(c);
    l.reorderAxis(loops[0], loops[1]);
  
-  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c->buf()});
+  MemDependencyChecker analyzer_after({a_buf.data(), b_buf.data()}, {c.buf()});
    StmtPtr stmt = IRSimplifier::simplify(l.root_stmt());
    stmt->accept(&analyzer_after);
  
@@ -2928,22 +2926,22 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) {
    Placeholder a(BufHandle("a", {2, 3, 6}, kFloat));
    Placeholder b(BufHandle("b", {2, 3, 6}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{2, "l2"}, {3, "n1"}, {6, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}});
-  LoopNest l(std::vector<Tensor*>({d}), {c, d});
+  Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}});
+  LoopNest l({d}, {c, d});
  
-  MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()});
+  MemDependencyChecker analyzer({a.data(), b.data()}, {d.buf()});
  
    l.root_stmt()->accept(&analyzer);
  
    // Sanity test: Output depends on input.
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), a.data()));
-  ASSERT_TRUE(analyzer.dependsIndirectly(d->buf(), b.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), a.data()));
+  ASSERT_TRUE(analyzer.dependsIndirectly(d.buf(), b.data()));
  
    // Second loop depends on first loop.
    auto c_loop = l.getLoopStmtsFor(c)[0];
@@ -2965,7 +2963,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
  
    Placeholder AP(BufHandle("A", {M, K}, kFloat));
    Placeholder BP(BufHandle("B", {K, N}, kFloat));
-  Tensor* CT = Reduce(
+  Tensor CT = Reduce(
        "gemm",
        {{M, "M"}, {N, "N"}},
        Sum(),
@@ -3011,7 +3009,7 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
    }
    {
      auto const& loops = loop.getLoopStmtsFor(CT);
-    loop.cacheAccesses(CT->buf(), "C_regs", loops[2]);
+    loop.cacheAccesses(CT.buf(), "C_regs", loops[2]);
    }
  
    MemDependencyChecker analyzer_unlowered(
@@ -3026,12 +3024,12 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) {
      stmt->accept(&analyzer_unlowered);
  
      // Outputs depend on inputs.
-    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), AP.data()));
-    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT->buf(), BP.data()));
+    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), AP.data()));
+    ASSERT_TRUE(analyzer_unlowered.dependsIndirectly(CT.buf(), BP.data()));
  
      // The last write to gemm should cover the total bound of the output.
      std::shared_ptr<AccessInfo> outputAccess =
-        analyzer_unlowered.output(CT->buf());
+        analyzer_unlowered.output(CT.buf());
      // A single dependency.
      ASSERT_EQ(outputAccess->dependencies().size(), 1);
  
diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp

index 674dbd9..122a498 100644 (file)
--- a/test/cpp/tensorexpr/test_ops.cpp
+++ b/test/cpp/tensorexpr/test_ops.cpp
@@ -6,7 +6,7 @@
  
  using namespace torch::jit::tensorexpr;
  
-using Tensors = std::vector<Tensor*>;
+using Tensors = std::vector<Tensor>;
  using Args = std::vector<CodeGen::BufferArg>;
  std::unique_ptr<SimpleIREvaluator> compile(
      const Args& inputs,
@@ -28,7 +28,7 @@ TEST(Ops, Sum) {
      constexpr int N = 16;
  
      Placeholder a("a", kFloat, {M, N});
-    Tensor* b = computeSum({a.handle(), dims, false}, c10::kFloat);
+    Tensor b = computeSum({a.handle(), dims, false}, c10::kFloat);
      auto cg = compile({a}, {b});
  
      auto at = at::arange(M * N, at::kFloat).view({M, N});
diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp

index 449edac..6620ef2 100644 (file)
--- a/test/cpp/tensorexpr/test_reductions.cpp
+++ b/test/cpp/tensorexpr/test_reductions.cpp
@@ -35,7 +35,7 @@ TEST(Reductions, ReduceSum0D_1) {
  
    std::vector<float> out(M, -1.f);
  
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {});
    LoopNest loop({c});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
@@ -59,7 +59,7 @@ TEST(Reductions, ReduceSum0D_2) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {});
+  Tensor c = Reduce("sum", {}, Sum(), b, {});
    LoopNest loop({c});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
@@ -83,7 +83,7 @@ TEST(Reductions, ReduceSum1D) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{10, "m"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{10, "m"}});
    LoopNest loop({c});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
@@ -114,7 +114,7 @@ TEST(Reductions, ReduceSum2D) {
  
    std::vector<float> out(M, -1.f);
  
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
    LoopNest loop({c});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
@@ -145,7 +145,7 @@ TEST(Reductions, ReduceSum3D) {
  
    Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
  
-  Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});
+  Tensor c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});
    LoopNest loop({c});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
@@ -175,7 +175,7 @@ TEST(Reductions, ReduceSum3D) {
      ASSERT_EQ(cData[i], expected);
    }
  
-  Tensor* d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}});
+  Tensor d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}});
    LoopNest loop2({d});
    loop2.prepareForCodegen();
    StmtPtr s2 = loop2.root_stmt();
@@ -192,8 +192,8 @@ TEST(Reductions, ReduceSum3D) {
    }
  
    // This is the same as just reducing the original result across that axis.
-  Placeholder c_buf(BufHandle(c->buf()));
-  Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}});
+  Placeholder c_buf(BufHandle(c.buf()));
+  Tensor e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}});
    LoopNest loop3({e});
    loop3.prepareForCodegen();
    StmtPtr s3 = loop3.root_stmt();
@@ -219,7 +219,7 @@ TEST(Reductions, ReduceSum10D) {
    std::vector<float> in(InputSize, 1.f);
    std::vector<float> out(OutputSize, -1.f);
  
-  Tensor* c = Reduce(
+  Tensor c = Reduce(
        "sum",
        {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}},
        Sum(),
@@ -261,7 +261,7 @@ TEST(Reductions, ReduceProduct) {
    Reducer product(
        ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; });
  
-  Tensor* c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}});
+  Tensor c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}});
    LoopNest loop({c});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
@@ -294,7 +294,7 @@ TEST(Reductions, ReduceMax) {
      in[j] = j;
    }
  
-  Tensor* dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}});
+  Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}});
  
    LoopNest loop({dm1});
    loop.prepareForCodegen();
@@ -309,7 +309,7 @@ TEST(Reductions, ReduceMax) {
    Placeholder in2_(BufHandle("b", {2, 5}, kFloat));
    std::vector<float> out2(2, -1.f);
  
-  Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}});
+  Tensor m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}});
  
    LoopNest loop2({m2d});
    loop2.prepareForCodegen();
@@ -336,7 +336,7 @@ TEST(Reductions, ReduceMinCustomInitializer) {
      in[j] = 10 + j;
    }
  
-  Tensor* min = Reduce(
+  Tensor min = Reduce(
        "min",
        {},
        Minimum(ExprHandle(minInit)),
@@ -372,7 +372,7 @@ TEST(Reductions, ReduceAnyAll) {
      return CompareSelect::make(a, 1, 1, b, kEQ);
    });
  
-  Tensor* any = Reduce(
+  Tensor any = Reduce(
        "anyEqual",
        {{4, "i"}},
        anyEqSV,
@@ -415,7 +415,7 @@ TEST(Reductions, ReduceAnyAll) {
      return CompareSelect::make(a, 0, 0, b, kEQ);
    });
  
-  Tensor* allGreaterThan = Reduce(
+  Tensor allGreaterThan = Reduce(
        "allGreaterThan",
        {{4, "i"}},
        allGTSV,
@@ -465,7 +465,7 @@ TEST(Reductions, ReduceMatmul2D) {
      }
    }
  
-  Tensor* mm = Reduce(
+  Tensor mm = Reduce(
        "mm",
        {{3, "m"}, {3, "n"}},
        Sum(),
@@ -501,10 +501,10 @@ TEST(Reductions, ReduceRfactorLike) {
    std::vector<float> in_rf_(10, -2.f);
    std::vector<float> out(1, -1.f);
  
-  Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}});
-  Placeholder in_rf(BufHandle(l1->buf()));
+  Tensor l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}});
+  Placeholder in_rf(BufHandle(l1.buf()));
  
-  Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}});
+  Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}});
  
    LoopNest loop({l1, l2});
    loop.prepareForCodegen();
@@ -526,14 +526,14 @@ TEST(Reductions, ReduceAsProducer) {
    Placeholder a(BufHandle("a", {2, 3}, kFloat));
    Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
  
-  Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}});
-  Tensor* d = Compute(
+  Tensor c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}});
+  Tensor d = Compute(
        "scale",
        {{2, "l2"}, {3, "n1"}},
        [&](const VarHandle& l, const VarHandle& n) {
-        return c->load(l, n) * a.load(l, n);
+        return c.load(l, n) * a.load(l, n);
        });
-  LoopNest loop(std::vector<Tensor*>({d}), {c, d});
+  LoopNest loop({d}, {c, d});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
    s = IRSimplifier::simplify(s);
@@ -571,14 +571,14 @@ TEST(Reductions, ReduceAsConsumer) {
    Placeholder a(BufHandle("a", {2, 3, m}, kFloat));
    Placeholder b(BufHandle("b", {2, 3, m}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{2, "l2"}, {3, "n1"}, {m, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
-  LoopNest loop(std::vector<Tensor*>({d}), {c, d});
+  Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
+  LoopNest loop({d}, {c, d});
    loop.prepareForCodegen();
    StmtPtr s = loop.root_stmt();
    s = IRSimplifier::simplify(s);
@@ -626,7 +626,7 @@ TEST(Reductions, SplitReduceAxis) {
    }
    std::vector<float> out(16, -1.f);
  
-  Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
    LoopNest l({tensor});
    std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor);
    LoopNest::splitWithTail(loops[1], 2);
@@ -656,7 +656,7 @@ TEST(Reductions, SplitNonReduceAxis) {
      }
    }
    std::vector<float> out(16, -1.f);
-  Tensor* tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}});
    LoopNest l({tensor});
    std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor);
    LoopNest::splitWithTail(loops[0], 2);
@@ -687,14 +687,14 @@ TEST(Reductions, ReorderedReductionInitializer) {
    Placeholder in(BufHandle("in", {1, 12, 6}, kFloat));
    std::vector<float> in_(12 * 6, 1.f);
  
-  Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
+  Tensor tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
    LoopNest l_({tensor_});
  
    l_.prepareForCodegen();
    StmtPtr s_ = Stmt::clone(l_.root_stmt());
    s_ = IRSimplifier::simplify(s_);
  
-  Tensor* tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
+  Tensor tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}});
    LoopNest l({tensor});
  
    auto loops = l.getLoopStmtsFor(tensor);
@@ -741,10 +741,10 @@ TEST(Reductions, ReduceRfactor) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}});
    LoopNest loop({c});
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
    ASSERT_TRUE(loop.rfactor(c_body, loops.at(0)));
    auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
    ASSERT_EQ(rc.size(), 2);
@@ -776,10 +776,10 @@ TEST(Reductions, Reduce3DRfactorInner) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
    LoopNest loop({c});
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
    ASSERT_FALSE(loop.rfactor(c_body, loops.at(2)));
    auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
    ASSERT_EQ(rc.size(), 1);
@@ -811,10 +811,10 @@ TEST(Reductions, Reduce3DRfactorOuter) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}});
    LoopNest loop({c});
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
    ASSERT_TRUE(loop.rfactor(c_body, loops.at(0)));
    auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt());
    ASSERT_EQ(rc.size(), 2);
@@ -837,7 +837,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) {
    std::vector<float> out(1, -1.f);
    std::vector<float> ref(1, -1.f);
  
-  Tensor* c = Reduce(
+  Tensor c = Reduce(
        "sum",
        {},
        Sum(),
@@ -854,7 +854,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) {
          IRSimplifier::simplify(refloop.root_stmt()), {in_, c});
      ref_cg.call({in, ref});
  
-    BufPtr tmp_buf = c->buf();
+    BufPtr tmp_buf = c.buf();
  
      for (int idx = 0; idx < rfac_number; idx++) {
        auto reduce = loop.getAllWritesToBuf(tmp_buf)[1];
@@ -890,7 +890,7 @@ TEST(Reductions, ReduceSplitTail) {
    for (int i = 0; i < 3; ++i) {
      std::vector<float> out(M, -1.f);
  
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
      LoopNest loop({c});
      std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
      LoopNest::splitWithTail(loops[i], 8);
@@ -922,7 +922,7 @@ TEST(Reductions, ReduceSplitNoTail) {
    for (int i = 0; i < 3; ++i) {
      std::vector<float> out(M, -1.f);
  
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
      LoopNest loop({c});
      std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
      LoopNest::splitWithTail(loops[i], 5);
@@ -956,7 +956,7 @@ TEST(Reductions, ReduceOverSplitTail) {
    for (int i = 0; i < 3; ++i) {
      std::vector<float> out(M, -1.f);
  
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
      LoopNest loop({c});
      std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
      LoopNest::splitWithTail(loops[i], 16);
@@ -989,7 +989,7 @@ TEST(Reductions, ReduceSplitMask) {
    for (int i = 0; i < 3; ++i) {
      std::vector<float> out(M, -1.f);
  
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
      LoopNest loop({c});
      std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
      LoopNest::splitWithMask(loops[i], 8);
@@ -1021,7 +1021,7 @@ TEST(Reductions, ReduceSplitNoMask) {
    for (int i = 0; i < 3; ++i) {
      std::vector<float> out(M, -1.f);
  
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
      LoopNest loop({c});
      std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
      LoopNest::splitWithMask(loops[i], 5);
@@ -1054,7 +1054,7 @@ TEST(Reductions, ReduceOverSplitMask) {
    for (int i = 0; i < 3; ++i) {
      std::vector<float> out(M, -1.f);
  
-    Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+    Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
      LoopNest loop({c});
      std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
      LoopNest::splitWithMask(loops[i], 16);
@@ -1090,16 +1090,16 @@ TEST(Reductions, ReduceSplitRfactor) {
  
    std::vector<float> out(M, -1.f);
  
-  Tensor* c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
+  Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}});
    LoopNest loop({c});
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
    LoopNest::splitWithTail(loops[2], SPLIT_FACTOR);
  
-  auto c_body = loop.getAllWritesToBuf(c->buf())[2];
-  auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf());
+  auto c_body = loop.getAllWritesToBuf(c.buf())[2];
+  auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf());
    ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3);
    LoopNest::reorderAxis(all_loops[2][1], all_loops[2][2]);
-  all_loops = loop.getAllLoopNestsWritingToBuf(c->buf());
+  all_loops = loop.getAllLoopNestsWritingToBuf(c.buf());
    ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3);
    ASSERT_TRUE(loop.rfactor(c_body, all_loops[2][1]));
    loop.prepareForCodegen();
@@ -1131,7 +1131,7 @@ TEST(Reductions, ReduceOverSplitRfactor) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}});
    LoopNest loop({c});
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -1139,9 +1139,9 @@ TEST(Reductions, ReduceOverSplitRfactor) {
    LoopNest::splitWithTail(loops[1], SPLIT_FACTOR, &i, &t);
    LoopNest::reorderAxis(loops[0], i);
  
-  auto all_loops = loop.getAllLoopNestsWritingToBuf(c->buf());
+  auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf());
    ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(1).size() == 3);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
    ASSERT_TRUE(loop.rfactor(c_body, all_loops[1][0]));
    LoopNest::reorderAxis(all_loops[1][0], all_loops[1][2]);
  
@@ -1182,9 +1182,9 @@ TEST(Reductions, ReduceInlineReduction) {
    Placeholder a_buf("a", kFloat, {M});
    Placeholder b_buf("b", kFloat, {M, N, K});
  
-  Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}});
-  Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) {
-    return a_buf.load(m) + x->load(m);
+  Tensor x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}});
+  Tensor y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) {
+    return a_buf.load(m) + x.load(m);
    });
  
    PaddedBuffer<float> a_v(M);
@@ -1201,9 +1201,9 @@ TEST(Reductions, ReduceInlineReduction) {
      }
    }
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
    // Cannot inline a reduction computation
-  ASSERT_FALSE(l1.computeInline(x->buf()));
+  ASSERT_FALSE(l1.computeInline(x.buf()));
  }
  
  TEST(Reductions, ReduceInlineConsumer) {
@@ -1215,13 +1215,13 @@ TEST(Reductions, ReduceInlineConsumer) {
    Placeholder a_buf("a", kFloat, {M, N, K});
    Placeholder b_buf("b", kFloat, {M, N, K});
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
          return a_buf.load(m, n, k) + b_buf.load(m, n, k);
        });
-  Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}});
+  Tensor y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}});
  
    PaddedBuffer<float> a_v(M, N, K);
    PaddedBuffer<float> b_v(M, N, K);
@@ -1235,9 +1235,9 @@ TEST(Reductions, ReduceInlineConsumer) {
      }
    }
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
    LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
  
    l1.prepareForCodegen();
    l2.prepareForCodegen();
@@ -1269,7 +1269,7 @@ TEST(Reductions, ReduceInlineReducerInternal) {
    Placeholder a_buf("a", kFloat, {M, N, K});
    Placeholder b_buf("b", kFloat, {M, N, K});
  
-  Tensor* x = Compute(
+  Tensor x = Compute(
        "x",
        {{M, "m1"}, {N, "n1"}, {K, "k1"}},
        [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
@@ -1279,7 +1279,7 @@ TEST(Reductions, ReduceInlineReducerInternal) {
    Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) {
      return Add::make(ExprHandle(1.f), Min::make(a, b, false));
    });
-  Tensor* y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}});
+  Tensor y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}});
  
    PaddedBuffer<float> a_v(M, N, K);
    PaddedBuffer<float> b_v(M, N, K);
@@ -1293,9 +1293,9 @@ TEST(Reductions, ReduceInlineReducerInternal) {
      }
    }
  
-  LoopNest l1(std::vector<Tensor*>({y}), {x, y});
+  LoopNest l1({y}, {x, y});
    LoopNest l2(l1);
-  l2.computeInline(x->buf());
+  l2.computeInline(x.buf());
  
    l1.prepareForCodegen();
    l2.prepareForCodegen();
@@ -1328,25 +1328,25 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) {
    Placeholder a(BufHandle("a", {L, N, M}, kFloat));
    Placeholder b(BufHandle("b", {L, N, M}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{L, "l2"}, {N, "n1"}, {M, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
+  Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
  
-  Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
    LoopNest l_before(l);
    l_before.prepareForCodegen();
    SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
  
    StmtPtr d_loop = l.getLoopStmtsFor(d)[0];
-  l.cacheAccesses(d->buf(), "d_local", d_loop);
+  l.cacheAccesses(d.buf(), "d_local", d_loop);
    l.prepareForCodegen();
  
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1405,25 +1405,25 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) {
    Placeholder a(BufHandle("a", {L, N, M}, kFloat));
    Placeholder b(BufHandle("b", {L, N, M}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{L, "l2"}, {N, "n1"}, {M, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
+  Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
  
-  Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
    LoopNest l_before(l);
    l_before.prepareForCodegen();
    SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
  
    StmtPtr d_loop = l.getLoopStmtsFor(d)[1];
-  l.cacheAccesses(d->buf(), "d_local", d_loop);
+  l.cacheAccesses(d.buf(), "d_local", d_loop);
    l.prepareForCodegen();
  
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1480,25 +1480,25 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) {
    Placeholder a(BufHandle("a", {L, N, M}, kFloat));
    Placeholder b(BufHandle("b", {L, N, M}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{L, "l2"}, {N, "n1"}, {M, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
+  Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}});
  
-  Tensor* e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
    LoopNest l_before(l);
    l_before.prepareForCodegen();
    SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
  
    StmtPtr d_loop = l.getLoopStmtsFor(d)[2];
-  l.cacheAccesses(d->buf(), "d_local", d_loop);
+  l.cacheAccesses(d.buf(), "d_local", d_loop);
    l.prepareForCodegen();
  
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1551,22 +1551,22 @@ TEST(Reductions, ReductionCacheBodyAccess) {
    Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
    Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{24, "l2"}, {32, "n1"}, {12, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
  
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
  
    StmtPtr d_loop = l.getLoopStmtsFor(d)[1];
-  l.cacheAccesses(c->buf(), "scale_local", d_loop);
+  l.cacheAccesses(c.buf(), "scale_local", d_loop);
  
    l.prepareForCodegen();
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1592,24 +1592,24 @@ TEST(Reductions, ReductionCacheConsumerAccess) {
    Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
    Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{24, "l2"}, {32, "n1"}, {12, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
  
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
  
    LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4);
  
    StmtPtr e_loop = l.getLoopStmtsFor(e)[1];
-  l.cacheAccesses(d->buf(), "sum_local", e_loop);
+  l.cacheAccesses(d.buf(), "sum_local", e_loop);
    l.prepareForCodegen();
  
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1633,19 +1633,19 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
    Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
    Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{24, "l2"}, {32, "n1"}, {12, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
  
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr inner;
@@ -1656,7 +1656,7 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) {
    // Split reduction consumer.
    LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner);
  
-  l.cacheAccesses(d->buf(), "sum_local", inner);
+  l.cacheAccesses(d.buf(), "sum_local", inner);
    l.prepareForCodegen();
  
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1681,19 +1681,19 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
    Placeholder a(BufHandle("a", {24, 32, 12}, kFloat));
    Placeholder b(BufHandle("b", {24, 32, 12}, kFloat));
  
-  Tensor* c = Compute(
+  Tensor c = Compute(
        "scale",
        {{24, "l2"}, {32, "n1"}, {12, "m1"}},
        [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
          return b.load(l, n, m) * a.load(l, n, m);
        });
-  Tensor* d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
+  Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}});
  
-  Tensor* e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
-    return b.load(0, 0, l) * d->load(l);
+  Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) {
+    return b.load(0, 0, l) * d.load(l);
    });
  
-  LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
+  LoopNest l({e}, {c, d, e});
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ForPtr inner;
@@ -1705,7 +1705,7 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) {
    // Split reduction consumer.
    LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner);
  
-  l.cacheAccesses(d->buf(), "sum_local", inner);
+  l.cacheAccesses(d.buf(), "sum_local", inner);
    l.prepareForCodegen();
  
    StmtPtr result = IRSimplifier::simplify(l.root_stmt());
@@ -1742,13 +1742,13 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
    LoopNest loop({c});
  
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
    LoopNest::reorderAxis(loops.at(0), loops.at(1));
    loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    BufPtr rfac_buf;
    ASSERT_TRUE(loop.rfactor(c_body, loops.at(0), &rfac_buf));
@@ -1811,10 +1811,10 @@ TEST(Reductions, ReductionRfactorCacheTempInner) {
  
    std::vector<float> out(1, -1.f);
  
-  Tensor* c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
+  Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}});
    LoopNest loop({c});
    std::vector<ForPtr> loops = loop.getLoopStmtsFor(c);
-  auto c_body = loop.getAllWritesToBuf(c->buf())[1];
+  auto c_body = loop.getAllWritesToBuf(c.buf())[1];
  
    LoopNest::reorderAxis(loops.at(0), loops.at(1));
    loops = loop.getLoopStmtsFor(c);
@@ -1871,7 +1871,7 @@ TEST(Reductions, ReductionVectorize) {
  
    Placeholder in(BufHandle("in", {8, 8}, kFloat));
  
-  Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
    LoopNest l_before({tensor});
    LoopNest l(l_before);
    l_before.prepareForCodegen();
@@ -1909,7 +1909,7 @@ TEST(Reductions, ReductionVectorizeInner) {
  
    Placeholder in(BufHandle("in", {8, 8}, kFloat));
  
-  Tensor* tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
+  Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}});
    LoopNest l({tensor});
  
    ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1]));
@@ -1929,7 +1929,7 @@ TEST(Reductions, ReductionVectorizeRfactor) {
  
    Placeholder in(BufHandle("in", {8, 8}, kFloat));
  
-  Tensor* tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}});
+  Tensor tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}});
  
    LoopNest l_before({tensor});
    LoopNest l(l_before);
@@ -1944,7 +1944,7 @@ TEST(Reductions, ReductionVectorizeRfactor) {
    std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor);
    LoopNest::reorderAxis(loops[0], loops[1]);
    loops = l.getLoopStmtsFor(tensor);
-  auto tensor_body = l.getAllWritesToBuf(tensor->buf())[1];
+  auto tensor_body = l.getAllWritesToBuf(tensor.buf())[1];
    BufPtr rfac_buf = nullptr;
    ASSERT_TRUE(LoopNest::rfactor(tensor_body, loops.at(0), &rfac_buf));
  
@@ -1988,7 +1988,7 @@ TEST(Reductions, InitFunction) {
    constexpr int N = 16;
    Placeholder A("A", kFloat, {M, N});
    Placeholder B("B", kFloat, {N});
-  Tensor* C = Reduce(
+  Tensor C = Reduce(
        "C",
        {{N, "n"}},
        Sum(),
diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp

index c25ae4f..0df9e92 100644 (file)
--- a/test/cpp/tensorexpr/test_simplify.cpp
+++ b/test/cpp/tensorexpr/test_simplify.cpp
@@ -3932,7 +3932,7 @@ TEST(Simplify, SimplifyForCleansUp) {
    {
      Placeholder a("a", kFloat, {1, 12, 1});
      VarHandle x("x", kInt);
-    Tensor* b = Compute(
+    Tensor b = Compute(
          // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
          "x",
          {{1, "i"}, {12, "m"}, {1, "n"}},
diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp

index 5a6f257..0f0277e 100644 (file)
--- a/test/cpp/tensorexpr/tutorial.cpp
+++ b/test/cpp/tensorexpr/tutorial.cpp
@@ -155,8 +155,8 @@ int main(int argc, char* argv[]) {
      ExprPtr body = alloc<Mul>(i, j);
  
      // Finally, we pass all these pieces together to Tensor constructor:
-    Tensor* X = new Tensor(buf, args, body);
-    std::cout << "Tensor computation: " << *X << std::endl;
+    Tensor X = Tensor(buf, args, body);
+    std::cout << "Tensor computation: " << X << std::endl;
      // Prints:
      // Tensor computation: Tensor X[64, 32]:
      // for (int i = 0; i < 64; i++) {
@@ -171,11 +171,11 @@ int main(int argc, char* argv[]) {
      // constructing Exprs, Tensors also have a more convenient API for
      // construction. It is based on Compute API, which takes a name,
      // dimensions, and a lambda specifying the computation body:
-    Tensor* Z = Compute(
+    Tensor Z = Compute(
          "Z",
          {{64, "i"}, {32, "j"}},
          [](const VarHandle& i, const VarHandle& j) { return i / j; });
-    std::cout << "Tensor computation: " << *Z << std::endl;
+    std::cout << "Tensor computation: " << Z << std::endl;
      // Prints:
      // Tensor computation: Tensor Z[64, 32]:
      // for (int i = 0; i < 64; i++) {
@@ -187,13 +187,13 @@ int main(int argc, char* argv[]) {
      // Tensors might access other tensors and external placeholders in their
      // expressions. It can be done like so:
      Placeholder P("P", kInt, {64, 32});
-    Tensor* R = Compute(
+    Tensor R = Compute(
          "R",
          {{64, "i"}, {32, "j"}},
          [&](const VarHandle& i, const VarHandle& j) {
-          return Z->load(i, j) * P.load(i, j);
+          return Z.load(i, j) * P.load(i, j);
          });
-    std::cout << "Tensor computation: " << *R << std::endl;
+    std::cout << "Tensor computation: " << R << std::endl;
      // Prints:
      // Tensor computation: Tensor R[64, 32]:
      // for (int i = 0; i < 64; i++) {
@@ -224,20 +224,20 @@ int main(int argc, char* argv[]) {
      // Let's create a simple tensor expression and construct a loop nest for it.
      Placeholder A("A", kFloat, {64, 32});
      Placeholder B("B", kFloat, {64, 32});
-    Tensor* X = Compute(
+    Tensor X = Compute(
          "X",
          {{64, "i"}, {32, "j"}},
          [&](const VarHandle& i, const VarHandle& j) {
            return A.load(i, j) + B.load(i, j);
          });
-    Tensor* Y = Compute(
+    Tensor Y = Compute(
          "Y",
          {{64, "i"}, {32, "j"}},
          [&](const VarHandle& i, const VarHandle& j) {
-          return sigmoid(X->load(i, j));
+          return sigmoid(X.load(i, j));
          });
-    std::cout << "Tensor computation X: " << *X
-              << "Tensor computation Y: " << *Y << std::endl;
+    std::cout << "Tensor computation X: " << X << "Tensor computation Y: " << Y
+              << std::endl;
      // Prints:
      // Tensor computation X: Tensor X[64, 32]:
      // for (int i = 0; i < 64; i++) {
@@ -256,9 +256,7 @@ int main(int argc, char* argv[]) {
      // Creating a loop nest is quite simple: we just need to specify a list of
      // output tensors and a list of all tensors to compute:
      // NOLINTNEXTLINE(bugprone-argument-comment)
-    std::vector<Tensor*> outputs = {Y};
-    std::vector<Tensor*> all = {X, Y};
-    LoopNest loopnest(outputs, all);
+    LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y});
  
      // An IR used in LoopNest is based on tensor statements, represented by
      // `Stmt` class. Statements are used to specify the loop nest structure, and
@@ -357,7 +355,7 @@ int main(int argc, char* argv[]) {
      // Let's start by constructing a simple computation for us to work with:
      Placeholder A("A", kInt, {64, 32});
      Placeholder B("B", kInt, {64, 32});
-    Tensor* X = Compute(
+    Tensor X = Compute(
          "X",
          {{64, "i"}, {32, "j"}},
          [&](const VarHandle& i, const VarHandle& j) {
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp

index 27f6e54..0a34f47 100644 (file)
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -500,7 +500,7 @@ struct TEWrapper {
  
  void optimizePointwise(
      tensorexpr::LoopNest* ln,
-    tensorexpr::Tensor* target,
+    tensorexpr::Tensor target,
      int width) {
    using namespace torch::jit::tensorexpr;
    std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
@@ -513,7 +513,7 @@ void optimizePointwise(
  std::shared_ptr<TEWrapper> wrapTECompute(
      std::shared_ptr<TEWrapper> wrap,
      tensorexpr::Placeholder& in,
-    tensorexpr::Tensor* out,
+    tensorexpr::Tensor out,
      tensorexpr::VarHandle& dim,
      int width = kVectorWidth) {
    using namespace torch::jit::tensorexpr;
@@ -553,7 +553,7 @@ struct TEWrapper {
  std::shared_ptr<TEWrapper> wrapTECompute(
      std::shared_ptr<TEWrapper> wrap,
      tensorexpr::Placeholder& in,
-    tensorexpr::Tensor* out,
+    tensorexpr::Tensor out,
      tensorexpr::VarHandle& dim,
      int width = kVectorWidth) {
    return wrap;
@@ -593,7 +593,7 @@ std::shared_ptr<TEWrapper> createLogit(c10::optional<float> clamp) {
    auto wrap = std::make_shared<TEWrapper>();
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
      auto A_elem = [&]() {
        if (!clamp) {
          return A.load(i);
@@ -619,7 +619,7 @@ std::shared_ptr<TEWrapper> createRelu() {
    wrap = std::make_shared<TEWrapper>();
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
      auto zero = FloatImm::make(0.f);
      auto a = A.load(i);
      return ifThenElse(a < zero, zero, a);
@@ -638,7 +638,7 @@ std::shared_ptr<TEWrapper> createTanh() {
    wrap = std::make_shared<TEWrapper>();
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  tensorexpr::Tensor* B = Compute("B", {N}, [&](const VarHandle& i) {
+  tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
      auto a = A.load(i);
      return fast_tanh(a);
    });
@@ -656,7 +656,7 @@ std::shared_ptr<TEWrapper> createSigmoid() {
    wrap = std::make_shared<TEWrapper>();
    auto N = VarHandle("N", kInt);
    Placeholder A("A", kFloat, {N});
-  Tensor* B =
+  Tensor B =
        Compute("B", {N}, [&](const VarHandle& i) { return sigmoid(A.load(i)); });
    // NNC uses sleef for vectorizing sigmoid, which comes in an 8-wide flavor
    // (Sleef_expf8).
diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h

index 77ba8e1..d7cfe78 100644 (file)
--- a/torch/csrc/jit/tensorexpr/codegen.h
+++ b/torch/csrc/jit/tensorexpr/codegen.h
@@ -104,7 +104,7 @@ class TORCH_API CodeGen {
  class CodeGen::BufferArg {
   public:
    BufferArg(const Placeholder& buffer) : buf_(buffer.data()) {}
-  BufferArg(Tensor* tensor) : buf_(tensor->buf()) {}
+  BufferArg(Tensor tensor) : buf_(tensor.buf()) {}
    BufferArg(const VarHandle& var) : var_(var.node()), isVar_(true) {}
    BufferArg(const BufHandle& buf) : buf_(buf.node()) {}
  
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp

index f885246..27b56e2 100644 (file)
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -545,7 +545,7 @@ std::ostream& operator<<(std::ostream& stream, const Stmt& stmt) {
  }
  
  std::ostream& operator<<(std::ostream& stream, const Tensor& t) {
-  stream << std::to_string(&t);
+  stream << std::to_string(t);
    return stream;
  }
  
@@ -568,7 +568,7 @@ void print(StmtPtr stmt) {
    }
  }
  
-void print(const Tensor* t) {
+void print(const Tensor& t) {
    std::cout << std::to_string(t);
  }
  
@@ -589,20 +589,17 @@ std::string to_string(StmtPtr stmt) {
    return oss.str();
  }
  
-std::string to_string(const Tensor* t) {
-  if (!t) {
-    return "(null tensor)\n";
-  }
+std::string to_string(const Tensor& t) {
    std::ostringstream oss;
    // TODO: move this to Buf printer
-  oss << "Tensor " << t->buf()->name_hint() << "[";
-  for (const auto i : c10::irange(t->buf()->ndim())) {
+  oss << "Tensor " << t.buf()->name_hint() << "[";
+  for (const auto i : c10::irange(t.buf()->ndim())) {
      if (i != 0) {
        oss << ", ";
      }
-    oss << *t->buf()->dim(i);
+    oss << *t.buf()->dim(i);
    }
-  oss << "]:\n" << *t->stmt() << "\n";
+  oss << "]:\n" << *t.stmt() << "\n";
    return oss.str();
  }
  } // namespace std
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.h b/torch/csrc/jit/tensorexpr/ir_printer.h

index e76dcca..321d1ef 100644 (file)
--- a/torch/csrc/jit/tensorexpr/ir_printer.h
+++ b/torch/csrc/jit/tensorexpr/ir_printer.h
@@ -103,7 +103,7 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Tensor&);
  
  TORCH_API void print(ExprPtr expr);
  TORCH_API void print(StmtPtr stmt);
-TORCH_API void print(const Tensor* t);
+TORCH_API void print(const Tensor& t);
  
  } // namespace tensorexpr
  } // namespace jit
@@ -119,5 +119,5 @@ using torch::jit::tensorexpr::Tensor;
  
  TORCH_API std::string to_string(ExprPtr expr);
  TORCH_API std::string to_string(StmtPtr stmt);
-TORCH_API std::string to_string(const Tensor* t);
+TORCH_API std::string to_string(const Tensor& t);
  } // namespace std
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp

index faacd02..8076ba2 100644 (file)
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -960,7 +960,7 @@ std::vector<ExprHandle> TensorExprKernel::broadcastShapesMut(
    return res.first;
  }
  
-Tensor* computeOneOperand(
+Tensor computeOneOperand(
      const std::string& name,
      const std::vector<ArgValue>& inputValues,
      const std::vector<ExprHandle>& outputShape,
@@ -981,7 +981,7 @@ Tensor* computeOneOperand(
        });
  }
  
-Tensor* computeTwoOperand(
+Tensor computeTwoOperand(
      const std::string& name,
      const std::vector<ArgValue>& inputValues,
      const std::vector<ExprHandle>& outputShape,
@@ -1004,7 +1004,7 @@ Tensor* computeTwoOperand(
        });
  }
  
-Tensor* computeTwoOperandWithAlpha(
+Tensor computeTwoOperandWithAlpha(
      const std::string& name,
      const std::vector<ArgValue>& inputValues,
      const std::vector<ExprHandle>& outputShape,
@@ -1028,7 +1028,7 @@ Tensor* computeTwoOperandWithAlpha(
        });
  }
  
-Tensor* computeConditionWithTwoOperand(
+Tensor computeConditionWithTwoOperand(
      const std::string& name,
      const std::vector<ArgValue>& inputValues,
      const std::vector<ExprHandle>& outputShape,
@@ -1055,7 +1055,7 @@ Tensor* computeConditionWithTwoOperand(
        });
  }
  
-Tensor* computeThreeOperand(
+Tensor computeThreeOperand(
      const std::string& name,
      const std::vector<ArgValue>& inputValues,
      const std::vector<ExprHandle>& outputShape,
@@ -1083,7 +1083,7 @@ Tensor* computeThreeOperand(
          return demoteOutput(compute, outputType);
        });
  }
-Tensor* computeFourOperand(
+Tensor computeFourOperand(
      const std::string& name,
      const std::vector<ArgValue>& inputValues,
      const std::vector<ExprHandle>& outputShape,
@@ -1135,7 +1135,7 @@ std::pair<ScalarType, std::vector<BufHandle>> processCatList(
    }
    return {highType, nonEmptyInputs};
  }
-Tensor* computeCatWoConditionals(
+Tensor computeCatWoConditionals(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape) {
    // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
@@ -1164,7 +1164,7 @@ Tensor* computeCatWoConditionals(
    auto output_buf =
        alloc<Buf>("aten_cat", output_sizes_expr, ToDtype(high_type));
    if (non_empty_inputs.size() == 0) {
-    return new Tensor(
+    return Tensor(
          output_buf, alloc<tensorexpr::Block>(std::vector<StmtPtr>({})));
    }
  
@@ -1213,10 +1213,10 @@ Tensor* computeCatWoConditionals(
      concat_dim_size =
          alloc<Add>(concat_dim_size, input_dims[norm_concat_dim].node());
    }
-  return new Tensor(output_buf, IRSimplifier::simplify(block));
+  return Tensor(output_buf, IRSimplifier::simplify(block));
  }
  
-Tensor* computeCat(
+Tensor computeCat(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      at::Device device) {
@@ -1276,7 +1276,7 @@ Tensor* computeCat(
        });
  }
  
-Tensor* computeConv2d(
+Tensor computeConv2d(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType) {
@@ -1319,10 +1319,10 @@ Tensor* computeConv2d(
         dilation[0],
         dilation[1],
         groups});
-  return new Tensor(ResultBuf.node(), s);
+  return Tensor(ResultBuf.node(), s);
  }
  
-Tensor* tensorexpr::computeOperandValue(
+Tensor tensorexpr::computeOperandValue(
      c10::Symbol op,
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
@@ -2391,7 +2391,7 @@ c10::optional<ScalarType> findDtypeForValue(const torch::jit::Value* v) {
    return c10::nullopt;
  }
  
-Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) {
+Tensor TensorExprKernel::computeValue(const torch::jit::Value* v) {
    auto inputs = v->node()->inputs();
    auto op = v->node()->kind();
  
@@ -2703,9 +2703,9 @@ static std::vector<ExprHandle> toExprHandles(const std::vector<T>& sizes) {
    return dims;
  }
  
-Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) {
+Tensor TensorExprKernel::bindInput(const torch::jit::Value* input) {
    auto const& t = input->type();
-  Tensor* result = nullptr;
+  Tensor result(nullptr, nullptr);
    switch (t->kind()) {
      case TypeKind::TensorType: {
        auto tt = input->type()->cast<TensorType>();
@@ -2744,7 +2744,7 @@ Tensor* TensorExprKernel::bindInput(const torch::jit::Value* input) {
              }
              return inBuffer.load(idx);
            });
-      bufs_.emplace(input, result->buf());
+      bufs_.emplace(input, result.buf());
  
        bufferArgs_.emplace_back(inBuffer);
        break;
@@ -2800,7 +2800,7 @@ bool denseAndNonOverlapping(
    return (strides == at::infer_dense_strides(sizes, strides));
  }
  
-Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
+Tensor TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
    const TensorTypePtr& tt = v->type()->expect<TensorType>();
    TORCH_INTERNAL_ASSERT(bufs_.count(v));
    BufPtr buf = bufs_.at(v);
@@ -2816,19 +2816,19 @@ Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) {
    auto sizes = *tt->sizes().concrete_sizes();
    std::vector<int64_t> default_strides = TensorType::contiguousStridesOf(sizes);
    if (!tt->strides().concrete_sizes()) {
-    return new Tensor(buf, nullptr);
+    return Tensor(buf, nullptr);
    }
    TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes());
    const std::vector<int64_t> strides = *tt->strides().concrete_sizes();
    // All Tensors in NNC are laid out in default, contiguous layout.
    // If the output is also default contiguous we don't need to do anything
    if (strides == default_strides) {
-    return new Tensor(buf, nullptr);
+    return Tensor(buf, nullptr);
    }
    // If the tensor is not dense or overlaps, we have
    // no way of matching the profiled striding
    if (!denseAndNonOverlapping(sizes, strides)) {
-    return new Tensor(buf, nullptr);
+    return Tensor(buf, nullptr);
    }
  
    auto dims = c10::fmap<DimArg>(sizesForValue(v));
@@ -2922,8 +2922,9 @@ void TensorExprKernel::compile() {
    nInputs_ = graph_->inputs().size();
    genInputDebugNames();
    for (auto const& input : graph_->inputs()) {
-    if (Tensor* t = bindInput(input)) {
-      block->append_stmt(t->stmt());
+    Tensor t = bindInput(input);
+    if (t.stmt()) {
+      block->append_stmt(t.stmt());
      }
    }
  
@@ -2937,10 +2938,9 @@ void TensorExprKernel::compile() {
      } else {
        for (auto const& output : n->outputs()) {
          if (output->hasUses()) {
-          Tensor* t = computeValue(output);
-          bufs_.emplace(output, t->buf());
-          // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-          block->append_stmt(t->stmt());
+          Tensor t = computeValue(output);
+          bufs_.emplace(output, t.buf());
+          block->append_stmt(t.stmt());
          }
        }
      }
@@ -2958,12 +2958,12 @@ void TensorExprKernel::compile() {
      // The "strided" tensor will be incorrect if used in NNC,
      // since NNC views it as contiguous. Only convert it to the right
      // strides at the end of the kernel (if already contiguous it's a no-op)
-    Tensor* properly_strided_output = convertOutputToCorrectStrides(output);
-    if (properly_strided_output->stmt()) {
-      block->append_stmt(properly_strided_output->stmt());
+    Tensor properly_strided_output = convertOutputToCorrectStrides(output);
+    if (properly_strided_output.stmt()) {
+      block->append_stmt(properly_strided_output.stmt());
      }
      // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
-    bufs_[output] = properly_strided_output->buf();
+    bufs_[output] = properly_strided_output.buf();
      const auto& tt = output->type()->expect<TensorType>();
      auto sizes = *tt->sizes().concrete_sizes();
      tensorOutputSizes_.push_back(sizes);
diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h

index 7b35e1e..a8a57b9 100644 (file)
--- a/torch/csrc/jit/tensorexpr/kernel.h
+++ b/torch/csrc/jit/tensorexpr/kernel.h
@@ -47,7 +47,7 @@ using ArgValue = c10::variant<
      IntList,
      ArgNone>;
  
-using NNCLoweringFunction = std::function<Tensor*(
+using NNCLoweringFunction = std::function<Tensor(
      const std::vector<ArgValue>&,
      const std::vector<ExprHandle>&,
      const c10::optional<ScalarType>&,
@@ -123,7 +123,7 @@ struct TensorInfo {
    c10::ScalarType dtype;
  };
  
-TORCH_API Tensor* computeOperandValue(
+TORCH_API Tensor computeOperandValue(
      c10::Symbol op,
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
@@ -209,7 +209,7 @@ class TORCH_API TensorExprKernel {
        const torch::jit::Value* v,
        const std::vector<ExprHandle>& axes);
  
-  Tensor* computeValue(const torch::jit::Value* v);
+  Tensor computeValue(const torch::jit::Value* v);
  
    void bindConstant(const torch::jit::Value* v);
  
@@ -222,9 +222,9 @@ class TORCH_API TensorExprKernel {
        std::vector<at::Tensor>& outputs);
    BackendType inferBackendTypeFromDevice(at::Device device);
  
-  Tensor* bindInput(const torch::jit::Value* input);
+  Tensor bindInput(const torch::jit::Value* input);
  
-  Tensor* convertOutputToCorrectStrides(torch::jit::Value* v);
+  Tensor convertOutputToCorrectStrides(torch::jit::Value* v);
  
    // Captures the information for reduction operation nodes.
    struct ReductionInfo {
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp

index d9d2073..1904999 100644 (file)
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -47,14 +47,14 @@ LoopNest::LoopNest(StmtPtr stmt, std::unordered_set<BufPtr> output_bufs)
  
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  LoopNest::LoopNest(
-    const std::vector<Tensor*>& output_tensors,
-    const std::vector<Tensor*>& tensors_to_compute) {
+    const std::vector<Tensor>& output_tensors,
+    const std::vector<Tensor>& tensors_to_compute) {
    initialize(output_tensors, tensors_to_compute);
    verify(root_stmt_);
  }
  
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-LoopNest::LoopNest(const std::vector<Tensor*>& output_tensors) {
+LoopNest::LoopNest(const std::vector<Tensor>& output_tensors) {
    initialize(output_tensors, output_tensors);
    verify(root_stmt_);
  }
@@ -486,15 +486,15 @@ bool LoopNest::vectorize(ForPtr f) {
  }
  
  void LoopNest::initialize(
-    const std::vector<Tensor*>& output_tensors,
-    const std::vector<Tensor*>& tensors_to_compute) {
+    const std::vector<Tensor>& output_tensors,
+    const std::vector<Tensor>& tensors_to_compute) {
    for (auto t : output_tensors) {
-    output_bufs_.insert(t->buf());
+    output_bufs_.insert(t.buf());
    }
  
    std::vector<StmtPtr> loops;
-  for (Tensor* t : tensors_to_compute) {
-    StmtPtr loop = t->stmt();
+  for (Tensor t : tensors_to_compute) {
+    StmtPtr loop = t.stmt();
      if (loop->get_parent()) {
        std::cerr << "Error: creating a loopnest from already used Tensors\n";
        loops = {};
@@ -2384,7 +2384,7 @@ void LoopNest::compressAllBuffers(StmtPtr stmt) {
    }
  }
  
-std::vector<ForPtr> LoopNest::getLoopStmtsFor(Tensor* t) const {
+std::vector<ForPtr> LoopNest::getLoopStmtsFor(Tensor t) const {
    StmtPtr cur_stmt = getLoopBodyFor(t);
    return getLoopStmtsFor(cur_stmt);
  }
@@ -2407,8 +2407,8 @@ std::vector<ForPtr> LoopNest::getLoopStmtsFor(StmtPtr s) const {
    return result;
  }
  
-StmtPtr LoopNest::getLoopBodyFor(Tensor* t) const {
-  return getLoopBodyFor(t->buf());
+StmtPtr LoopNest::getLoopBodyFor(Tensor t) const {
+  return getLoopBodyFor(t.buf());
  }
  
  StmtPtr LoopNest::getLoopBodyFor(BufPtr buf) const {
diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h

index c8cf2d8..42f072d 100644 (file)
--- a/torch/csrc/jit/tensorexpr/loopnest.h
+++ b/torch/csrc/jit/tensorexpr/loopnest.h
@@ -27,11 +27,11 @@ class TORCH_API LoopNest {
   public:
    // A constructor for building a LoopNest from a list of Tensors
    LoopNest(
-      const std::vector<Tensor*>& output_tensors,
-      const std::vector<Tensor*>& tensors_to_compute);
+      const std::vector<Tensor>& output_tensors,
+      const std::vector<Tensor>& tensors_to_compute);
  
    // A convenience constructor for the case when all tensors are output tensors
-  LoopNest(const std::vector<Tensor*>& output_tensors);
+  LoopNest(const std::vector<Tensor>& output_tensors);
  
    // A constructor for building a LoopNest from an Stmt and a list of output
    // buffers.
@@ -45,10 +45,10 @@ class TORCH_API LoopNest {
      return root_stmt_;
    }
  
-  std::vector<ForPtr> getLoopStmtsFor(Tensor*) const;
+  std::vector<ForPtr> getLoopStmtsFor(Tensor) const;
    std::vector<ForPtr> getLoopStmtsFor(BufPtr) const;
    std::vector<ForPtr> getLoopStmtsFor(StmtPtr) const;
-  StmtPtr getLoopBodyFor(Tensor*) const;
+  StmtPtr getLoopBodyFor(Tensor) const;
    StmtPtr getLoopBodyFor(BufPtr) const;
  
    // Returns the For stmt indexed by 'indices' in the 'root' For stmt.
@@ -547,8 +547,8 @@ class TORCH_API LoopNest {
  
   private:
    void initialize(
-      const std::vector<Tensor*>& output_tensors,
-      const std::vector<Tensor*>& tensors_to_compute);
+      const std::vector<Tensor>& output_tensors,
+      const std::vector<Tensor>& tensors_to_compute);
    StmtPtr insertAllocFree(StmtPtr stmt);
    const std::unordered_set<BufPtr> getIntermediateBufs() const;
  
diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp

index c4af83a..51d323f 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/conv2d.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.cpp
@@ -16,7 +16,7 @@ void assert_dims_constant(const BufHandle& buf) {
  
  using InitFunc = std::function<ExprHandle(const std::vector<VarHandle>&)>;
  
-Tensor* conv2d_depthwise_static(
+Tensor conv2d_depthwise_static(
      BufHandle input,
      BufHandle weight,
      const InitFunc& init_func,
@@ -45,7 +45,7 @@ Tensor* conv2d_depthwise_static(
    auto OH = (H - R + 2 * pad) / stride + 1;
    auto OW = (W - S + 2 * pad) / stride + 1;
  
-  Tensor* conv = Reduce(
+  Tensor conv = Reduce(
        "conv2d_depthwise",
        {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}},
        Sum(),
@@ -83,7 +83,7 @@ Tensor* conv2d_depthwise_static(
    } else if (R == 3 && stride == 1 && pad == 1) {
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      ForPtr main, peeled;
-    auto loops = nest.getAllLoopNestsWritingToBuf(conv->buf());
+    auto loops = nest.getAllLoopNestsWritingToBuf(conv.buf());
      main = loops[1][kLoopW];
      nest.sliceHead(main, 1, &peeled, &main);
      nest.sliceTail(main, 1, &main, &peeled);
@@ -92,10 +92,10 @@ Tensor* conv2d_depthwise_static(
      nest.sliceTail(main, 1, &main, &peeled);
    }
  
-  return new Tensor(conv->buf(), nest.root_stmt());
+  return Tensor(conv.buf(), nest.root_stmt());
  }
  
-Tensor* conv2d_depthwise_dynamic(
+Tensor conv2d_depthwise_dynamic(
      BufHandle input,
      BufHandle weight,
      const InitFunc& init_func,
@@ -144,7 +144,7 @@ Tensor* conv2d_depthwise_dynamic(
  
  } // namespace
  
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      BufHandle bias,
@@ -158,7 +158,7 @@ Tensor* conv2d_depthwise(
    return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups);
  }
  
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      int stride,
@@ -170,7 +170,7 @@ Tensor* conv2d_depthwise(
    return conv2d_depthwise_static(input, weight, init_func, stride, pad, groups);
  }
  
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      BufHandle bias,
@@ -206,7 +206,7 @@ Tensor* conv2d_depthwise(
        groups);
  }
  
-Tensor* conv2d_depthwise(
+Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      ExprHandle N,
diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h

index 14612fb..4c2215b 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/conv2d.h
+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h
@@ -7,7 +7,7 @@ namespace jit {
  namespace tensorexpr {
  
  // An API to compute 2D depthwise convolutions with bias.
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      BufHandle bias,
@@ -16,14 +16,14 @@ TORCH_API Tensor* conv2d_depthwise(
      int groups);
  
  // An API to compute 2D depthwise convolutions without bias.
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      int stride,
      int pad,
      int groups);
  
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      BufHandle bias,
@@ -39,7 +39,7 @@ TORCH_API Tensor* conv2d_depthwise(
      ExprHandle pad,
      ExprHandle groups);
  
-TORCH_API Tensor* conv2d_depthwise(
+TORCH_API Tensor conv2d_depthwise(
      BufHandle input,
      BufHandle weight,
      ExprHandle N,
diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.cpp b/torch/csrc/jit/tensorexpr/operators/matmul.cpp

index 23cb455..581514c 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/matmul.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/matmul.cpp
@@ -5,7 +5,7 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-Tensor* computeMatmul(
+Tensor computeMatmul(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType) {
@@ -44,13 +44,13 @@ Tensor* computeMatmul(
          },
          {{size_a[1], "K"}});
    } else {
-    return new Tensor(
+    return Tensor(
          ResultBuf.node(),
          ExternalCall::make(ResultBuf, "nnc_aten_matmul", {a, b}, {}));
    }
  }
  
-Tensor* computeAddMM(
+Tensor computeAddMM(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType) {
@@ -59,7 +59,7 @@ Tensor* computeAddMM(
      dtype = Dtype(*outputType);
    }
    BufHandle ResultBuf("addmm", outputShape, dtype);
-  return new Tensor(
+  return Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h

index 35b30f4..0b52ad6 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/matmul.h
+++ b/torch/csrc/jit/tensorexpr/operators/matmul.h
@@ -6,11 +6,11 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-Tensor* computeMatmul(
+Tensor computeMatmul(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType);
-Tensor* computeAddMM(
+Tensor computeAddMM(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType);
diff --git a/torch/csrc/jit/tensorexpr/operators/norm.cpp b/torch/csrc/jit/tensorexpr/operators/norm.cpp

index d96ebcd..610f928 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/norm.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/norm.cpp
@@ -4,7 +4,7 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-Tensor* computeBatchNorm(
+Tensor computeBatchNorm(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType) {
diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h

index 98d53b4..7f1412f 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/norm.h
+++ b/torch/csrc/jit/tensorexpr/operators/norm.h
@@ -6,7 +6,7 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-Tensor* computeBatchNorm(
+Tensor computeBatchNorm(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType);
diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.cpp b/torch/csrc/jit/tensorexpr/operators/reduction.cpp

index c1f3f7f..fe5cb6d 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/reduction.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/reduction.cpp
@@ -19,7 +19,7 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-Tensor* computeSum(
+Tensor computeSum(
      const std::vector<ArgValue>& inputs,
      const c10::optional<ScalarType>& outputType) {
    std::vector<size_t> axes;
@@ -100,7 +100,7 @@ Tensor* computeSum(
        reductionDims);
  }
  
-Tensor* computeMean(
+Tensor computeMean(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType) {
@@ -120,13 +120,13 @@ Tensor* computeMean(
        mean_dims_expr.emplace_back(idx);
      }
    }
-  return new Tensor(
+  return Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf, "nnc_aten_mean", {InputBuf}, mean_dims_expr));
  }
  
-Tensor* computeAdaptiveAvgPool2d(
+Tensor computeAdaptiveAvgPool2d(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType) {
@@ -137,7 +137,7 @@ Tensor* computeAdaptiveAvgPool2d(
    BufHandle ResultBuf("adaptive_avgpool2d", outputShape, dtype);
    // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
    auto out_size_param = c10::get<IntList>(inputs[1]);
-  return new Tensor(
+  return Tensor(
        ResultBuf.node(),
        ExternalCall::make(
            ResultBuf,
diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h

index 4335d7b..d76bac6 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/reduction.h
+++ b/torch/csrc/jit/tensorexpr/operators/reduction.h
@@ -6,14 +6,14 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-TORCH_API Tensor* computeSum(
+TORCH_API Tensor computeSum(
      const std::vector<ArgValue>& inputs,
      const c10::optional<ScalarType>& outputType);
-TORCH_API Tensor* computeMean(
+TORCH_API Tensor computeMean(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType);
-TORCH_API Tensor* computeAdaptiveAvgPool2d(
+TORCH_API Tensor computeAdaptiveAvgPool2d(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      const c10::optional<ScalarType>& outputType);
diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.cpp b/torch/csrc/jit/tensorexpr/operators/softmax.cpp

index d6cb6c0..c1c2872 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/softmax.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/softmax.cpp
@@ -6,7 +6,7 @@ namespace tensorexpr {
  
  using namespace torch::jit::tensorexpr;
  
-Tensor* computeSoftmax(
+Tensor computeSoftmax(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      bool log_softmax) {
@@ -111,48 +111,43 @@ Tensor* computeSoftmax(
        Compute("aten_softmax_exp", output_dims, [&](ParameterList& indices) {
          auto inp = tensorOrConstant(
              inputs[0], convert_indices_to_expr_handle(indices));
-        return exp(inp - max->load(remove_softmax_dim_index(indices)));
+        return exp(inp - max.load(remove_softmax_dim_index(indices)));
        });
    auto sum = Reduce(
        "aten_softmax_sum",
        non_softmax_dims,
        Sum(),
        [&](ParameterList& indices) {
-        return e->load(move_softmax_dim_index_to_pos(indices));
+        return e.load(move_softmax_dim_index_to_pos(indices));
        },
        {output_dims[softmax_dim]});
    if (!log_softmax) {
      auto result =
          Compute("aten_softmax", output_dims, [&](ParameterList& indices) {
-          return e->load(indices) /
-              sum->load(remove_softmax_dim_index(indices));
+          return e.load(indices) / sum.load(remove_softmax_dim_index(indices));
          });
-    return new Tensor(
-        result->buf(),
+    return Tensor(
+        result.buf(),
          alloc<tensorexpr::Block>(std::vector<StmtPtr>(
-            {max->stmt(), e->stmt(), sum->stmt(), result->stmt()})));
+            {max.stmt(), e.stmt(), sum.stmt(), result.stmt()})));
    }
  
    auto log_sum = Compute(
        "aten_softmax_log_sum", non_softmax_dims, [&](ParameterList& indices) {
-        return log(sum->load(indices));
+        return log(sum.load(indices));
        });
    auto result =
        Compute("aten_log_softmax", output_dims, [&](ParameterList& indices) {
          auto inp = tensorOrConstant(
              inputs[0], convert_indices_to_expr_handle(indices));
          auto non_softmax_indices = remove_softmax_dim_index(indices);
-        return inp - max->load(non_softmax_indices) -
-            log_sum->load(non_softmax_indices);
+        return inp - max.load(non_softmax_indices) -
+            log_sum.load(non_softmax_indices);
        });
-  return new Tensor(
-      result->buf(),
+  return Tensor(
+      result.buf(),
        alloc<tensorexpr::Block>(std::vector<StmtPtr>(
-          {max->stmt(),
-           e->stmt(),
-           sum->stmt(),
-           log_sum->stmt(),
-           result->stmt()})));
+          {max.stmt(), e.stmt(), sum.stmt(), log_sum.stmt(), result.stmt()})));
  }
  
  } // namespace tensorexpr
diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.h b/torch/csrc/jit/tensorexpr/operators/softmax.h

index 07ddd0f..b74a867 100644 (file)
--- a/torch/csrc/jit/tensorexpr/operators/softmax.h
+++ b/torch/csrc/jit/tensorexpr/operators/softmax.h
@@ -6,7 +6,7 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-Tensor* computeSoftmax(
+Tensor computeSoftmax(
      const std::vector<ArgValue>& inputs,
      const std::vector<ExprHandle>& outputShape,
      bool log_softmax);
diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp

index 9df70f8..ea3902d 100644 (file)
--- a/torch/csrc/jit/tensorexpr/tensor.cpp
+++ b/torch/csrc/jit/tensorexpr/tensor.cpp
@@ -48,7 +48,7 @@ StmtPtr Tensor::constructStmt(
    return s;
  }
  
-Tensor* Compute(
+Tensor Compute(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(const std::vector<VarHandle>&)>& body_func) {
@@ -57,10 +57,10 @@ Tensor* Compute(
    unpack_dim_args(dim_args, &dims, &args);
    ExprPtr body = body_func(VarVectorToVarHandleVector(args)).node();
    BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
  }
  
-Tensor* Compute(
+Tensor Compute(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(const VarHandle&)>& body_func) {
@@ -73,10 +73,10 @@ Tensor* Compute(
    unpack_dim_args(dim_args, &dims, &args);
    ExprPtr body = body_func(VarHandle(args[0])).node();
    BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
  }
  
-Tensor* Compute(
+Tensor Compute(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(const VarHandle&, const VarHandle&)>&
@@ -89,10 +89,10 @@ Tensor* Compute(
    unpack_dim_args(dim_args, &dims, &args);
    ExprPtr body = body_func(VarHandle(args[0]), VarHandle(args[1])).node();
    BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
  }
  
-Tensor* Compute(
+Tensor Compute(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const std::function<
@@ -108,10 +108,10 @@ Tensor* Compute(
        body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2]))
            .node();
    BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
  }
  
-Tensor* Compute(
+Tensor Compute(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(
@@ -132,10 +132,10 @@ Tensor* Compute(
                       VarHandle(args[3]))
                       .node();
    BufPtr buf = alloc<Buf>(name, dims, body->dtype());
-  return new Tensor(buf, args, body);
+  return Tensor(buf, args, body);
  }
  
-Tensor* Reduce(
+Tensor Reduce(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
@@ -149,7 +149,7 @@ Tensor* Reduce(
        reduce_args);
  }
  
-Tensor* Reduce(
+Tensor Reduce(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
@@ -163,17 +163,17 @@ Tensor* Reduce(
        reduce_args);
  }
  
-Tensor* Reduce(
+Tensor Reduce(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
-    Tensor* tensor,
+    Tensor tensor,
      const std::vector<DimArg>& reduce_args) {
    return Reduce(
        name,
        dim_args,
        reducer,
-      [&](ParameterList& p) { return tensor->load(p); },
+      [&](ParameterList& p) { return tensor.load(p); },
        reduce_args);
  }
  
diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h

index 3eb02c6..8d8ffe5 100644 (file)
--- a/torch/csrc/jit/tensorexpr/tensor.h
+++ b/torch/csrc/jit/tensorexpr/tensor.h
@@ -12,7 +12,7 @@ namespace torch {
  namespace jit {
  namespace tensorexpr {
  
-class TORCH_API Tensor : KernelScopedObject {
+class TORCH_API Tensor {
   public:
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    Tensor(BufPtr buf, const std::vector<VarPtr>& args, ExprPtr body)
@@ -42,9 +42,9 @@ class TORCH_API Tensor : KernelScopedObject {
    }
  
    template <typename T>
-  inline ExprHandle load(const std::vector<T>& args);
+  inline ExprHandle load(const std::vector<T>& args) const;
    template <typename... Ts>
-  inline ExprHandle load(const Ts&... ts);
+  inline ExprHandle load(const Ts&... ts) const;
  
   private:
    StmtPtr constructStmt(
@@ -134,22 +134,22 @@ class Placeholder {
    std::vector<ExprPtr> strides_;
  };
  
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(const VarHandle&)>& body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(const VarHandle&, const VarHandle&)>&
          body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const std::function<
          ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>&
          body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(
@@ -157,7 +157,7 @@ TORCH_API Tensor* Compute(
          const VarHandle&,
          const VarHandle&,
          const VarHandle&)>& body_func);
-TORCH_API Tensor* Compute(
+TORCH_API Tensor Compute(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const std::function<ExprHandle(const std::vector<VarHandle>&)>& body_func);
@@ -179,7 +179,7 @@ inline void unpack_dim_args(
  
  // Handle reductions over a Reducer and a body_func which produces values.
  template <typename InitFunc, typename BodyFunc>
-Tensor* Reduce(
+Tensor Reduce(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
@@ -207,7 +207,7 @@ Tensor* Reduce(
              .node();
      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
      BufPtr func_result = alloc<Buf>(func_name, dims, body->dtype());
-    return new Tensor(func_result, vars, body);
+    return Tensor(func_result, vars, body);
    }
  
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -227,13 +227,12 @@ Tensor* Reduce(
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    ReduceOpPtr reduce_op = reducer(func_result, body, output_args, reduce_vars);
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  Tensor* t =
-      new Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op);
+  Tensor t = Tensor(func_result, vars, reduce_dims, reduce_vars, reduce_op);
    return t;
  }
  
  template <typename BodyFunc>
-Tensor* Reduce(
+Tensor Reduce(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
@@ -250,7 +249,7 @@ Tensor* Reduce(
  
  // Overload which allows inline lambda functions for the body_func.
  template <typename BodyFunc>
-Tensor* Reduce(
+Tensor Reduce(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
@@ -260,14 +259,14 @@ Tensor* Reduce(
  }
  
  // Overload for the common case of all dimensions of a Placeholder.
-TORCH_API Tensor* Reduce(
+TORCH_API Tensor Reduce(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
      const Placeholder& buffer,
      const std::vector<DimArg>& reduce_args);
  
-TORCH_API Tensor* Reduce(
+TORCH_API Tensor Reduce(
      const std::string& name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
@@ -276,22 +275,22 @@ TORCH_API Tensor* Reduce(
  
  // Overload for the common case of all dimensions of a previously Computed
  // Tensor.
-TORCH_API Tensor* Reduce(
+TORCH_API Tensor Reduce(
      const std::string& func_name,
      const std::vector<DimArg>& dim_args,
      const Reducer& reducer,
-    Tensor* tensor,
+    Tensor tensor,
      const std::vector<DimArg>& reduce_args);
  
  template <typename... Ts>
-inline ExprHandle Tensor::load(const Ts&... ts) {
+inline ExprHandle Tensor::load(const Ts&... ts) const {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    std::vector<ExprHandle> params({ExprHandle(ts)...});
    return Load::make(BufHandle(this->buf()), params);
  }
  
  template <typename T>
-inline ExprHandle Tensor::load(const std::vector<T>& args) {
+inline ExprHandle Tensor::load(const std::vector<T>& args) const {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    std::vector<ExprHandle> params(args.begin(), args.end());
    return Load::make(BufHandle(this->buf()), params);
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp

index 4e1618a..c380233 100644 (file)
--- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
+++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
@@ -185,9 +185,9 @@ void initTensorExprBindings(PyObject* module) {
               const std::vector<ExprHandle>& args,
               const ExprHandle& val) { return self.store(args, val); })
        .def("data", [](Placeholder& self) { return BufHandle(self.data()); });
-  py::class_<Tensor, std::unique_ptr<Tensor, py::nodelete>>(te, "Tensor")
-      .def(py::init(
-          [](BufHandle& b, StmtPtr s) { return new Tensor(b.node(), s); }))
+  py::class_<Tensor>(te, "Tensor")
+      .def(
+          py::init([](BufHandle& b, StmtPtr s) { return Tensor(b.node(), s); }))
        .def(
            "load",
            [](Tensor& self, const std::vector<ExprHandle>& v) {
@@ -268,7 +268,7 @@ void initTensorExprBindings(PyObject* module) {
        [](const std::string& func_name,
           const std::vector<DimArg>& dim_args,
           const Reducer& reducer,
-         Tensor* buffer,
+         Tensor buffer,
           const std::vector<DimArg>& reduce_args) {
          return Reduce(func_name, dim_args, reducer, buffer, reduce_args);
        },
@@ -380,7 +380,7 @@ void initTensorExprBindings(PyObject* module) {
        .def(py::init(&ExternalCall::make));
  
    py::class_<LoopNest>(te, "LoopNest")
-      .def(py::init<const std::vector<Tensor*>&>())
+      .def(py::init<const std::vector<Tensor>&>())
        .def(py::init([](StmtPtr s, const std::vector<BufHandle>& bufs) {
          std::unordered_set<BufPtr> buf_nodes;
          for (auto& buf : bufs) {
@@ -392,9 +392,7 @@ void initTensorExprBindings(PyObject* module) {
        .def("prepare_for_codegen", &LoopNest::prepareForCodegen)
        .def(
            "get_loop_body_for",
-          [](const LoopNest& self, Tensor* t) {
-            return self.getLoopBodyFor(t);
-          },
+          [](const LoopNest& self, Tensor t) { return self.getLoopBodyFor(t); },
            py::return_value_policy::reference)
        .def(
            "get_loop_body_for",
@@ -404,7 +402,7 @@ void initTensorExprBindings(PyObject* module) {
            py::return_value_policy::reference)
        .def(
            "get_loops_for",
-          [](const LoopNest& self, Tensor* t) {
+          [](const LoopNest& self, Tensor t) {
              return self.getLoopStmtsFor(t);
            },
            py::return_value_policy::reference)
@@ -760,12 +758,12 @@ void initTensorExprBindings(PyObject* module) {
  
    py::class_<CodeGen::BufferArg>(te, "BufferArg")
        .def(py::init<const Placeholder&>())
-      .def(py::init<Tensor*>())
+      .def(py::init<Tensor>())
        .def(py::init<const VarHandle&>())
        .def(py::init<const BufHandle&>());
  
    py::implicitly_convertible<Placeholder, CodeGen::BufferArg>();
-  py::implicitly_convertible<Tensor*, CodeGen::BufferArg>();
+  py::implicitly_convertible<Tensor, CodeGen::BufferArg>();
    py::implicitly_convertible<VarHandle, CodeGen::BufferArg>();
    py::implicitly_convertible<BufHandle, CodeGen::BufferArg>();
author	Mikhail Zolotukhin <mvz@fb.com>
	Tue, 24 Aug 2021 07:29:22 +0000 (00:29 -0700)
committer	Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
	Tue, 24 Aug 2021 07:32:13 +0000 (00:32 -0700)
benchmarks/cpp/tensorexpr/bench_approx.cpp		patch \| blob \| history
benchmarks/cpp/tensorexpr/bench_batchnorm.cpp		patch \| blob \| history
benchmarks/cpp/tensorexpr/bench_compile.cpp		patch \| blob \| history
benchmarks/cpp/tensorexpr/bench_concat.cpp		patch \| blob \| history
benchmarks/cpp/tensorexpr/bench_gemm.cpp		patch \| blob \| history
benchmarks/cpp/tensorexpr/bench_parallel.cpp		patch \| blob \| history
benchmarks/cpp/tensorexpr/bench_reduce.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_approx.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_boundsinference.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_conv.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_cuda.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_external_calls.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_ir_printer.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_kernel.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_llvm.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_loopnest.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_memdependency.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_ops.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_reductions.cpp		patch \| blob \| history
test/cpp/tensorexpr/test_simplify.cpp		patch \| blob \| history
test/cpp/tensorexpr/tutorial.cpp		patch \| blob \| history
torch/csrc/jit/runtime/static/ops.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/codegen.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/ir_printer.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/ir_printer.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/kernel.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/kernel.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/loopnest.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/loopnest.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/conv2d.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/conv2d.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/matmul.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/matmul.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/norm.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/norm.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/reduction.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/reduction.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/softmax.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/operators/softmax.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/tensor.cpp		patch \| blob \| history
torch/csrc/jit/tensorexpr/tensor.h		patch \| blob \| history
torch/csrc/jit/tensorexpr/tensorexpr_init.cpp		patch \| blob \| history