void vectorize(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target, int width) {
auto loops = ln->getLoopStmtsFor(target);
- For *inner, *tail;
+ ForPtr inner, tail;
ln->splitWithTail(loops[0], width, &inner, &tail);
ln->vectorize(inner);
}
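// A sketch of what this helper produces (variable names illustrative):
// splitWithTail factors the loop into an inner loop of extent `width` plus a
// remainder loop for leftover iterations, and vectorize() then turns the
// inner loop body into width-lane vector operations.
//   before: for (i = 0; i < N; i++) B[i] = ...;
//   after:  for (i_out = 0; i_out < N / width; i_out++)
//             B[i_out * width + 0 .. width - 1] = ...;  // vectorized
//           for (i_tail = 0; i_tail < N % width; i_tail++)
//             B[(N / width) * width + i_tail] = ...;    // scalar tail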
void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor* target) {
- std::vector<For*> loops = ln->getLoopStmtsFor(target);
- For *inner, *tail;
+ std::vector<ForPtr> loops = ln->getLoopStmtsFor(target);
+ ForPtr inner, tail;
ln->splitWithTail(loops[0], 16 * 8, &inner, &tail);
- For* outer = loops[0];
+ ForPtr outer = loops[0];
ln->vectorize(inner);
ln->splitWithTail(outer, 8, &inner, &tail);
- Stmt* unrolled;
+ StmtPtr unrolled;
LoopNest::unroll(inner, &unrolled);
}
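// Net effect of optimizePointwise, roughly (assuming the extent divides
// evenly; otherwise splitWithTail leaves remainder loops behind):
//   for (i_out_out ...)              // outer loop from the second split
//     // body repeated 8x by LoopNest::unroll
//     <128-element vectorized block>
// The 16 * 8 = 128 split gives LLVM a wide vector body to legalize down to
// machine vectors, and unrolling the 8-iteration middle loop amortizes loop
// overhead across the unrolled copies.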
LoopNest ln({B});
optimizePointwise(&ln, B);
ln.prepareForCodegen();
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
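// How these benchmark fragments typically finish, as a hedged sketch (the
// LLVMCodeGen/CallArg pattern appears in later hunks of this diff; names and
// sizes here are illustrative, not part of the excerpt):
//   LLVMCodeGen cg(s, args);
//   std::vector<float> B_data(kSize);
//   cg.call({B_data});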
LoopNest ln({B});
ln.prepareForCodegen();
vectorize(&ln, B, 8);
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
LoopNest ln({B});
optimizePointwise(&ln, B);
ln.prepareForCodegen();
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
LoopNest ln({B});
vectorize(&ln, B, 8);
ln.prepareForCodegen();
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
LoopNest ln({B});
ln.prepareForCodegen();
optimizePointwise(&ln, B);
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
LoopNest ln({B});
ln.prepareForCodegen();
optimizePointwise(&ln, B);
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
LoopNest ln({B});
ln.prepareForCodegen();
vectorize(&ln, B, 16);
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
LoopNest ln({B});
optimizePointwise(&ln, B);
ln.prepareForCodegen();
- Stmt* s = ln.root_stmt();
+ StmtPtr s = ln.root_stmt();
s = torch::jit::tensorexpr::IRSimplifier::simplify(s);
std::vector<CodeGen::BufferArg> args;
args.emplace_back(B);
loops = nest.getLoopStmtsFor(output);
loops[0]->set_parallel();
nest.prepareForCodegen();
- Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+ StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
std::vector<CodeGen::CallArg> args;
});
LoopNest nest({output});
nest.prepareForCodegen();
- Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+ StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
LLVMCodeGen cg(s, {input, weight, bias, mean, var, output, eps});
std::vector<CodeGen::CallArg> args;
nest.computeInline(tensor->buf());
}
nest.prepareForCodegen();
- te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt());
+ te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
te::LLVMCodeGen cg(s, {A, sixth, n});
}
}
nest.computeInline(tensor->buf());
}
nest.prepareForCodegen();
- te::Stmt* s = te::IRSimplifier::simplify(nest.root_stmt());
+ te::StmtPtr s = te::IRSimplifier::simplify(nest.root_stmt());
for (auto _ : state) {
te::LLVMCodeGen cg(s, {A, sixth, n});
}
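// computeInline(tensor->buf()) substitutes the producer's defining expression
// into every consumer and removes the producer's loop nest. Sketch
// (illustrative):
//   before: for (i) T[i] = A[i] * c;
//           for (i) out[i] = T[i] + 1.f;
//   after:  for (i) out[i] = A[i] * c + 1.f;
// The intermediate buffer disappears at the cost of recomputing T's
// expression at each use.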
});
LoopNest nest({output});
nest.prepareForCodegen();
- Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+ StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
buf_args.push_back(output);
LLVMCodeGen cg(s, buf_args);
TORCH_INTERNAL_ASSERT(concat_dim_ == 1);
- auto output_buf = new Buf(
- new Var("aten_cat", kHandle),
- {new IntImm(output_size_[0]), new IntImm(output_size_[1])},
+ auto output_buf = alloc<Buf>(
+ alloc<Var>("aten_cat", kHandle),
+ std::vector<ExprPtr>(
+ {alloc<IntImm>(output_size_[0]), alloc<IntImm>(output_size_[1])}),
kFloat);
std::vector<Placeholder> inputs;
- std::vector<Stmt*> for_stmts(num_inputs);
+ std::vector<StmtPtr> for_stmts(num_inputs);
int cumulative_input_sizes = 0;
for (size_t i = 0; i < num_inputs; ++i) {
inputs.emplace_back(Placeholder(
"input" + std::to_string(i),
kFloat,
{input_sizes_[i][0], input_sizes_[i][1]}));
- std::vector<Var*> for_vars(num_inputs);
+ std::vector<VarPtr> for_vars(num_inputs);
for (size_t d = 0; d < num_dims; ++d) {
for_vars[d] =
- new Var("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
+ alloc<Var>("i" + std::to_string(i) + "_" + std::to_string(d), kInt);
}
- auto store = new Store(
+ auto store = alloc<Store>(
output_buf,
- {for_vars[0],
- new Add(for_vars[1], new IntImm(cumulative_input_sizes))},
- new Load(inputs[i].data(), {for_vars[0], for_vars[1]}));
- auto for_st = new For(
+ std::vector<ExprPtr>(
+ {for_vars[0],
+ alloc<Add>(for_vars[1], alloc<IntImm>(cumulative_input_sizes))}),
+ alloc<Load>(
+ inputs[i].data(),
+ std::vector<ExprPtr>({for_vars[0], for_vars[1]})));
+ auto for_st = alloc<For>(
for_vars[0],
- new IntImm(0),
- new IntImm(input_sizes_[i][0]),
- new For(
+ alloc<IntImm>(0),
+ alloc<IntImm>(input_sizes_[i][0]),
+ alloc<For>(
for_vars[1],
- new IntImm(0),
- new IntImm(input_sizes_[i][1]),
+ alloc<IntImm>(0),
+ alloc<IntImm>(input_sizes_[i][1]),
store));
for_stmts[i] = for_st;
cumulative_input_sizes += input_sizes_[i][1];
}
- auto output = new Tensor(output_buf, new Block(for_stmts));
+ auto output = new Tensor(output_buf, alloc<Block>(for_stmts));
LoopNest nest({output});
nest.prepareForCodegen();
nest.vectorizeInnerLoops();
- Stmt* s = IRSimplifier::simplify(nest.root_stmt());
+ StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args(inputs.begin(), inputs.end());
buf_args.push_back(output);
LLVMCodeGen cg(s, buf_args);
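// The hand-built IR above is one two-level loop nest per concat input, each
// copying into a distinct column range of the output; cumulative_input_sizes
// supplies the growing column offset. Sketch for two inputs of widths W0, W1:
//   for (i0_0) for (i0_1) aten_cat[i0_0, i0_1] = input0[i0_0, i0_1];
//   for (i1_0) for (i1_1) aten_cat[i1_0, i1_1 + W0] = input1[i1_0, i1_1];
// Note that for_vars is sized by num_inputs but indexed by d < num_dims,
// which is only safe while num_dims <= num_inputs holds (it does here, with
// two dimensions).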
{{K, "K"}});
te::LoopNest loop({CT});
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* m = loops[0];
+ te::ForPtr m = loops[0];
loop.splitWithMask(m, 32);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* n = loops[2];
+ te::ForPtr n = loops[2];
loop.splitWithMask(n, 32);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[1];
- te::For* no = loops[2];
+ te::ForPtr mi = loops[1];
+ te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* ni = loops[3];
- te::For* k = loops[4];
+ te::ForPtr ni = loops[3];
+ te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[2];
- te::For* k = loops[3];
+ te::ForPtr mi = loops[2];
+ te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
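// The reorder sequence above takes the post-split (mo, mi, no, ni, k) order
// to (mo, no, k, mi, ni): hoisting k above the mi x ni tile lets one tile of
// CT accumulate in registers while the k-reduction streams through AP and BP.
// Sketch of the final nest (index math illustrative, given the 32/32 splits):
//   for (mo) for (no)
//     for (k)
//       for (mi) for (ni)
//         CT[mo*32+mi][no*32+ni] += AP[mo*32+mi][k] * BP[k][no*32+ni];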
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* m = loops[0];
+ te::ForPtr m = loops[0];
loop.splitWithMask(m, 4);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* n = loops[2];
+ te::ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[1];
- te::For* no = loops[2];
+ te::ForPtr mi = loops[1];
+ te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* ni = loops[3];
- te::For* k = loops[4];
+ te::ForPtr ni = loops[3];
+ te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[2];
- te::For* k = loops[3];
+ te::ForPtr mi = loops[2];
+ te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* m = loops[0];
+ te::ForPtr m = loops[0];
loop.splitWithMask(m, 4);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* n = loops[2];
+ te::ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[1];
- te::For* no = loops[2];
+ te::ForPtr mi = loops[1];
+ te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* ni = loops[3];
- te::For* k = loops[4];
+ te::ForPtr ni = loops[3];
+ te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[2];
- te::For* k = loops[3];
+ te::ForPtr mi = loops[2];
+ te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[3];
- te::For* ni = loops[4];
- te::Stmt* unrolled;
+ te::ForPtr mi = loops[3];
+ te::ForPtr ni = loops[4];
+ te::StmtPtr unrolled;
loop.vectorize(ni);
loop.unroll(mi, &unrolled);
}
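// With ni vectorized and mi unrolled, the mi x ni tile becomes a small
// register-blocked microkernel: given the split factors of 4 and 16 above,
// each k step issues 4 unrolled rows of 16-lane vector multiply-adds. Sketch:
//   for (mo) for (no) for (k) {
//     CT_row0[0:16] += AP[.., k] * BP[k, 0:16];  // vectorized, mi = 0
//     ...                                        // unrolled copies mi = 1..3
//   }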
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* m = loops[0];
+ te::ForPtr m = loops[0];
loop.splitWithMask(m, 4);
}
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* n = loops[2];
+ te::ForPtr n = loops[2];
loop.splitWithMask(n, 16);
}
// mo, mi, no, ni, k ->
// mo, no, mi, ni, k
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[1];
- te::For* no = loops[2];
+ te::ForPtr mi = loops[1];
+ te::ForPtr no = loops[2];
loop.reorderAxis(mi, no);
}
// mo, no, mi, ni, k ->
// mo, no, mi, k, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* ni = loops[3];
- te::For* k = loops[4];
+ te::ForPtr ni = loops[3];
+ te::ForPtr k = loops[4];
loop.reorderAxis(ni, k);
}
// mo, no, mi, k, ni ->
// mo, no, k, mi, ni
{
auto const& loops = loop.getLoopStmtsFor(CT);
- te::For* mi = loops[2];
- te::For* k = loops[3];
+ te::ForPtr mi = loops[2];
+ te::ForPtr k = loops[3];
loop.reorderAxis(mi, k);
}
{
}
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BP, CT});
});
LoopNest loop_nest({c_tensor});
auto const& loops = loop_nest.getLoopStmtsFor(c_tensor);
- For* m = loops[0];
+ ForPtr m = loops[0];
m->set_parallel();
loop_nest.prepareForCodegen();
- Stmt* stmt = loop_nest.root_stmt();
+ StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {c_tensor, a_buf, b_buf});
float* a_ptr = A.data_ptr<float>();
te::LoopNest loop({BT});
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
{
auto const& loops = loop.getLoopStmtsFor(BT);
- te::For* m = loops[1];
+ te::ForPtr m = loops[1];
loop.splitWithTail(m, kChunkSize);
}
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
{
auto const& loops = loop.getLoopStmtsFor(BT);
- te::For* m = loops[1];
+ te::ForPtr m = loops[1];
loop.splitWithMask(m, kChunkSize);
}
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
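// splitWithTail vs splitWithMask, side by side (hedged sketch): the former
// emits a separate remainder loop, the latter keeps a single nest and guards
// the body with a predicate when kChunkSize does not divide the extent.
//   tail: for (mo) for (mi) body(mo * C + mi);
//         for (mt) body((M / C) * C + mt);
//   mask: for (mo) for (mi) if (mo * C + mi < M) body(mo * C + mi);
// The masked form trades a branch in the body for not duplicating it.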
{{M, "M"}});
te::LoopNest loop({BT});
- te::Buf* rfac_buf;
+ te::BufPtr rfac_buf;
auto loops = loop.getLoopStmtsFor(BT);
TORCH_CHECK(loops.size() == 1);
- te::For* mi;
+ te::ForPtr mi;
loop.splitWithMask(loops.at(0), kChunkSize, &mi);
- te::For* mo = loops.at(0);
+ te::ForPtr mo = loops.at(0);
loop.reorderAxis(mo, mi);
loops = loop.getLoopStmtsFor(BT);
- auto bt_body = const_cast<te::Stmt*>(loop.getAllWritesToBuf(BT->buf())[1]);
+ auto bt_body = loop.getAllWritesToBuf(BT->buf())[1];
TORCH_CHECK(loop.rfactor(bt_body, loops.at(0), &rfac_buf));
loop.reorderAxis(loops.at(0), loops.at(1));
loop.vectorize(loops.at(1));
loop.prepareForCodegen();
- te::Stmt* s = loop.root_stmt();
+ te::StmtPtr s = loop.root_stmt();
s = te::IRSimplifier::simplify(s);
auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
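// The rfactor schedule, roughly: the split/reorder exposes kChunkSize
// independent accumulation lanes, rfactor materializes them into rfac_buf,
// and a final loop reduces rfac_buf into BT. Sketch:
//   for (mi)                                    // kChunkSize partial sums
//     for (mo) rfac_buf[mi] += AP[mo * kChunkSize + mi];
//   for (mi) BT[0] += rfac_buf[mi];
// The loop over independent lanes is what vectorize() above can legally
// target; the original single serial accumulator could not be vectorized.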
te::LoopNest nest({b});
auto loops = nest.getLoopStmtsFor(b);
- te::For *mi, *mo;
- te::Buf *rf;
+ te::ForPtr mi, mo;
+ te::BufPtr rf;
nest.splitWithMask(loops[0], kChunkSize, &mi);
loops = nest.reorder({loops[0], mi}, {1, 0});
nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf);
auto sch = state.range(2);
if (sch == 1) {
auto loops = nest.getLoopStmtsFor(b);
- te::For *mi, *mo;
- te::Buf *rf;
+ te::ForPtr mi, mo;
+ te::BufPtr rf;
nest.splitWithMask(loops[1], kChunkSize, &mi);
loops = nest.reorder({loops[1], mi}, {1, 0});
TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
nest.reorderAxis(loops[1], loops[2]);
} else if (sch == 3) {
auto loops = nest.getLoopStmtsFor(b);
- te::For *mi, *mo;
- te::Buf *rf;
+ te::ForPtr mi, mo;
+ te::BufPtr rf;
nest.splitWithMask(loops[1], kChunkSize, &mi);
loops = nest.reorder({loops[1], mi}, {1, 0});
TORCH_CHECK(nest.rfactor(nest.getLoopBodyFor(b), loops[0], &rf));
[=](const VarHandle& m, const VarHandle& n) {
return t3->load(m, n) + m + n;
});
- LoopNest loop_nest({t4}, {t1, t2, t3, t4});
+ LoopNest loop_nest(std::vector<Tensor*>({t4}), {t1, t2, t3, t4});
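// A note on the recurring constructor change in this hunk and the ones below:
// with a braced list like {t4}, overload resolution between the LoopNest
// constructors can become ambiguous after the pointer migration, so the diff
// spells out std::vector<Tensor*> to pin the
// LoopNest(std::vector<Tensor*>, std::vector<Tensor*>) overload. (The exact
// conflicting overload is not shown in this excerpt.)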
std::vector<ForPtr> loop_list;
{
auto const& loops = loop_nest.getLoopStmtsFor(t1);
return c->load(m, n, k) + 1;
});
- LoopNest l({d}, {c, d});
+ LoopNest l(std::vector<Tensor*>({d}), {c, d});
l.prepareForCodegen();
StmtPtr stmt = l.root_stmt();
std::ostringstream oss;
return c_buf.load(m, n) * d_buf.load(m, k) + x->load(m, n, k);
});
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
LoopNest l2(l1);
l2.computeInline(x->buf());
return x->load(m, n, k) + y->load(m, n, k);
});
- LoopNest l({z}, {x, y, z});
+ LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
for (const std::string& order : inline_order) {
if (order == "x") {
l.computeInline(x->buf());
return x->load(m, n, k) + x->load(m, n, k);
});
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
l1.computeInline(x->buf());
// would normally compare results but Rand isn't implemented in the
Intrinsics::make(kRand, kInt);
});
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
l1.computeInline(x->buf());
// would normally compare results but Rand isn't implemented in the
return x->load(m) + x->load(m);
});
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
l1.computeInline(x->buf());
// would normally compare results but Rand isn't implemented in the
}
}
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
LoopNest l2(l1);
l2.computeInline(x->buf());
return Intrinsics::make(kSqrt, x->load(m, n, k));
});
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
l1.computeInline(x->buf());
StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt());
return a->load(j + ExprHandle(8));
});
- LoopNest l({b}, {a, b});
+ LoopNest l(std::vector<Tensor*>({b}), {a, b});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
LoopNest::splitWithMask(loops[0], 4);
ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
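// Why inlining fails here: after splitWithMask(loops[0], 4) the store into
// `a` writes at a compound index, roughly
//   a[i_outer * 4 + i_inner] = ...;
// and computeInline cannot substitute a definition whose store index is not a
// plain loop variable, hence the "compound indices" error asserted above.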
return a->load(j + ExprHandle(8));
});
- LoopNest l({b}, {a, b});
+ LoopNest l(std::vector<Tensor*>({b}), {a, b});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
LoopNest::splitWithMask(loops[0], 3);
l.computeInline(a->buf());
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
ForPtr i_inner;
- LoopNest l({b}, {a, b});
+ LoopNest l(std::vector<Tensor*>({b}), {a, b});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
LoopNest::splitWithMask(loops[0], 4, &i_inner);
LoopNest::splitWithMask(i_inner, 2);
return a->load(j + ExprHandle(8));
});
- LoopNest l({b}, {a, b});
+ LoopNest l(std::vector<Tensor*>({b}), {a, b});
l.computeInline(a->buf());
std::vector<ForPtr> loops = NodeFinder<For>::find(l.root_stmt());
return a->load(j + ExprHandle(8));
});
- LoopNest l({b}, {a, b});
+ LoopNest l(std::vector<Tensor*>({b}), {a, b});
auto loops = NodeFinder<For>::find(l.root_stmt());
LoopNest::splitWithMask(loops.back(), 2);
l.computeInline(a->buf());
return a->load(j) - ExprHandle(1);
});
- LoopNest l({b}, {a, b});
+ LoopNest l(std::vector<Tensor*>({b}), {a, b});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
LoopNest::splitWithMask(loops[0], 4);
ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices");
return a->load(k) * b->load(l);
});
- LoopNest l({c}, {a, b, c});
+ LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
l.computeInline(a->buf());
l.prepareForCodegen();
return a->load(k) * b->load(l);
});
- LoopNest l({c}, {a, b, c});
+ LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
l.computeInline(a->buf());
l.computeInline(b->buf());
return a->load(k) * b->load(l);
});
- LoopNest l({c}, {a, b, c});
+ LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
l.computeInline(b->buf());
l.prepareForCodegen();
return a->load(k) * b->load(l);
});
- LoopNest l({c}, {a, b, c});
+ LoopNest l(std::vector<Tensor*>({c}), {a, b, c});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(a->buf()).at(0);
LoopNest::splitWithMask(loops[0], 4);
loops = l.getAllLoopNestsWritingToBuf(b->buf()).at(0);
return f->load(i) + d.load(i);
});
- LoopNest l({g}, {e, f, g});
+ LoopNest l(std::vector<Tensor*>({g}), {e, f, g});
l.computeInline(l.getLoopBodyFor(e));
l.computeInline(l.getLoopBodyFor(f));
l.prepareForCodegen();
"A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; });
Tensor* B = Compute(
"B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A->load(i_b); });
- LoopNest l({B}, {A, B});
+ LoopNest l(std::vector<Tensor*>({B}), {A, B});
std::vector<ForPtr> loops = l.getAllLoopNestsWritingToBuf(B->buf()).at(0);
LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);
l.prepareForCodegen();
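// computeAt(l.getLoopBodyFor(A), loops[0]) moves A's computation inside B's
// loop instead of leaving it as a separate nest. Sketch (temp sizing
// illustrative):
//   for (i_b) {
//     temp[0] = i_b * i_b;  // A recomputed at this axis
//     B[i_b] = temp[0];
//   }
// A's footprint shrinks from N elements to the slice one iteration of the
// target loop actually consumes.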
c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
}
}
- LoopNest orig_loopnest({c}, {p, c});
+ LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
{
// First let's try to compute P at axis cy (the outer loop)
}
}
- LoopNest orig_loopnest({D}, {A, B, C, D});
+ LoopNest orig_loopnest(std::vector<Tensor*>({D}), {A, B, C, D});
{
// First let's try to compute A at axis dy (the outer loop)
LoopNest l(orig_loopnest);
c_ref[y * kW + x] = y * x + (y + 1) * x + y * (x + 1) + (y + 1) * (x + 1);
}
}
- LoopNest orig_loopnest({c}, {p, c});
+ LoopNest orig_loopnest(std::vector<Tensor*>({c}), {p, c});
checkIR(orig_loopnest.root_stmt(), R"IR(
# CHECK: for (int py = 0; py < H + 1; py++) {
# CHECK: for (int px = 0; px < W + 1; px++) {
return x->load(m, n, k) + y->load(m, n, k);
});
- LoopNest l({z}, {x, y, z});
+ LoopNest l(std::vector<Tensor*>({z}), {x, y, z});
ForPtr a = nullptr;
ForPtr b = nullptr;
auto fors = NodeFinder<For>::find(l.root_stmt());
Block::make(
{Store::make(a_buf, {x}, x * 2),
Store::make(b_buf, {x}, Load::make(a_buf, {x}))}));
- Block::make({f});
+ auto parent_block = Block::make({f});
StmtPtr unrolled = nullptr;
LoopNest::unroll(f, &unrolled);
checkIR(unrolled, R"IR(
{Let::make(e, 7),
Store::make(a_buf, {x}, e),
Store::make(b_buf, {x}, e + 1)}));
- Block::make({f});
+ auto parent_block = Block::make({f});
StmtPtr unrolled = nullptr;
LoopNest::unroll(f, &unrolled);
std::ostringstream oss;
"reshape",
{{kTotalSize / 2, "i"}, {2, "j"}},
[&](const VarHandle& i, const VarHandle& j) { return a->load(i, j); });
- LoopNest l({reshape}, {a, reshape});
+ LoopNest l(std::vector<Tensor*>({reshape}), {a, reshape});
ASSERT_THROWS_WITH(
l.computeInline(l.getLoopBodyFor(a)),
"Placeholder indexed access is inconsistent with its rank");
return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
});
- LoopNest l({B, C}, {A, B, C});
+ LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
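// cacheAccesses stages A through a local buffer scoped to j_loop. Sketch of
// the rewritten region (bounds illustrative):
//   Allocate(A_local, ...);
//   for (...) A_local[...] = A[...];             // fill the cache
//   for (j)   B[i, j] = A_local[...] + A_local[...];
//   Free(A_local);
// The two offset reads of A per iteration then hit the small local buffer
// rather than the full tensor.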
return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
});
- LoopNest l({B, C}, {A, B, C});
+ LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
StmtPtr i_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][0];
LoopNest::cacheAccesses(A->buf(), "A_local", i_loop);
return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
});
- LoopNest l({B, C}, {A, B, C});
+ LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B->buf())[0][1];
LoopNest::cacheAccesses(A->buf(), "A_local", j_loop);
l.prepareForCodegen();
return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
});
- LoopNest l({B, C}, {A, B, C});
+ LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
StmtPtr body = l.getLoopBodyFor(B);
LoopNest::cacheAccesses(A->buf(), "A_local", body);
l.prepareForCodegen();
return A->load(i + 10, j + 20) + A->load(i + 30, j + 40);
});
- LoopNest l({B, C}, {A, B, C});
+ LoopNest l(std::vector<Tensor*>({B, C}), {A, B, C});
StmtPtr a_loop = l.getAllLoopNestsWritingToBuf(A->buf())[0][1];
LoopNest::cacheAccesses(A->buf(), "A_local", a_loop);
return y->load(m, n, o);
});
- LoopNest l({z}, {y, z});
+ LoopNest l(std::vector<Tensor*>({z}), {y, z});
l.simplify();
ASSERT_TRUE(l.computeInline(y->buf()));
}
return A->load(i, j + 1) + A->load(i, j + 2);
});
- LoopNest l({B}, {A, B});
+ LoopNest l(std::vector<Tensor*>({B}), {A, B});
ASSERT_FALSE(l.computeInline(A->buf()));
l.prepareForCodegen();
"b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; });
Tensor* c = Compute(
"c", {{N, "n"}}, [&](const VarHandle& n) { return b->load(n) + 2.0f; });
- LoopNest nest({c}, {b, c});
+ LoopNest nest(std::vector<Tensor*>({c}), {b, c});
auto loops = nest.getAllLoopNestsWritingToBuf(b->buf())[0];
ASSERT_TRUE(LoopNest::vectorize(loops[0]));
loops = nest.getAllLoopNestsWritingToBuf(c->buf())[0];
return c->load(m, n, k) + 1;
});
- LoopNest l({d}, {c, d});
+ LoopNest l(std::vector<Tensor*>({d}), {c, d});
MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
return c->load(m, n, k) + 1;
});
- LoopNest l({d}, {c, d});
+ LoopNest l(std::vector<Tensor*>({d}), {c, d});
l.computeInline(c->buf());
MemDependencyChecker analyzer({a_buf.data(), b_buf.data()}, {d->buf()});
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}});
- LoopNest l({d}, {c, d});
+ LoopNest l(std::vector<Tensor*>({d}), {c, d});
MemDependencyChecker analyzer({a.data(), b.data()}, {d->buf()});
[&](const VarHandle& l, const VarHandle& n) {
return c->load(l, n) * a.load(l, n);
});
- LoopNest loop({d}, {c, d});
+ LoopNest loop(std::vector<Tensor*>({d}), {c, d});
loop.prepareForCodegen();
StmtPtr s = loop.root_stmt();
s = IRSimplifier::simplify(s);
return b.load(l, n, m) * a.load(l, n, m);
});
Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}});
- LoopNest loop({d}, {c, d});
+ LoopNest loop(std::vector<Tensor*>({d}), {c, d});
loop.prepareForCodegen();
StmtPtr s = loop.root_stmt();
s = IRSimplifier::simplify(s);
}
}
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
// Cannot inline a reduction computation
ASSERT_FALSE(l1.computeInline(x->buf()));
}
}
}
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
LoopNest l2(l1);
l2.computeInline(x->buf());
}
}
- LoopNest l1({y}, {x, y});
+ LoopNest l1(std::vector<Tensor*>({y}), {x, y});
LoopNest l2(l1);
l2.computeInline(x->buf());
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
LoopNest l_before(l);
l_before.prepareForCodegen();
SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
LoopNest l_before(l);
l_before.prepareForCodegen();
SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
LoopNest l_before(l);
l_before.prepareForCodegen();
SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e});
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
StmtPtr d_loop = l.getLoopStmtsFor(d)[1];
l.cacheAccesses(c->buf(), "scale_local", d_loop);
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4);
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
ForPtr inner;
return b.load(0, 0, l) * d->load(l);
});
- LoopNest l({e}, {c, d, e});
+ LoopNest l(std::vector<Tensor*>({e}), {c, d, e});
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
ForPtr inner;
// Creating a loop nest is quite simple: we just need to specify a list of
// output tensors and a list of all tensors:
- // NOLINTNEXTLINE(bugprone-argument-comment)
- LoopNest loopnest(/*outputs=*/{Y}, /*all=*/{X, Y});
+ std::vector<Tensor*> outputs = {Y};
+ std::vector<Tensor*> all = {X, Y};
+ LoopNest loopnest(outputs, all);
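// At this point loopnest.root_stmt() holds the naive nest, conceptually
// (hedged sketch; actual printed IR and extents depend on how X and Y were
// defined above):
//   for (i)
//     for (j)
//       Y[i, j] = f(X[i, j]);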
// The IR used in LoopNest is based on tensor statements, represented by the
// `Stmt` class. Statements are used to specify the loop nest structure, and
StmtPtr mutate(StorePtr v) override {
BufPtr buf = v->buf();
- StorePtr orig = const_cast<StorePtr>(v); // NOLINT
// Thread locals never need to be atomic.
if (thread_local_bufs_.count(buf->base_handle()) != 0) {
- return orig;
+ return v;
}
ScalarType dtype = v->value()->dtype().scalar_type();
if (dtype != ScalarType::Float && dtype != ScalarType::Double) {
- return orig;
+ return v;
}
AddPtr add_v = to<Add>(v->value());
if (!add_v) {
- return orig;
+ return v;
}
LoadPtr load_v = to<Load>(add_v->lhs());
if (!load_v) {
- return orig;
+ return v;
}
if (v->base_handle() != load_v->base_handle()) {
- return orig;
+ return v;
}
if (v->indices().empty() && load_v->indices().empty()) {
- return orig;
+ return v;
}
bool index_equal = CheckEqual(v->flat_index(), load_v->flat_index());
if (!index_equal) {
- return orig;
+ return v;
}
// TODO: this checks that the metavars occur directly as an index, but this
if (vars_to_find.empty()) {
// All metavars accounted for.
- return orig;
+ return v;
}
return alloc<AtomicAdd>(buf, v->indices(), add_v->rhs());
}
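// Effect of this mutator, as a sketch: a floating-point reduction store whose
// index is not fully covered by the thread/block metavars, e.g.
//   sum[i] = sum[i] + x;
// is rewritten into a hardware atomic so concurrent GPU threads cannot race:
//   AtomicAdd(sum, {i}, x);
// The early returns above leave thread-local buffers, non-float dtypes, and
// non-reduction stores untouched.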
StmtPtr mutate(BlockPtr v) override {
- BlockPtr v1 = const_cast<BlockPtr>(v); // NOLINT
- assert(v1);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
- std::list<StmtPtr> stmts = v1->stmts();
+ std::list<StmtPtr> stmts = v->stmts();
for (StmtPtr stmt : stmts) {
PushList();
StmtPtr stmt_new = stmt->accept_mutator(this);
- AddMemLoadsFromList(v1, stmt);
+ AddMemLoadsFromList(v, stmt);
PopList();
if (stmt_new == stmt) {
continue;
}
- v1->replace_stmt(stmt, stmt_new);
+ v->replace_stmt(stmt, stmt_new);
}
- return v1;
+ return v;
}
ExprPtr mutate(IfThenElsePtr v) override {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<StmtPtr> stmts;
for (auto& v : innerSegments) {
- for (auto* s : v.stmts()) {
+ for (auto s : v.stmts()) {
stmts.push_back(s);
}
}
inserted_half_casts_.insert(new_val);
}
- return alloc<Store>(v->buf(), v->indices(), new_val);
+ v->set_value(new_val);
+ return v;
}
ExprPtr mutate(HalfImmPtr v) override {
DependencySet getAllReadsWithin(StmtOrExprPtr v) {
DependencySet reads;
auto insertAllReads = [&](const auto& nodes) {
- for (auto* l : nodes) {
+ for (auto l : nodes) {
auto bound = exprToAccess_.equal_range(l);
for (auto it = bound.first; it != bound.second; ++it) {
if (it->second->isRead()) {
// writes just Store currently.
auto stores = NodeFinder<Store>::find(v);
- for (auto* s : stores) {
+ for (auto s : stores) {
auto bound = stmtToAccess_.equal_range(s);
for (auto it = bound.first; it != bound.second; ++it) {
if (it->second->isWrite()) {
namespace jit {
namespace tensorexpr {
-Tensor* computeSum(
+TORCH_API Tensor* computeSum(
const std::vector<ArgValue>& inputs,
const c10::optional<ScalarType>& outputType);
-Tensor* computeMean(
+TORCH_API Tensor* computeMean(
const std::vector<ArgValue>& inputs,
const std::vector<ExprHandle>& outputShape,
const c10::optional<ScalarType>& outputType);
-Tensor* computeAdaptiveAvgPool2d(
+TORCH_API Tensor* computeAdaptiveAvgPool2d(
const std::vector<ArgValue>& inputs,
const std::vector<ExprHandle>& outputShape,
const c10::optional<ScalarType>& outputType);
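// TORCH_API marks these declarations for export across the shared-library
// boundary so code outside libtorch (e.g. the tensorexpr tests) can link
// against them. Simplified sketch of the mechanism (the real macro lives in
// c10/macros/Export.h and also handles the import side when consuming the
// library on Windows):
//   #if defined(_WIN32)
//   #define TORCH_API __declspec(dllexport)
//   #else
//   #define TORCH_API __attribute__((visibility("default")))
//   #endif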