const Target& target_host,
const BuildConfig& config);
+/*!
+ * \brief Build a device and host module from a map that associates each
+ * target with the list of lowered functions to be compiled for it. This
+ * overload is used for heterogeneous builds.
+ * \param input Map from a target to the list of lowered functions that are
+ * built for that target.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<Target, Array<LoweredFunc>>& input,
+ const Target& target_host,
+ const BuildConfig& config);
+
+/*!
+ * \brief Build a device and host module from a map that associates each
+ * target string with the list of lowered functions to be compiled for it.
+ * This overload is used for heterogeneous builds when the targets are given
+ * as strings.
+ * \param input Map from a target string to the list of lowered functions that
+ * are built for that target.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<std::string, Array<LoweredFunc>>& input,
+ const Target& target_host,
+ const BuildConfig& config);
+
class GenericFuncNode;
/*!
const Target& target_host,
const BuildConfig& config) {
std::unordered_set<std::string> all_names;
- for (const auto &x : funcs) {
- CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name;
+ for (const auto& x : funcs) {
+ CHECK(all_names.count(x->name) == 0)
+ << "Duplicate function name " << x->name;
all_names.insert(x->name);
}
- auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
-
Array<LoweredFunc> fhost;
Array<LoweredFunc> fdevice;
for (const auto& x : funcs) {
CHECK(ir::VerifyMemory(x, target->device_type))
- << "Direct host side access to device memory is detected in " << x->func_name()
- << ". Did you forget to bind?";
+ << "Direct host side access to device memory is detected in "
+ << x->func_name() << ". Did you forget to bind?";
if (x->func_type == kMixedFunc) {
auto func = x;
}
func = ir::ThreadSync(func, "shared");
+ func = ir::ThreadSync(func, "warp");
func = ir::LowerThreadAllreduce(func, target->thread_warp_size);
auto fsplits = ir::SplitHostDevice(func);
fhost.push_back(fsplits[0]);
}
}
+ for (size_t i = 0; i < fdevice.size(); i++) {
+ auto warp_size = target->thread_warp_size;
+ auto func = fdevice[i];
+ func = ir::LowerWarpMemory(fdevice[i], warp_size);
+ fdevice.Set(i, func);
+ }
+
auto keys = target->keys();
- bool target_is_gpu =
- std::find(keys.begin(), keys.end(), "gpu") != keys.end();
+ bool target_is_gpu = std::find(keys.begin(), keys.end(), "gpu") != keys.end();
if (target_is_gpu && fdevice.size() == 0) {
- LOG(WARNING) << "Specified target " + target->str() +
- " but cannot find device code. Did you forget to bind?";
+ LOG(WARNING) << "Specified target "
+ << target->str()
+ << " but cannot find device code. Did you forget to bind?";
+ }
+
+ for (size_t i = 0; i < fdevice.size(); ++i) {
+ auto func = fdevice[i];
+ func = ir::LowerIntrin(func, target->target_name);
+ fdevice.Set(i, func);
+ }
+
+ if (target->device_type == target::llvm()->device_type &&
+ target_host == target) {
+ CHECK(fdevice.empty()) << "No device code should be generated when target "
+ << "and host_target are both llvm target."
+ << "\n";
}
for (size_t i = 0; i < fhost.size(); ++i) {
fhost.Set(i, func);
}
-
- for (size_t i = 0; i < fdevice.size(); ++i) {
- auto func = fdevice[i];
- func = ir::LowerIntrin(func, target->target_name);
- fdevice.Set(i, func);
- }
-
for (size_t i = 0; i < fhost.size(); ++i) {
auto func = fhost[i];
- func = ir::LowerIntrin(func, target_host_val->target_name);
+ func = ir::LowerIntrin(func, target_host->target_name);
func = ir::CombineContextCall(func);
fhost.Set(i, func);
}
return {fhost, fdevice};
}
-runtime::Module build(const Array<LoweredFunc>& funcs,
- const Target& target,
+// Compile the device functions of a single target into a runtime module.
+// When there is nothing to compile, a module constructed from nullptr is
+// returned instead so that callers can detect and skip it.
+runtime::Module DeviceBuild(const Array<LoweredFunc>& fdevice,
+                            const Target& target) {
+  if (fdevice.empty()) {
+    return runtime::Module(nullptr);
+  }
+  return codegen::Build(fdevice, target->str());
+}
+
+/* Build for heterogeneous execution.
+ *
+ * Every (target, functions) entry of `inputs` is split into host and device
+ * functions by split_dev_host_funcs. The device functions of each target are
+ * compiled into their own module through DeviceBuild, while the host
+ * functions of all targets are gathered and compiled into a single host
+ * module. The device modules are then imported into that host module, which
+ * is returned.
+ *
+ * If `target_host` is undefined, the first target met while iterating
+ * `inputs` whose device_type is kDLCPU is used as the host target. If no
+ * such target exists, the result of DefaultTargetHost is used.
+ */
+runtime::Module build(const Map<Target, Array<LoweredFunc>>& inputs,
const Target& target_host,
const BuildConfig& config) {
- auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
- auto host_dev_funcs = split_dev_host_funcs(funcs, target, target_host, config);
- auto& fhost = host_dev_funcs[0];
- auto& fdevice = host_dev_funcs[1];
+ Array<LoweredFunc> fhost_all;  // host functions collected from every target
+ std::vector<runtime::Module> device_modules;  // one entry per target
+
+ Target target_host_val = target_host;
+ if (!target_host.defined()) {  // no explicit host: look for a CPU target
+ for (const auto& it : inputs) {
+ if (it.first->device_type == kDLCPU) {
+ target_host_val = it.first;
+ break;
+ }
+ }
+ }
- auto mhost = codegen::Build(fhost, target_host_val->str());
+ if (!target_host_val.defined()) {  // still undefined: use the default host
+ target_host_val = DefaultTargetHost(target_host_val);
+ }
- if (fdevice.size() > 0) {
- auto mdev = codegen::Build(fdevice, target->str());
- mhost.Import(mdev);
+ for (const auto& it : inputs) {
+ auto host_dev_funcs =
+ split_dev_host_funcs(it.second, it.first, target_host_val, config);
+ auto& fhost = host_dev_funcs[0];
+ auto& fdevice = host_dev_funcs[1];
+ // Build the device module for this target (built from nullptr by DeviceBuild when fdevice is empty).
+ runtime::Module mdev = DeviceBuild(fdevice, it.first);
+ for (const auto& it : fhost) {  // NOTE: this `it` shadows the outer map entry
+ fhost_all.push_back(it);
+ }
+ device_modules.push_back(mdev);
}
+ runtime::Module mhost = codegen::Build(fhost_all, target_host_val->str());
+ // Import the device modules, skipping those whose operator->() yields a null pointer.
+ for (const auto& it : device_modules) {
+ if (it.operator->()) {
+ mhost.Import(it);
+ }
+ }
return mhost;
}
+// Heterogeneous build keyed by target strings: each key is converted to a
+// Target with Target::create and the resulting map is forwarded to the
+// Target-keyed overload.
+runtime::Module build(const Map<std::string, Array<LoweredFunc>>& inputs,
+                      const Target& target_host,
+                      const BuildConfig& config) {
+  Map<Target, Array<LoweredFunc>> target_to_funcs;
+  for (const auto& kv : inputs) {
+    target_to_funcs.Set(Target::create(kv.first), kv.second);
+  }
+  return build(target_to_funcs, target_host, config);
+}
+
+// Homogeneous build: wrap the single (target, funcs) pair into a one-entry
+// map and reuse the heterogeneous build.
+runtime::Module build(const Array<LoweredFunc>& funcs,
+                      const Target& target,
+                      const Target& target_host,
+                      const BuildConfig& config) {
+  Map<Target, Array<LoweredFunc>> single_target_input;
+  single_target_input.Set(target, funcs);
+  return build(single_target_input, target_host, config);
+}
+
BuildConfig build_config() {
return BuildConfig(make_node<BuildConfigNode>());
}
}
return func;
}
- /*!
- * \brief Build module given lowered functions for each target
- *
- * \param lowered_funcs target_str -> Array<LoweredFunc> map
- * \param targets Targets map
- * \param cfg Building configuration
- */
- void BuildModule(const Map<std::string, Array<LoweredFunc> >& lowered_funcs,
- const Map<HalideIR::Expr, HalideIR::Expr>& targets,
- const BuildConfig& cfg) {
- auto target_host = Target::create(cfg_.fallback_device);
- for (const auto& kv : lowered_funcs) {
- std::unordered_set<std::string> fname_set;
- for (auto f : kv.second) {
- if (fname_set.count(f->name)) {
- LOG(FATAL) << "Duplicate function name "
- << f->name;
- }
- fname_set.insert(f->name);
- }
- }
- std::unordered_map<std::string, Target> target_map;
- for (const auto& kv : lowered_funcs) {
- target_map[kv.first] = Target::create(kv.first);
- }
- Array<LoweredFunc> fhost_all;
- std::vector<runtime::Module> device_module;
- for (const auto& kv : lowered_funcs) {
- auto target = target_map[kv.first];
- auto host_dev_funcs = split_dev_host_funcs(kv.second, target, target_host, cfg);
- for (auto f : host_dev_funcs[0]) {
- fhost_all.push_back(f);
- }
- if (host_dev_funcs[1].size()) {
- auto mdev = codegen::Build(host_dev_funcs[1], target->str());
- device_module.push_back(mdev);
- }
- }
-
- auto mhost = codegen::Build(fhost_all, target_host->str());
-
- for (auto mdev : device_module) {
- mhost.Import(mdev);
- }
- ret_.mod = mhost;
- }
/*!
* \brief Build relay function to runtime module
ret_.graph_json = graph_codegen_->GetJSON();
ret_.params = graph_codegen_->GetParams();
- BuildModule(graph_codegen_->GetLoweredFunc(),
- device_target,
- tvm_cfg_);
+ auto target_host = Target::create(target_host_);
+ ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host, tvm_cfg_);
}
protected:
#include <dmlc/logging.h>
#include <gtest/gtest.h>
+#include <topi/cuda/injective.h>
#include <tvm/tvm.h>
#include <tvm/operation.h>
#include <tvm/build_module.h>
+#include <string>
+#include <cmath>
+
TEST(BuildModule, Basic) {
using namespace tvm;
auto n = var("n");
CHECK_EQ(mali_target->str(), "opencl -model=Mali-T860MP4@800Mhz -device=mali");
}
+TEST(BuildModule, Heterogeneous) {
+  /* The testing network is like following, where the element-wise add and sub
+   * ops are allocated to GPU and CPU, respectively:
+   *
+   *          A    B
+   *           \  /
+   *      elemwise_add  (gpu)
+   *              \
+   *      copy      C
+   *       \      /
+   *      elemwise_sub  (cpu)
+   */
+
+  using namespace tvm;
+  // Registry::Get hands back a pointer; verify it before dereferencing so a
+  // missing registration fails with a message rather than a crash.
+  const runtime::PackedFunc* pf = runtime::Registry::Get("module._Enabled");
+  CHECK(pf != nullptr) << "module._Enabled is not registered.";
+  bool enabled = (*pf)("cuda");
+  if (!enabled) {
+    LOG(INFO) << "Skip heterogeneous test because cuda is not enabled."
+              << "\n";
+    return;
+  }
+
+  auto target_llvm = target::llvm();
+  auto target_cuda = target::cuda();
+
+  // The shape of input tensors.
+  const int n = 4;
+  Array<Expr> shape{n};
+
+  auto A = placeholder(shape, Float(32), "A");
+  auto B = placeholder(shape, Float(32), "B");
+  auto C = placeholder(shape, Float(32), "C");
+
+  auto elemwise_add = compute(A->shape, [&A, &B](Expr i) {
+    return A[i] + B[i];
+  }, "elemwise_add");
+
+  // "copy" stands for the result of elemwise_add after it has been copied
+  // to the device that runs elemwise_sub.
+  auto copy = placeholder(shape, Float(32), "__copy");
+  auto elemwise_sub = compute(C->shape, [&copy, &C](Expr i) {
+    return copy[i] - C[i];
+  }, "elemwise_sub");
+
+  auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add});
+  auto s2 = create_schedule({elemwise_sub->op});
+
+  auto config = build_config();
+  auto args1 = Array<Tensor>({A, B, elemwise_add});
+  auto args2 = Array<Tensor>({copy, C, elemwise_sub});
+
+  std::unordered_map<Tensor, Buffer> binds;
+  auto lowered_s1 = lower(s1, args1, "elemwise_add", binds, config);
+  auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
+  Map<tvm::Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
+                                                 {target_llvm, lowered_s2}};
+  auto module = build(inputs, Target(), config);
+
+  // Assertion for build.
+  CHECK_EQ(module->imports().size(), 1);
+
+  // Execute the graph and check the correctness.
+  // Setup graph json.
+  std::string json =
+      "{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, "
+      "{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": "
+      "\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": "
+      "\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": "
+      "\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": "
+      "{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": "
+      "\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": "
+      "\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", "
+      "\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", "
+      "\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], "
+      "\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], "
+      "\"heads\": [[5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", [3, "
+      "4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], "
+      "[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": "
+      "[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", "
+      "[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", "
+      "\"float32\"]]}}";
+
+  // Setup inputs.
+  auto a_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto b_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto c_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+
+  // Read the data pointer straight from the underlying DLTensor. Going
+  // through ToDLPack() would create a DLManagedTensor whose deleter has to
+  // be invoked by the caller, which this test never did.
+  auto pa = static_cast<float*>(a_val->data);
+  auto pb = static_cast<float*>(b_val->data);
+  auto pc = static_cast<float*>(c_val->data);
+
+  // Assign values.
+  for (int i = 0; i < n; i++) {
+    pa[i] = i;
+    pb[i] = i + 1.0;
+    pc[i] = i - 1.0;
+  }
+
+  // Initialize graph runtime.
+  int cpu_dev_ty = static_cast<int>(kDLCPU);
+  int cpu_dev_id = 0;
+  int gpu_dev_ty = static_cast<int>(kDLGPU);
+  int gpu_dev_id = 0;
+
+  const runtime::PackedFunc* graph_runtime =
+      tvm::runtime::Registry::Get("tvm.graph_runtime.create");
+  CHECK(graph_runtime != nullptr)
+      << "tvm.graph_runtime.create is not registered.";
+  runtime::Module mod = (*graph_runtime)(
+      json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id);
+
+  PackedFunc set_input = mod.GetFunction("set_input", false);
+  PackedFunc run = mod.GetFunction("run", false);
+  PackedFunc get_output = mod.GetFunction("get_output", false);
+  set_input("A", a_val);
+  set_input("B", b_val);
+  set_input("C", c_val);
+
+  run();
+  tvm::runtime::NDArray out = get_output(0);
+  float* p_out = static_cast<float*>(out->data);
+
+  // Check correctness: out = (A + B) - C.
+  for (int i = 0; i < n; ++i) {
+    CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5);
+  }
+}
int main(int argc, char ** argv) {
testing::InitGoogleTest(&argc, argv);
tgt: tgt
}
m_bld.set_opt_level(3)
- m_bld.build(func, targets, "llvm -mcpu=sse3", params=params)
+ m_bld.build(func, targets, "llvm", params=params)
g_json = m_bld.get_json()
mmod = m_bld.get_module()
params = m_bld.get_params()