From 95a323aaa148edb53f9308fd82d0d08638fdb0a4 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Thu, 9 May 2019 21:29:16 -0700
Subject: [PATCH] [codegen] heterogeneous build for c++ (#3144)

* heterogeneous build for c++

* merge relay buildmodule to codegen build

* use module split

* use target_host

* remove sse3

* retrigger ci
---
 include/tvm/build_module.h                  |  29 ++++++
 src/codegen/build_module.cc                 | 126 ++++++++++++++++++++------
 src/relay/backend/build_module.cc           |  51 +----------
 tests/cpp/build_module_test.cc              | 132 ++++++++++++++++++++++++++++
 tests/python/relay/test_cpp_build_module.py |   2 +-
 5 files changed, 262 insertions(+), 78 deletions(-)

diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 334fe16..208f086 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -371,6 +371,35 @@ TVM_DLL runtime::Module build(const Array<LoweredFunc>& funcs,
                               const Target& target_host,
                               const BuildConfig& config);
 
+/*!
+ * \brief Build a device and host module from a map that contains
+ * (target, list of lowered functions) pairs. This function is used for
+ * heterogeneous builds.
+ * \param input The map that contains (target, lowered functions) pairs.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<Target, Array<LoweredFunc>>& input,
+                              const Target& target_host,
+                              const BuildConfig& config);
+
+/*!
+ * \brief Build a device and host module from a map that contains
+ * (target string, list of lowered functions) pairs. This function is used
+ * for heterogeneous builds.
+ * \param input The map that contains (target string, lowered functions)
+ * pairs.
+ * \param target_host The target for building host code. To use the default,
+ * pass Target().
+ * \param config The build configuration.
+ * \return The built module that contains code for different processors.
+ */
+TVM_DLL runtime::Module build(const Map<std::string, Array<LoweredFunc>>& input,
+                              const Target& target_host,
+                              const BuildConfig& config);
+
 class GenericFuncNode;
 
 /*!
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 01ebcac..57e300f 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -428,20 +428,19 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
                                                 const Target& target_host,
                                                 const BuildConfig& config) {
   std::unordered_set<std::string> all_names;
-  for (const auto &x : funcs) {
-    CHECK(all_names.count(x->name) == 0) << "Duplicate function name " << x->name;
+  for (const auto& x : funcs) {
+    CHECK(all_names.count(x->name) == 0)
+        << "Duplicate function name " << x->name;
     all_names.insert(x->name);
   }
 
-  auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
-
   Array<LoweredFunc> fhost;
   Array<LoweredFunc> fdevice;
 
   for (const auto& x : funcs) {
     CHECK(ir::VerifyMemory(x, target->device_type))
-        << "Direct host side access to device memory is detected in " << x->func_name()
-        << ". Did you forget to bind?";
+        << "Direct host side access to device memory is detected in "
+        << x->func_name() << ". Did you forget to bind?";
     if (x->func_type == kMixedFunc) {
       auto func = x;
 
@@ -450,6 +449,7 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
       }
 
       func = ir::ThreadSync(func, "shared");
+      func = ir::ThreadSync(func, "warp");
       func = ir::LowerThreadAllreduce(func, target->thread_warp_size);
       auto fsplits = ir::SplitHostDevice(func);
       fhost.push_back(fsplits[0]);
@@ -465,12 +465,32 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
     }
   }
 
+  for (size_t i = 0; i < fdevice.size(); i++) {
+    auto warp_size = target->thread_warp_size;
+    auto func = fdevice[i];
+    func = ir::LowerWarpMemory(fdevice[i], warp_size);
+    fdevice.Set(i, func);
+  }
+
   auto keys = target->keys();
-  bool target_is_gpu =
-    std::find(keys.begin(), keys.end(), "gpu") != keys.end();
+  bool target_is_gpu = std::find(keys.begin(), keys.end(), "gpu") != keys.end();
   if (target_is_gpu && fdevice.size() == 0) {
-    LOG(WARNING) << "Specified target " + target->str() +
-      " but cannot find device code. Did you forget to bind?";
+    LOG(WARNING) << "Specified target "
+                 << target->str()
+                 << " but cannot find device code. Did you forget to bind?";
+  }
+
+  for (size_t i = 0; i < fdevice.size(); ++i) {
+    auto func = fdevice[i];
+    func = ir::LowerIntrin(func, target->target_name);
+    fdevice.Set(i, func);
+  }
+
+  if (target->device_type == target::llvm()->device_type &&
+      target_host == target) {
+    CHECK(fdevice.empty()) << "No device code should be generated when target "
+                           << "and host_target are both llvm targets."
+                           << "\n";
   }
 
   for (size_t i = 0; i < fhost.size(); ++i) {
@@ -480,41 +500,91 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
     fhost.Set(i, func);
   }
 
-
-  for (size_t i = 0; i < fdevice.size(); ++i) {
-    auto func = fdevice[i];
-    func = ir::LowerIntrin(func, target->target_name);
-    fdevice.Set(i, func);
-  }
-
   for (size_t i = 0; i < fhost.size(); ++i) {
     auto func = fhost[i];
-    func = ir::LowerIntrin(func, target_host_val->target_name);
+    func = ir::LowerIntrin(func, target_host->target_name);
     func = ir::CombineContextCall(func);
     fhost.Set(i, func);
   }
   return {fhost, fdevice};
 }
 
-runtime::Module build(const Array<LoweredFunc>& funcs,
-                      const Target& target,
+// Create a module for a specific device (target). A null module is returned
+// if there is no device code to build.
+runtime::Module DeviceBuild(const Array<LoweredFunc>& fdevice,
+                            const Target& target) {
+  if (!fdevice.empty()) {
+    return codegen::Build(fdevice, target->str());
+  } else {
+    return runtime::Module(nullptr);
+  }
+}
+
+// Build for heterogeneous execution.
+runtime::Module build(const Map<Target, Array<LoweredFunc>>& inputs,
                       const Target& target_host,
                       const BuildConfig& config) {
-  auto target_host_val = target_host.defined() ? target_host : DefaultTargetHost(target);
-  auto host_dev_funcs = split_dev_host_funcs(funcs, target, target_host, config);
-  auto& fhost = host_dev_funcs[0];
-  auto& fdevice = host_dev_funcs[1];
+  Array<LoweredFunc> fhost_all;
+  std::vector<runtime::Module> device_modules;
+
+  Target target_host_val = target_host;
+  if (!target_host.defined()) {
+    for (const auto& it : inputs) {
+      if (it.first->device_type == kDLCPU) {
+        target_host_val = it.first;
+        break;
+      }
+    }
+  }
 
-  auto mhost = codegen::Build(fhost, target_host_val->str());
+  if (!target_host_val.defined()) {
+    target_host_val = DefaultTargetHost(target_host_val);
+  }
 
-  if (fdevice.size() > 0) {
-    auto mdev = codegen::Build(fdevice, target->str());
-    mhost.Import(mdev);
+  for (const auto& it : inputs) {
+    auto host_dev_funcs =
+        split_dev_host_funcs(it.second, it.first, target_host_val, config);
+    auto& fhost = host_dev_funcs[0];
+    auto& fdevice = host_dev_funcs[1];
+    // Get the module for a certain target.
+    runtime::Module mdev = DeviceBuild(fdevice, it.first);
+    for (const auto& it : fhost) {
+      fhost_all.push_back(it);
+    }
+    device_modules.push_back(mdev);
   }
 
+  runtime::Module mhost = codegen::Build(fhost_all, target_host_val->str());
+  // Import all modules
+  for (const auto& it : device_modules) {
+    if (it.operator->()) {
+      mhost.Import(it);
+    }
+  }
   return mhost;
 }
 
+// Build for heterogeneous execution when target is a string.
+runtime::Module build(const Map<std::string, Array<LoweredFunc>>& inputs,
+                      const Target& target_host,
+                      const BuildConfig& config) {
+  Map<Target, Array<LoweredFunc>> updated_input;
+  for (const auto& it : inputs) {
+    auto target = Target::create(it.first);
+    updated_input.Set(target, it.second);
+  }
+  return build(updated_input, target_host, config);
+}
+
+// Build for homogeneous execution.
+runtime::Module build(const Array<LoweredFunc>& funcs,
+                      const Target& target,
+                      const Target& target_host,
+                      const BuildConfig& config) {
+  Map<Target, Array<LoweredFunc>> inputs = {{target, funcs}};
+  return build(inputs, target_host, config);
+}
+
 BuildConfig build_config() {
   return BuildConfig(make_node<BuildConfigNode>());
 }
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index 564715c..08a88d5 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -601,52 +601,6 @@ class RelayBuildModule : public runtime::ModuleNode {
     }
     return func;
   }
-  /*!
-   * \brief Build module given lowered functions for each target
-   *
-   * \param lowered_funcs target_str -> Array<LoweredFunc> map
-   * \param targets Targets map
-   * \param cfg Building configuration
-   */
-  void BuildModule(const Map<std::string, Array<LoweredFunc> >& lowered_funcs,
-                   const Map& targets,
-                   const BuildConfig& cfg) {
-    auto target_host = Target::create(cfg_.fallback_device);
-    for (const auto& kv : lowered_funcs) {
-      std::unordered_set<std::string> fname_set;
-      for (auto f : kv.second) {
-        if (fname_set.count(f->name)) {
-          LOG(FATAL) << "Duplicate function name "
-                     << f->name;
-        }
-        fname_set.insert(f->name);
-      }
-    }
-    std::unordered_map<std::string, Target> target_map;
-    for (const auto& kv : lowered_funcs) {
-      target_map[kv.first] = Target::create(kv.first);
-    }
-    Array<LoweredFunc> fhost_all;
-    std::vector<runtime::Module> device_module;
-    for (const auto& kv : lowered_funcs) {
-      auto target = target_map[kv.first];
-      auto host_dev_funcs = split_dev_host_funcs(kv.second, target, target_host, cfg);
-      for (auto f : host_dev_funcs[0]) {
-        fhost_all.push_back(f);
-      }
-      if (host_dev_funcs[1].size()) {
-        auto mdev = codegen::Build(host_dev_funcs[1], target->str());
-        device_module.push_back(mdev);
-      }
-    }
-
-    auto mhost = codegen::Build(fhost_all, target_host->str());
-
-    for (auto mdev : device_module) {
-      mhost.Import(mdev);
-    }
-    ret_.mod = mhost;
-  }
 
   /*!
    * \brief Build relay function to runtime module
@@ -686,9 +640,8 @@
     ret_.graph_json = graph_codegen_->GetJSON();
     ret_.params = graph_codegen_->GetParams();
 
-    BuildModule(graph_codegen_->GetLoweredFunc(),
-                device_target,
-                tvm_cfg_);
+    auto target_host = Target::create(target_host_);
+    ret_.mod = tvm::build(graph_codegen_->GetLoweredFunc(), target_host, tvm_cfg_);
   }
 
  protected:
diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
index 734e457..393714d 100644
--- a/tests/cpp/build_module_test.cc
+++ b/tests/cpp/build_module_test.cc
@@ -19,10 +19,14 @@
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+#include
+
 
 TEST(BuildModule, Basic) {
   using namespace tvm;
   auto n = var("n");
@@ -56,6 +60,134 @@ TEST(BuildModule, Basic) {
   CHECK_EQ(mali_target->str(), "opencl -model=Mali-T860MP4@800Mhz -device=mali");
 }
 
+TEST(BuildModule, Heterogeneous) {
+  /* The testing network is as follows, where the element-wise add and sub
+   * ops are allocated to GPU and CPU, respectively:
+   *
+   *          A    B
+   *           \  /
+   *      elemwise_add  (gpu)
+   *              \
+   *              copy      C
+   *                \      /
+   *              elemwise_sub  (cpu)
+   */
+
+  using namespace tvm;
+  const runtime::PackedFunc* pf = runtime::Registry::Get("module._Enabled");
+  bool enabled = (*pf)("cuda");
+  if (!enabled) {
+    LOG(INFO) << "Skip heterogeneous test because cuda is not enabled."
+              << "\n";
+    return;
+  }
+
+  auto target_llvm = target::llvm();
+  auto target_cuda = target::cuda();
+
+  // The shape of input tensors.
+  const int n = 4;
+  Array<Expr> shape{n};
+
+  auto A = placeholder(shape, Float(32), "A");
+  auto B = placeholder(shape, Float(32), "B");
+  auto C = placeholder(shape, Float(32), "C");
+
+  auto elemwise_add = compute(A->shape, [&A, &B](Expr i) {
+    return A[i] + B[i];
+  }, "elemwise_add");
+
+  auto copy = placeholder(shape, Float(32), "__copy");
+  auto elemwise_sub = compute(C->shape, [&copy, &C](Expr i) {
+    return copy[i] - C[i];
+  }, "elemwise_sub");
+
+  auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add});
+  auto s2 = create_schedule({elemwise_sub->op});
+
+  auto config = build_config();
+  auto args1 = Array<Tensor>({A, B, elemwise_add});
+  auto args2 = Array<Tensor>({copy, C, elemwise_sub});
+
+  std::unordered_map<Tensor, Buffer> binds;
+  auto lowered_s1 = lower(s1, args1, "elemwise_add", binds, config);
+  auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
+  Map<Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
+                                            {target_llvm, lowered_s2}};
+  auto module = build(inputs, Target(), config);
+
+  // Assertion for build.
+  CHECK_EQ(module->imports().size(), 1);
+
+  // Execute the graph and check the correctness.
+  // Setup graph json.
+  std::string json =
+      "{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, "
+      "{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": "
+      "\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": "
+      "\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": "
+      "\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": "
+      "{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": "
+      "\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": "
+      "\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", "
+      "\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", "
+      "\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", "
+      "\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], "
+      "\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], "
+      "\"heads\": [[5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", [3, "
+      "4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], "
+      "[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": "
+      "[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", "
+      "[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", "
+      "\"float32\"]]}}";
+
+  // Setup inputs.
+  auto a_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto b_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto c_val =
+      runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
+
+  auto pa = (float*)a_val.ToDLPack()->dl_tensor.data;
+  auto pb = (float*)b_val.ToDLPack()->dl_tensor.data;
+  auto pc = (float*)c_val.ToDLPack()->dl_tensor.data;
+
+  // Assign values.
+  for (int i = 0; i < n; i++) {
+    pa[i] = i;
+    pb[i] = i + 1.0;
+    pc[i] = i - 1.0;
+  }
+
+  // Initialize graph runtime.
+  int cpu_dev_ty = static_cast<int>(kDLCPU);
+  int cpu_dev_id = 0;
+  int gpu_dev_ty = static_cast<int>(kDLGPU);
+  int gpu_dev_id = 0;
+
+  const runtime::PackedFunc* graph_runtime =
+      tvm::runtime::Registry::Get("tvm.graph_runtime.create");
+  runtime::Module mod = (*graph_runtime)(
+      json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id);
+
+  PackedFunc set_input = mod.GetFunction("set_input", false);
+  PackedFunc run = mod.GetFunction("run", false);
+  PackedFunc get_output = mod.GetFunction("get_output", false);
+  set_input("A", a_val);
+  set_input("B", b_val);
+  set_input("C", c_val);
+
+  run();
+  tvm::runtime::NDArray out = get_output(0);
+  float* p_out = (float*)out.ToDLPack()->dl_tensor.data;
+
+  // Check correctness.
+  for (int i = 0; i < n; ++i) {
+    CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5);
+  }
+}
 
 int main(int argc, char ** argv) {
   testing::InitGoogleTest(&argc, argv);
diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py
index c69d877..b94f57d 100644
--- a/tests/python/relay/test_cpp_build_module.py
+++ b/tests/python/relay/test_cpp_build_module.py
@@ -89,7 +89,7 @@ def test_build():
         tgt: tgt
     }
     m_bld.set_opt_level(3)
-    m_bld.build(func, targets, "llvm -mcpu=sse3", params=params)
+    m_bld.build(func, targets, "llvm", params=params)
     g_json = m_bld.get_json()
     mmod = m_bld.get_module()
     params = m_bld.get_params()
-- 
2.7.4
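
Usage note (illustrative sketch, not part of the patch): the string-keyed overload declared in include/tvm/build_module.h can be exercised the same way as the Target-keyed one in TEST(BuildModule, Heterogeneous) above, reusing its lowered_s1/lowered_s2 arrays. The variable names below are hypothetical.

  // Same lowered functions as in the test, now keyed by target strings.
  // build() calls Target::create() on each key; passing Target() as the
  // host target makes it pick the first CPU target found in the map
  // (here "llvm"), falling back to DefaultTargetHost() if none exists.
  Map<std::string, Array<LoweredFunc>> str_inputs = {{"cuda", lowered_s1},
                                                     {"llvm", lowered_s2}};
  runtime::Module str_mod = tvm::build(str_inputs, Target(), build_config());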