[BACKEND][CODEGEN] C codegen with tests (#2161)
author Pratyush Patel <pratyushpatel.1995@gmail.com>
Wed, 28 Nov 2018 18:32:31 +0000 (10:32 -0800)
committer Yuwei Hu <huyuwei1995@gmail.com>
Wed, 28 Nov 2018 18:32:31 +0000 (13:32 -0500)
* Implement C code generation with tests

* Code cleanup

* Implement C code generation with tests

* Code cleanup

* tabs to spaces

* make lint compliant

* update export_library and reserve unique C keywords

* move ReserveKeywordsAsUnique to codegen_c

* some documentation and code cleanup

* use tvm.contrib.util for tempdir in testcases

python/tvm/_ffi/libinfo.py
python/tvm/_ffi/runtime_ctypes.py
python/tvm/contrib/cc.py
python/tvm/module.py
src/codegen/codegen_c.cc
src/codegen/codegen_c.h
src/codegen/codegen_c_host.cc [new file with mode: 0644]
src/codegen/codegen_c_host.h [new file with mode: 0644]
src/codegen/codegen_source_base.h
src/codegen/source_module.cc
tests/python/unittest/test_codegen_c_host.py [new file with mode: 0644]

index f911829d38b17243de8608a48ac8a9d0cf1c1f14..2fdf5aeb132ac42dfd0afada69f15c705db8fb07 100644 (file)
@@ -99,6 +99,66 @@ def find_lib_path(name=None, search_path=None, optional=False):
     return lib_found
 
 
+def find_include_path(name=None, search_path=None, optional=False):
+    """Find header files for C compilation.
+
+    Parameters
+    ----------
+    name : list of str
+        List of directory names to be searched.
+
+    Returns
+    -------
+    include_path : list(string)
+        List of all found paths to header files.
+    """
+    ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    source_dir = os.path.join(ffi_dir, "..", "..", "..")
+    install_include_dir = os.path.join(ffi_dir, "..", "..", "..", "..")
+    third_party_dir = os.path.join(source_dir, "3rdparty")
+
+    header_path = []
+
+    if os.environ.get('TVM_INCLUDE_PATH', None):
+        header_path.append(os.environ['TVM_INCLUDE_PATH'])
+
+    header_path.append(install_include_dir)
+    header_path.append(source_dir)
+    header_path.append(third_party_dir)
+
+    header_path = [os.path.abspath(x) for x in header_path]
+    if search_path is not None:
+        if search_path is list:
+            header_path = header_path + search_path
+        else:
+            header_path.append(search_path)
+    if name is not None:
+        if isinstance(name, list):
+            tvm_include_path = []
+            for n in name:
+                tvm_include_path += [os.path.join(p, n) for p in header_path]
+        else:
+            tvm_include_path = [os.path.join(p, name) for p in header_path]
+        dlpack_include_path = []
+    else:
+        tvm_include_path = [os.path.join(p, 'include') for p in header_path]
+        dlpack_include_path = [os.path.join(p, 'dlpack/include') for p in header_path]
+
+        # try to find include path
+        include_found = [p for p in tvm_include_path if os.path.exists(p) and os.path.isdir(p)]
+        include_found += [p for p in dlpack_include_path if os.path.exists(p) and os.path.isdir(p)]
+
+    if not include_found:
+        message = ('Cannot find the files.\n' +
+                   'List of candidates:\n' +
+                   str('\n'.join(tvm_include_path + dlpack_include_path)))
+        if not optional:
+            raise RuntimeError(message)
+        return None
+
+    return include_found
+
+
 # current version
 # We use the version of the incoming release for code
 # that is under development.
index b17487559e50489bfae0b6ce14fde1eeddd6ede2..ef5316b5e2677c5b983ea17d1d1192e246e63b7a 100644 (file)
@@ -118,6 +118,7 @@ class TVMContext(ctypes.Structure):
         'llvm': 1,
         'stackvm': 1,
         'cpu': 1,
+        'c': 1,
         'gpu': 2,
         'cuda': 2,
         'nvptx': 2,
index 0ffa6c420243cf62fa8c9d8ae2e7f89be496bf65..0361f594de6ab24821cd71ebfe1a27a6983789e2 100644 (file)
@@ -7,6 +7,7 @@ import os
 
 from .._ffi.base import py_str
 from .util import tempdir
+from .._ffi.libinfo import find_include_path
 
 
 def create_shared(output,
@@ -49,6 +50,7 @@ def _linux_shared(output, objects, options, cc="g++"):
         cmd += objects
     if options:
         cmd += options
+    cmd += ["-I" + path for path in find_include_path()]
     proc = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     (out, _) = proc.communicate()
index 1ca09740aff402632924db6079d79a68db28c8f4..cd919722e681959325806f2bc9ec186bd453d513 100644 (file)
@@ -97,17 +97,21 @@ class Module(ModuleBase):
             self.save(file_name)
             return
 
-        if self.type_key != "llvm":
-            raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key)
+        if not (self.type_key == "llvm" or self.type_key == "c"):
+            raise ValueError("Module[%s]: Only llvm and c support export shared" % self.type_key)
         temp = _util.tempdir()
         if fcompile is not None and hasattr(fcompile, "object_format"):
             object_format = fcompile.object_format
         else:
-            object_format = "o"
+            if self.type_key == "llvm":
+                object_format = "o"
+            else:
+                assert self.type_key == "c"
+                object_format = "cc"
         path_obj = temp.relpath("lib." + object_format)
         self.save(path_obj)
         files = [path_obj]
-        is_system_lib = self.get_function("__tvm_is_system_module")()
+        is_system_lib = self.type_key == "llvm" and self.get_function("__tvm_is_system_module")()
         if self.imported_modules:
             path_cc = temp.relpath("devc.cc")
             with open(path_cc, "w") as f:
index d902437dd99023132490a8e6ee5639af07a9c854..3624dc0403aa0f496793d165432b853b224bc9fd 100644 (file)
@@ -22,12 +22,43 @@ void CodeGenC::InitFuncState(LoweredFunc f) {
   handle_data_type_.clear();
   CodeGenSourceBase::ClearFuncState();
 }
-void CodeGenC::AddFunction(LoweredFunc f) {
-  // clear previous generated state.
-  this->InitFuncState(f);
+
+void CodeGenC::ReserveKeywordsAsUnique() {
+  // Every entry is passed to GetUniqueName once, in this order.
+  // "_" comes first: skip the first underscore, so SSA variable starts from _1.
+  static const char* const kReservedNames[] = {
+    "_", "extern", "void", "int", "float", "double", "char", "unsigned",
+    "short", "long", "if", "else", "switch", "case", "default", "for",
+    "do", "while", "goto", "register", "continue", "break", "typedef",
+    "struct", "enum", "union", "return"
+  };
+  for (const char* reserved : kReservedNames) {
+    GetUniqueName(reserved);
+  }
+}
+
+void CodeGenC::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
+  // reserve keywords
+  ReserveKeywordsAsUnique();
   // add to alloc buffer type.
   for (const auto & kv : f->handle_data_type) {
     RegisterHandleType(kv.first.get(), kv.second.type());
@@ -187,6 +218,7 @@ std::string CodeGenC::GetStructRef(
       case intrinsic::kArrNDim: os << "ndim"; break;
       case intrinsic::kArrTypeCode: os << "dtype.code"; break;
       case intrinsic::kArrTypeBits: os << "dtype.bits"; break;
+      case intrinsic::kArrByteOffset: os << "byte_offset"; break;
       case intrinsic::kArrTypeLanes: os << "dtype.lanes"; break;
       case intrinsic::kArrDeviceId: os << "ctx.device_id"; break;
       case intrinsic::kArrDeviceType: os << "ctx.device_type"; break;
@@ -834,8 +866,10 @@ void CodeGenC::VisitStmt_(const Evaluate *op) {
     }
   }
   std::string vid = this->PrintExpr(op->value);
-  this->PrintIndent();
-  this->stream << "(void)" << vid << ";\n";
+  if (vid != "") {
+    this->PrintIndent();
+    this->stream << "(void)" << vid << ";\n";
+  }
 }
 
 void CodeGenC::VisitStmt_(const ProducerConsumer *op) {
index b36e37da54fef9b7202e07c1684fe377201d1f61..c9af24a04a3cc5de321917d6da9f910e9a0a993d 100644 (file)
@@ -183,6 +183,8 @@ class CodeGenC :
   std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
   std::unordered_map<const Variable*, Type> handle_data_type_;
+  /*! \brief reserves common C keywords */
+  void ReserveKeywordsAsUnique();
 
  private:
   /*! \brief whether to print in SSA form */
diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc
new file mode 100644 (file)
index 0000000..248354d
--- /dev/null
@@ -0,0 +1,252 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_c_host.cc
+ */
+#include <tvm/packed_func_ext.h>
+#include <vector>
+#include <string>
+#include "codegen_c_host.h"
+#include "build_common.h"
+
+namespace tvm {
+namespace codegen {
+
+CodeGenCHost::CodeGenCHost()
+    // Claim a unique identifier for the module context variable up front.
+    : module_name(GetUniqueName("__tvm_module_ctx")) {
+}
+
+void CodeGenCHost::Init(bool output_ssa) {
+  // Write the runtime includes and the module context definition to the
+  // declaration stream, then run the base-class initialisation.
+  decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"
+              << "#include \"tvm/runtime/c_backend_api.h\"\n"
+              << "extern void* " << module_name << " = NULL;\n";
+  CodeGenC::Init(output_ssa);
+}
+
+void CodeGenCHost::AddFunction(LoweredFunc f) {
+  // Emits f as a TVM_DLL function returning int32_t. First, clear previous generated state.
+  this->InitFuncState(f);
+  // reserve keywords so that argument and local names cannot reuse them
+  ReserveKeywordsAsUnique();
+  // add to alloc buffer type.
+  for (const auto & kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+
+  this->stream << "#ifdef __cplusplus\n";  // C linkage is requested only when compiled as C++
+  this->stream << "extern \"C\"\n";
+  this->stream << "#endif\n";
+  this->stream << "TVM_DLL int32_t " << f->name << "(";  // start of the signature
+  for (size_t i = 0; i < f->args.size(); ++i) {
+    Var v = f->args[i];
+    std::string vid = AllocVarID(v.get());
+    if (i != 0) stream << ", ";
+    if (v.type().is_handle()) {  // handle arguments are printed as pointers
+      auto it = alloc_storage_scope_.find(v.get());
+      if (it != alloc_storage_scope_.end()) {
+        PrintStorageScope(it->second, stream);
+      }
+      stream << ' ';  // written whether or not a storage scope was printed
+
+      if (handle_data_type_.count(v.get())) {
+        PrintType(handle_data_type_.at(v.get()), stream);
+      } else {
+        stream << "void";  // no registered element type: plain void pointer
+      }
+      stream << "*";
+
+      if (f->is_restricted && restrict_keyword_.length() != 0) {
+        stream << ' ' << restrict_keyword_;
+      }
+    } else {
+      PrintType(v.type(), stream);
+    }
+    stream << ' ' << vid;
+  }
+  stream << ") {\n";
+  this->PreFunctionBody(f);
+  int func_scope = this->BeginScope();
+  this->PrintStmt(f->body);
+  this->PrintIndent();
+  this->stream << "return 0;\n";  // emitted after the body as the final statement of the function
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
+}
+
+std::string CodeGenCHost::Finish() {  // no host-specific finalisation is done here
+  return CodeGenC::Finish();  // result comes entirely from the base generator
+}
+
+void CodeGenCHost::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
+  int lanes = t.lanes();  // writes the C spelling of t to os, or aborts via LOG(FATAL)
+  if (t.is_handle()) {  // handles print as void*; only single-lane handles are accepted
+    CHECK_EQ(lanes, 1)
+        << "does not support vector types";
+    os << "void*"; return;
+  }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
+  bool fail = false;  // set when the bit width has no mapping below
+  if (t.is_float()) {
+    switch (t.bits()) {
+      case 16:
+        os << "half";
+        break;
+      case 32: os << "float"; break;
+      case 64:
+        os << "double";
+        break;
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;  // scalar: the base name is complete
+    if (!fail && (lanes >= 2 && lanes <= 16)) {  // vector: the lane count is appended to the base name
+      os << lanes; return;
+    }
+  } else if (t.is_uint() || t.is_int()) {
+    if (t.is_uint()) {
+      os << 'u';  // unsigned types reuse the signed names with a u prefix
+    }
+    switch (t.bits()) {
+      case 8: os << "int8_t"; break;
+      case 16: os << "int16_t"; break;
+      case 32: os << "int32_t"; break;
+      case 64: os << "int64_t"; break;
+      case 1: os << "int32_t"; break;  // 1-bit integers are printed as int32_t
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;
+    }
+  }
+  LOG(FATAL) << "Cannot convert type " << t << " to C type";  // unsupported width, lane count, or type class
+}
+
+void CodeGenCHost::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  // The scalar is printed once and repeated op->lanes times inside a cast
+  // to the broadcast type: ((T)(v, v, ..., v)).
+  const std::string scalar = PrintExpr(op->value);
+  os << "((";
+  PrintType(op->type, os);
+  os << ")(";
+  const char* separator = "";
+  for (int lane = 0; lane < op->lanes; ++lane) {
+    os << separator << scalar;
+    separator = ", ";
+  }
+  os << "))";
+}
+
+void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name) {
+  this->PrintIndent();
+  this->stream << "if (" << packed_func_name << " == NULL) {\n";
+  int packed_func_if_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "if (TVMBackendGetFuncFromEnv(" << module_name
+              << ", \"" << func_name << "\""
+              << ", &" << packed_func_name << ") != 0) {\n";
+  int get_func_env_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "return -1;\n";
+  this->EndScope(get_func_env_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+  this->EndScope(packed_func_if_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+
+void CodeGenCHost::PrintFuncCall(std::string packed_func_name, int num_args) {
+  this->PrintIndent();
+  std::string ret_val = GetUniqueName("ret_val");
+  std::string ret_type_code = GetUniqueName("ret_type_code");
+  this->stream << "TVMValue " << ret_val << ";\n";
+  this->PrintIndent();
+  this->stream << "int " << ret_type_code << ";\n";
+  this->PrintIndent();
+  this->stream << "if (TVMFuncCall(" << packed_func_name << ", "
+               << "(TVMValue*) stack_value" << ", " << "(int*) stack_tcode" << ", "
+               << num_args << ", " << "&" << ret_val << ", " << "&"
+               << ret_type_code << ") != 0) {\n";
+  int func_call_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "return -1;\n";
+  this->EndScope(func_call_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+
+void CodeGenCHost::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*)
+  if (op->is_intrinsic(intrinsic::tvm_stack_alloca)) {
+    std::string stack_name = GetUniqueName("stack");
+    const std::string& type = op->args[0].as<StringImm>()->value;
+    const IntImm* num = op->args[1].as<IntImm>();
+    CHECK(num != nullptr);
+    static_assert(alignof(TVMValue) % alignof(TVMArray) == 0, "invariant");
+    size_t unit = sizeof(TVMValue);
+    size_t size = 0;
+    if (type == "shape") {
+      size = (num->value * sizeof(tvm_index_t) + unit - 1) / unit;
+    } else if (type == "arg_value") {
+      size = (num->value * sizeof(TVMValue) + unit - 1) / unit;
+    } else if (type == "arg_tcode") {
+      size = (num->value * sizeof(int) + unit - 1) / unit;
+    } else if (type == "array") {
+      size = (num->value * sizeof(TVMArray) + unit - 1) / unit;
+    } else {
+      LOG(FATAL) << "Unknown stack alloca type " << type;
+    }
+    this->PrintIndent();
+    this->stream << "TVMValue " << stack_name << "[" << size << "];\n";
+    os << stack_name;
+  } else if (op->is_intrinsic(intrinsic::tvm_call_packed_lowered)) {
+    const StringImm* s = op->args[0].as<StringImm>();
+    CHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name";
+    int64_t begin = op->args[3].as<IntImm>()->value;
+    int64_t end = op->args[4].as<IntImm>()->value;
+    int64_t num_args = end - begin;
+    CHECK_GE(num_args, 0);
+    std::string func_name = s->value;
+    std::string packed_func_name = GetUniqueName(func_name + "_packed");
+    decl_stream << "static void* " << packed_func_name << " = NULL;\n";
+    this->PrintGetFuncFromBackend(func_name, packed_func_name);
+    this->PrintFuncCall(packed_func_name, num_args);
+  } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
+    this->PrintIndent();
+    this->stream << "return -1;\n";
+  } else {
+    CodeGenC::VisitExpr_(op, os);
+  }
+}
+
+void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*)
+  // Emits `if (!(cond)) { TVMAPISetLastError("msg"); return -1; }` followed
+  // by the statement body guarded by the assertion.
+  const StringImm* message = op->message.as<StringImm>();
+  CHECK(message != nullptr) << "AssertStmt expects a string literal message";
+  // The message is spliced into a C string literal, so characters that
+  // would terminate or corrupt that literal are escaped first.
+  std::string escaped;
+  escaped.reserve(message->value.size());
+  for (char c : message->value) {
+    switch (c) {
+      case '\\': escaped += "\\\\"; break;
+      case '"': escaped += "\\\""; break;
+      case '\n': escaped += "\\n"; break;
+      default: escaped += c; break;
+    }
+  }
+  std::string cond = PrintExpr(op->condition);
+  PrintIndent();
+  stream << "if (!(" << cond << ")) {\n";
+  int assert_if_scope = this->BeginScope();
+  PrintIndent();
+  stream << "TVMAPISetLastError(\"" << escaped << "\");\n";
+  PrintIndent();
+  stream << "return -1;\n";
+  this->EndScope(assert_if_scope);
+  PrintIndent();
+  stream << "}\n";
+  this->PrintStmt(op->body);
+}
+
+runtime::Module BuildCHost(Array<LoweredFunc> funcs) {
+  // Generate code for every lowered function with a single generator
+  // (SSA output disabled) and wrap the resulting text in a "c" source module.
+  CodeGenCHost codegen;
+  codegen.Init(/*output_ssa=*/false);
+  for (LoweredFunc func : funcs) {
+    codegen.AddFunction(func);
+  }
+  return CSourceModuleCreate(codegen.Finish(), "c");
+}
+
+TVM_REGISTER_API("codegen.build_c")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildCHost(args[0]);
+  });
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h
new file mode 100644 (file)
index 0000000..eb47a78
--- /dev/null
@@ -0,0 +1,40 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_c_host.h
+ * \brief Generate C host code.
+ */
+#ifndef TVM_CODEGEN_CODEGEN_C_HOST_H_
+#define TVM_CODEGEN_CODEGEN_C_HOST_H_
+
+#include <tvm/codegen.h>
+#include <tvm/packed_func_ext.h>
+#include <string>
+#include "codegen_c.h"
+
+namespace tvm {
+namespace codegen {
+
+class CodeGenCHost final : public CodeGenC {  // C source generator for host-side lowered functions
+ public:
+  CodeGenCHost();  // picks the unique name used for the module context variable
+  void Init(bool output_ssa);  // writes the runtime includes and module context, then CodeGenC::Init
+  void AddFunction(LoweredFunc f);  // emits f as a TVM_DLL function returning int32_t
+  std::string Finish();  // forwards to CodeGenC::Finish
+
+  void PrintType(Type t, std::ostream& os) final; // NOLINT(*)
+
+  // visitor overrides specific to host code generation
+  void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*)
+  void VisitExpr_(const Call *op, std::ostream& os) final; // NOLINT(*)
+  void VisitStmt_(const AssertStmt *op) final; // NOLINT(*)
+
+ private:
+  std::string module_name;  // identifier of the module context variable in the generated code
+  void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name);  // emits the guarded TVMBackendGetFuncFromEnv lookup
+  void PrintFuncCall(std::string packed_func_name, int num_args);  // emits the TVMFuncCall and its error check
+};
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_CODEGEN_CODEGEN_C_HOST_H_
index d2f80a538a337524a7897db60bfa6e6b7ea7fadc..3fc46c35c7f7050cbbeeb3985228ffb3115a094c 100644 (file)
@@ -112,6 +112,13 @@ class CodeGenSourceBase {
  */
 runtime::Module SourceModuleCreate(std::string code, std::string fmt);
 
+/*!
+ * \brief Create a C source module for viewing and compiling GCC code.
+ * \param code The code to be viewed.
+ * \param fmt The code format.
+ */
+runtime::Module CSourceModuleCreate(std::string code, std::string fmt);
+
 /*!
  * \brief Create a source module for viewing and limited saving for device.
  * \param data The code data to be viewed.
index c7100e18735ed34f088d3048ecc29f90407aef4a..56facea1567f2993ca23d2f98ab6f15dda820915 100644 (file)
@@ -53,6 +53,52 @@ runtime::Module SourceModuleCreate(std::string code, std::string fmt) {
   return runtime::Module(n);
 }
 
+// Simulator function
+class CSourceModuleNode : public runtime::ModuleNode {
+ public:
+  CSourceModuleNode(std::string code,
+                   std::string fmt)
+      : code_(code), fmt_(fmt) {}
+  const char* type_key() const {
+    return "c";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    LOG(FATAL) << "C Source module cannot execute, to get executable module"
+               << " build TVM with \'" << fmt_ << "\' runtime support";
+    return PackedFunc();
+  }
+
+  std::string GetSource(const std::string& format) final {
+    return code_;
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    std::string fmt = GetFileFormat(file_name, format);
+    std::string meta_file = GetMetaFilePath(file_name);
+    if (fmt == "cc") {
+      CHECK_NE(code_.length(), 0);
+      SaveBinaryToFile(file_name, code_);
+    } else {
+      CHECK_EQ(fmt, fmt_)
+          << "Can only save to format=" << fmt_;
+    }
+  }
+
+ protected:
+  std::string code_;
+  std::string fmt_;
+};
+
+runtime::Module CSourceModuleCreate(std::string code, std::string fmt) {
+  // Wrap the source text and its format tag in a shared CSourceModuleNode.
+  auto node = std::make_shared<CSourceModuleNode>(code, fmt);
+  return runtime::Module(node);
+}
+
 // supports limited save without cross compile
 class DeviceSourceModuleNode final : public runtime::ModuleNode {
  public:
diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py
new file mode 100644 (file)
index 0000000..00acbeb
--- /dev/null
@@ -0,0 +1,87 @@
+import tvm
+import numpy as np
+from tvm.contrib import util
+
+def test_add():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.create_schedule(C.op)
+
+    def check_c():
+        f1 = tvm.lower(s, [A, B, C], name="fadd")
+        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
+        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
+        mhost = tvm.codegen.build_module(fsplits[0], "c")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m['fadd']
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        fadd(a, b, c)
+        tvm.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+    check_c()
+
+def test_add_pipeline():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    AA = tvm.compute((n,), lambda *i: A(*i), name='A')
+    BB = tvm.compute((n,), lambda *i: B(*i), name='B')
+    T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T')
+    C = tvm.compute(A.shape, lambda *i: T(*i), name='C')
+    s = tvm.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    xo1, xo2 = s[C].split(xo, factor=13)
+    s[C].parallel(xo2)
+    s[C].pragma(xo1, "parallel_launch_point")
+    s[C].pragma(xo2, "parallel_stride_pattern")
+    s[C].pragma(xo2, "parallel_barrier_when_finish")
+    s[C].vectorize(xi)
+
+    def check_c():
+        if not tvm.module.enabled("llvm"):
+            return
+        # Specifically allow offset to test codepath when offset is available
+        Ab = tvm.decl_buffer(
+            A.shape, A.dtype,
+            elem_offset=tvm.var('Aoffset'),
+            offset_factor=8,
+            name='A')
+        binds = {A : Ab}
+        # BUILD and invoke the kernel.
+        f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline")
+        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
+        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
+        mhost = tvm.codegen.build_module(fsplits[0], "c")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m["fadd_pipeline"]
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        fadd(a, b, c)
+        tvm.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+
+    with tvm.build_config(offset_factor=4):
+        check_c()
+
+if __name__ == "__main__":
+    test_add()
+    test_add_pipeline()