From c8a0f524d92937dd14bc400a95dee9efb1acb2b0 Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Wed, 29 May 2019 16:36:05 -0700
Subject: [PATCH] [AutoTVM]Core functionality for Graph tuner (#2184)

* Add graph tuning

* Add tests

* Fix tests

* Fix pylint

* Small fix for docstring

* Minor fix

* Support fetching workload from relay expr

* Simplify benchmark layout transformation

* Add relay support

* Fix infer layout func name

* Refactor internal data representation

* Fix issues

* Add PBQP solver

* Fix layout transform check

* Add PBQPTuner test

* Fix lint

* Update tutorial

* Fix tutorial

* Fix lint

* Add relay test

* Remove nnvm since nnvm graph can be converted to relay function

* Modify benchmark layout wrt new layout_transform api

* Fix lint

* Update docstring for DP tuner

* Refactor traverse graph

* Support graph tuning for multiple target operators

* Fix fetching workloads

* Add x86 depthwise_conv2d infer_layout

* Fix x86 depthwise_conv2d autotvm

* Fix PBQP tuner

* Fix DP tuner

* Generate dummy layout transform record

* Update tutorial

* Modify layout records name

* Add ASF header

* Add ASF header for testing files

* Fix test

* Fix topi fetching

* Some refactors

* Fix lint

* Fix tutorial

* Rename test files

* Fix doc typo

* Add test case note link
---
 python/tvm/autotvm/graph_tuner/__init__.py         |  25 +
 python/tvm/autotvm/graph_tuner/_base.py            |  27 ++
 python/tvm/autotvm/graph_tuner/base_graph_tuner.py | 522 +++++++++++++++++++++
 .../graph_tuner/dynamic_programming_stage.py       | 358 ++++++++++++++
 .../graph_tuner/dynamic_programming_tuner.py       | 189 ++++++++
 python/tvm/autotvm/graph_tuner/pbqp_tuner.py       | 288 ++++++++++++
 python/tvm/autotvm/graph_tuner/utils/__init__.py   |  26 +
 .../autotvm/graph_tuner/utils/traverse_graph.py    | 312 ++++++++++++
 python/tvm/autotvm/graph_tuner/utils/utils.py      | 110 +++++
 python/tvm/autotvm/task/__init__.py                |   3 +-
 python/tvm/autotvm/task/topi_integration.py        |  19 +-
 tests/python/unittest/test_graph_tuner_core.py     | 254 ++++++++++
 tests/python/unittest/test_graph_tuner_utils.py    | 149 ++++++
 topi/python/topi/nn/conv2d.py                      |  20 +
 topi/python/topi/nn/depthwise_conv2d.py            |  19 +
 topi/python/topi/x86/conv2d.py                     |  17 +-
 topi/python/topi/x86/depthwise_conv2d.py           |  20 +-
 tutorials/autotvm/tune_relay_x86.py                |  17 +-
 18 files changed, 2364 insertions(+), 11 deletions(-)
 create mode 100644 python/tvm/autotvm/graph_tuner/__init__.py
 create mode 100644 python/tvm/autotvm/graph_tuner/_base.py
 create mode 100644 python/tvm/autotvm/graph_tuner/base_graph_tuner.py
 create mode 100644 python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py
 create mode 100644 python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
 create mode 100644 python/tvm/autotvm/graph_tuner/pbqp_tuner.py
 create mode 100644 python/tvm/autotvm/graph_tuner/utils/__init__.py
 create mode 100644 python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
 create mode 100644 python/tvm/autotvm/graph_tuner/utils/utils.py
 create mode 100644 tests/python/unittest/test_graph_tuner_core.py
 create mode 100644 tests/python/unittest/test_graph_tuner_utils.py

diff --git a/python/tvm/autotvm/graph_tuner/__init__.py b/python/tvm/autotvm/graph_tuner/__init__.py
new file mode 100644
index 0000000..d590db0
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/__init__.py
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Autotvm graph tuner API."""
+from __future__ import absolute_import as _abs
+
+from . import _base
+from . import base_graph_tuner
+
+from .base_graph_tuner import BaseGraphTuner
+from .dynamic_programming_tuner import DPTuner
+from .pbqp_tuner import PBQPTuner
diff --git a/python/tvm/autotvm/graph_tuner/_base.py b/python/tvm/autotvm/graph_tuner/_base.py
new file mode 100644
index 0000000..83b9e06
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/_base.py
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Helper functions and global data"""
+
+
+RULE_OUT_NODE_NAMES = ["Tuple", "TupleGetItem", "batch_flatten", "transpose", "reshape",
+                       "multibox_prior", "multibox_transform_loc", "where",
+                       "non_max_suppression", "strided_slice"]
+
+# We set a large time to represent an invalid layout-transformation.
+# This number is set to be 10e9 seconds to align with autotvm.
+INVALID_LAYOUT_TIME = 10e9
diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
new file mode 100644
index 0000000..0fbfc27
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py
@@ -0,0 +1,522 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-instance-attributes,too-many-branches,too-many-nested-blocks,invalid-name,unused-argument,unused-variable,no-member,no-value-for-parameter
+"""Base class for graph tuner."""
+import logging
+from abc import abstractmethod
+
+import numpy as np
+import topi
+
+import tvm
+from tvm import autotvm, relay
+from tvm.autotvm.task import get_config
+from tvm.autotvm.task.topi_integration import deserialize_args, serialize_args
+from tvm.autotvm.record import encode, load_from_file
+from tvm.autotvm.measure import MeasureResult, MeasureInput
+
+from ... import target as _target
+from .utils import is_input_node, get_in_nodes, get_out_nodes, has_multiple_inputs, \
+    bind_inputs, expr2graph
+from ._base import INVALID_LAYOUT_TIME
+
+
+# Setup topi_op_name -> layout function
+# NOTE: To add more ops, change the following dictionary.
+OP2LAYOUT = {
+    "topi_nn_conv2d": topi.nn.conv2d_infer_layout,
+    "topi_nn_depthwise_conv2d_nchw": topi.nn.depthwise_conv2d_infer_layout,
+}
+
+
+@autotvm.template
+def layout_transform(*args):
+    """Autotvm layout transform template."""
+    args = deserialize_args(args)
+    cfg = get_config()
+    cfg.add_flop(-1)
+    data = args[0]
+    out = topi.layout_transform(*args)
+    sch = topi.generic.schedule_injective([out])
+    return sch, [data, out]
+
+
+class BaseGraphTuner(object):
+    """Class to search schedules considering both kernel execution time and
+    layout transformation time.
+
+    Before creating a graph tuner instance, schedule candidates for all kernels in
+    the graph should be provided through tensor-level tuning.
+    """
+    def __init__(self, graph, input_shapes, records, target_ops,
+                 target, max_sch_num=20, dtype="float32", verbose=True,
+                 log_file="graph_tuner.log", log_level=logging.DEBUG,
+                 name="graph_tuner"):
+        """Create a graph tuner instance. Local schedule searching for all nodes with
+        target_op in the input graph and layout transformation benchmark need to be
+        executed before initialization.
+
+        graph : tvm.relay.expr.Function
+            Input graph
+
+        input_shapes : dict of str to tuple.
+            Input shapes of graph
+
+        records : str or iterator of (MeasureInput, MeasureResult)
+            Collection of kernel level tuning records.
+            If it is str, then it should be the filename of a records log file.
+                       Each row of this file is an encoded record pair.
+            Otherwise, it is an iterator.
+
+        target_ops : List of callable
+            Target tuning operators; each entry's ``__name__`` is used as the op name.
+
+        target : str or tvm.target
+            Compilation target.
+
+        max_sch_num : int, optional
+            Maximum number of schedule candidates for each workload.
+
+        dtype : str, optional
+            Data type.
+
+        log_file : str, optional
+            graph tuner log file name
+
+        name : str, optional
+            Name of global tuner.
+        """
+        self._node_list = []
+        self._layout_transform_perf_records = {}
+        self._layout_transform_interlayer_cost = {}
+        self._input_shapes = input_shapes
+        self._target_ops = [op.__name__ for op in target_ops]
+
+        self._name = name
+        self._max_sch_num = max_sch_num
+        self._optimal_sch_dict = {}
+        self._records = records
+        self._dtype = dtype
+        if isinstance(target, str):
+            target = _target.create(target)
+        self._target = target
+        self._optimal_record_dict = {}
+
+        # Set up logger
+        self._verbose = verbose
+        self._logger = logging.getLogger(name + "_logger")
+        need_file_handler = need_console_handler = True
+        for handler in self._logger.handlers:
+            if handler.__class__.__name__ == 'FileHandler':
+                need_file_handler = False
+            if handler.__class__.__name__ == 'StreamHandler':
+                need_console_handler = False
+        self._log_level = log_level
+        self._log_file = log_file
+        self._formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
+        self._logger.setLevel(log_level)
+        if need_file_handler:
+            file_handler = logging.FileHandler(log_file)
+            file_handler.setFormatter(self._formatter)
+            self._logger.addHandler(file_handler)
+        if self._verbose and need_console_handler:
+            console_handler = logging.StreamHandler()
+            console_handler.setFormatter(self._formatter)
+            self._logger.addHandler(console_handler)
+            self._logger.setLevel(log_level)
+            self._logger.propagate = False
+
+        # Generate workload and schedule dictionaries.
+        if isinstance(graph, relay.expr.Function):
+            node_dict = {}
+            graph = bind_inputs(graph, input_shapes, dtype)
+            expr2graph(graph, self._target_ops, node_dict, self._node_list)
+        else:
+            raise RuntimeError("Unsupported graph type: %s" % str(type(graph)))
+
+        self._graph = graph
+        self._in_nodes_dict = get_in_nodes(self._node_list, self._target_ops, input_shapes.keys())
+        self._out_nodes_dict = get_out_nodes(self._in_nodes_dict)
+        self._fetch_cfg()
+
+        # Setup infer_layout for elemwise-like nodes
+        # Note: graph tuner currently only supports tuning of single input and single output
+        # op as target op, such as conv2d, dense and conv2d_transpose. In this case, we can
+        # reuse infer_layout function from target ops for elemwise-like nodes. The behavior
+        # is to modify the first tensor shape of input workload to the output shape of
+        # elemwise-like node, and use infer_layout function from input op to generate layouts.
+        input_names = self._input_shapes.keys()
+        for idx in sorted(self._in_nodes_dict.keys()):
+            if has_multiple_inputs(self._node_list, idx, input_names):
+                node_entry = self._node_list[idx]
+                node_entry["topi_op"] = []
+                node_entry["workloads"] = []
+                for input_idx in self._in_nodes_dict[idx]:
+                    input_node = self._node_list[input_idx]
+                    if not is_input_node(input_node, input_names):
+                        input_topi_op = input_node["topi_op"][0]
+                        node_entry["topi_op"].append(input_topi_op)
+                        # Only replace the first input tensor
+                        input_workload = input_node["workloads"][0]
+                        first_tensor = input_workload[1]
+                        dtype = first_tensor[-1]
+                        new_shape = tuple([val.value for val in node_entry["types"][0].shape])
+                        actual_workload = (input_workload[0],) + \
+                                          ((new_shape + (dtype,)),) + input_workload[2:]
+                        node_entry["workloads"].append(actual_workload)
+                        if "record_candidates" not in node_entry:
+                            node_entry["record_candidates"] = input_node["record_candidates"]
+                    else:
+                        node_entry["topi_op"].append(None)
+                        node_entry["workloads"].append(None)
+
+
+    def _fetch_cfg(self):
+        """Read and pre-process input schedules."""
+        if isinstance(self._records, str):
+            records = load_from_file(self._records)
+        else:
+            records = self._records
+        cfg_dict = {}
+        for record in records:
+            in_measure, _ = record
+            workload = in_measure.task.workload
+            if workload not in cfg_dict:
+                cfg_dict[workload] = []
+            cfg_dict[workload].append(record)
+
+        cache_dict = {}
+        for key in self._in_nodes_dict:
+            node_entry = self._node_list[key]
+            if node_entry["op"] not in self._target_ops:
+                continue
+            workload = node_entry["workloads"][0]
+            if workload in cache_dict:
+                node_entry["record_candidates"] = cache_dict[workload]
+                continue
+            record_candidates = []
+            infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]]
+            layout_tracking_dict = {}
+            for record in cfg_dict[workload]:
+                in_measure, out_measure = record
+                workload = in_measure.task.workload
+                cfg = in_measure.config
+                # For multiple cfgs which produce the same in/out layouts,
+                # only the most efficient one is preserved.
+                with self._target:
+                    layouts = infer_layout_func(workload, cfg)
+                    if layouts in layout_tracking_dict:
+                        cost = out_measure.costs[0]
+                        current_best_cost = layout_tracking_dict[layouts][1].costs[0]
+                        if cost < current_best_cost:
+                            layout_tracking_dict[layouts] = record
+                    else:
+                        layout_tracking_dict[layouts] = record
+            sorted_records = sorted(layout_tracking_dict.values(),
+                                    key=lambda item: item[1].costs[0])
+            for i in range(min(self._max_sch_num, len(sorted_records))):
+                record_candidates.append(sorted_records[i])
+            node_entry["record_candidates"] = record_candidates
+            cache_dict[workload] = record_candidates
+
+    def _iterate_layout_transform(self, callback):
+        """Iterate all possible layout transformations and execute callback for each
+        iteration. The callback function accepts 5 arguments: from_node_idx, to_node_idx,
+        from_sch_idx, to_sch_idx, and args, the argument list of the layout
+        transformation: [data placeholder, input layout, output layout].
+        """
+        input_names = self._input_shapes.keys()
+        for key, val in self._in_nodes_dict.items():
+            node_entry = self._node_list[key]
+            target_input_idx = -1
+            target_input_pos = -1
+            if has_multiple_inputs(self._node_list, key, input_names):
+                for i, item in enumerate(val):
+                    if not is_input_node(self._node_list[item], input_names):
+                        target_input_idx = item
+                        target_input_pos = i
+                        break
+
+            for i, item in enumerate(val):
+                i_idx = item
+                in_node_entry = self._node_list[i_idx]
+                if is_input_node(in_node_entry, input_names):
+                    continue
+
+                if node_entry["op"] in self._target_ops:
+                    o_idx = key
+                    o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]]
+                    o_wkl = node_entry["workloads"][0]
+                    i_topi_op = in_node_entry["topi_op"][0]
+                    i_wkl = in_node_entry["workloads"][0]
+                    pivot = 0
+                    while not i_wkl:
+                        pivot += 1
+                        i_topi_op = in_node_entry["topi_op"][pivot]
+                        i_wkl = in_node_entry["workloads"][pivot]
+                    i_infer_layout_func = OP2LAYOUT[i_topi_op]
+                else:
+                    o_idx = target_input_idx
+                    if i <= target_input_pos:
+                        continue
+                    o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]]
+                    o_wkl = node_entry["workloads"][target_input_pos]
+                    i_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][i]]
+                    i_wkl = node_entry["workloads"][i]
+
+
+                for m, i_record in enumerate(in_node_entry["record_candidates"]):
+                    for n, o_record in enumerate(node_entry["record_candidates"]):
+                        i_cfg, o_cfg = i_record[0].config, o_record[0].config
+                        with self._target:
+                            i_input_info, i_output_info = i_infer_layout_func(i_wkl, i_cfg)
+                            o_input_info, o_output_info = o_infer_layout_func(o_wkl, o_cfg)
+                        if len(i_input_info) > 1 or len(i_output_info) > 1 or \
+                                len(o_input_info) > 1 or len(o_output_info) > 1:
+                            raise RuntimeError("Graph tuner only supports target operator "
+                                               "with single input and single output. "
+                                               "Please check target_ops argument.")
+
+                        in_shape, in_layout = i_output_info[0]
+                        if node_entry["op"] in self._target_ops:
+                            _, out_layout = o_input_info[0]
+                        else:
+                            _, out_layout = o_output_info[0]
+                        data_placeholder = tvm.placeholder(in_shape, name="data",
+                                                           dtype=self._dtype)
+                        args = [data_placeholder, in_layout, out_layout]
+                        callback(i_idx, o_idx, m, n, args)
+
+
+    def _create_matrix_callback(self, from_node_idx, to_node_idx, from_sch_idx,
+                                to_sch_idx, args):
+        """Create dictionary containing matrix format of layout transformation
+        between nodes."""
+        sargs = serialize_args(args)
+        in_layout, out_layout = args[1], args[2]
+        ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(sargs)
+        idx_pair_key = (from_node_idx, to_node_idx)
+
+        if in_layout == out_layout:
+            layout_transform_time = 0
+        else:
+            layout_transform_time = \
+                self._layout_transform_perf_records[ltf_workload][1].costs[0]
+
+        if idx_pair_key not in self._layout_transform_interlayer_cost:
+            self._layout_transform_interlayer_cost[idx_pair_key] = []
+        if len(self._layout_transform_interlayer_cost[idx_pair_key]) <= from_sch_idx:
+            self._layout_transform_interlayer_cost[idx_pair_key].append([])
+        self._layout_transform_interlayer_cost[idx_pair_key][from_sch_idx]\
+            .append(layout_transform_time)
+
+    def benchmark_layout_transform(self, min_exec_num=100, timeout=10,
+                                   use_rpc=False, device_key=None, host="localhost",
+                                   port=9190, n_parallel=1, build_func='default',
+                                   layout_records=None, target_host=None, infer_layout=False):
+        """Benchmark all possible layout transformations in the graph,
+        given a set of schedule candidates for each workload of target operator.
+
+        Parameters
+        ----------
+        min_exec_num : int, optional
+            Minimum number of executions. The final execution time is the average of
+            all execution times.
+
+        timeout : int, optional
+            Time out for each execution.
+
+        use_rpc : boolean, optional
+            Whether to use rpc mode for benchmarking.
+
+        device_key : str, optional
+            Remote device key which can be queried by
+            python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+
+        host : str, optional
+            IP address used to create RPC tracker on host machine.
+
+        port : int, optional
+            Port number used to create RPC tracker on host machine.
+
+        n_parallel: int, optional
+            The number of measurement task that can run in parallel.
+            Set this according to the number of cpu cores (for compilation) and
+            the number of devices you have (for measuring generated code).
+
+        build_func: str or callable, optional
+            'default': call default builder. This works for normal target (llvm, cuda)
+
+            'ndk': use Android NDK to create shared library. Use this for android target.
+
+            callable: customized build function for other backends (e.g. VTA).
+                      See autotvm/measure/measure_methods.py::default_build_func for example.
+
+        layout_records : str or iterator of (MeasureInput, MeasureResult). optional
+            Collection of layout_transform benchmarking records.
+            If it is str, then it should be the filename of a records log file.
+                   Each row of this file is an encoded record pair.
+            Otherwise, it is an iterator.
+
+            If this argument is set, graph tuner will first check whether layout_transform
+            workload already exists in records and skip benchmarking if possible.
+
+        target_host : str or :any:`tvm.target.Target`, optional
+            Host compilation target, if target is device.
+
+            When TVM compiles a device-specific program such as CUDA,
+            we also need host (CPU) side code to interact with the driver
+            and to set up the dimensions and parameters correctly.
+            target_host is used to specify the host side codegen target.
+            By default, llvm is used if it is enabled,
+            otherwise a stackvm interpreter is used.
+
+        infer_layout : bool, optional
+            Whether to infer layout transformation time if it doesn't exist in records, instead
+            of benchmarking on target device.
+
+            This might bring performance loss compared to benchmarking layout transformation.
+        """
+        self._logger.info("Start to benchmark layout transformation...")
+        if layout_records is None and infer_layout:
+            raise RuntimeError("Requires some records to infer layout transformation time.")
+
+        if isinstance(layout_records, str):
+            layout_records = load_from_file(layout_records)
+            if not layout_records and infer_layout:
+                raise RuntimeError("Records must be non-empty to infer layout transformation time.")
+
+        if isinstance(layout_records, str):
+            layout_records = load_from_file(layout_records)
+        num_flops, total_time = 0, 0
+        if layout_records is not None:
+            for record in layout_records:
+                ltf_wkl = record[0].task.workload
+                self._layout_transform_perf_records[ltf_wkl] = record
+                input_shape = ltf_wkl[1][1]
+                flops = np.prod(input_shape)
+                num_flops += flops
+                total_time += record[1].costs[0]
+        avg_time = total_time / num_flops if num_flops > 0 else 0
+
+        args_list = []
+        def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx,
+                                 to_sch_idx, args):
+            """Callback function to fetch layout transform args"""
+            _, in_layout, out_layout = args
+            if in_layout != out_layout:
+                args_list.append(args)
+
+        self._iterate_layout_transform(_fetch_args_callback)
+
+        def _log_to_list(record_list):
+            """Callback to log result to a list."""
+            def _callback(_, inputs, results):
+                """Callback implementation"""
+                record_list.append((inputs[0], results[0]))
+            return _callback
+
+        builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
+        runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
+        if use_rpc:
+            if device_key is None:
+                raise RuntimeError("device_key need to be set to use rpc tracker mode.")
+            runner = autotvm.measure.RPCRunner(device_key, host, port, n_parallel=n_parallel,
+                                               number=min_exec_num, repeat=1,
+                                               timeout=timeout)
+        measure_option = autotvm.measure_option(builder=builder, runner=runner)
+        for args in args_list:
+            args = serialize_args(args)
+            ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(args)
+            if ltf_workload in  self._layout_transform_perf_records:
+                continue
+
+            if infer_layout:
+                input_shape = ltf_workload[1][1]
+                flops = 1
+                for i in input_shape:
+                    flops *= i
+                inferred_time = flops * avg_time
+                record_input = MeasureInput(target=self._target, task=None, config=None)
+                record_output = MeasureResult(costs=(inferred_time,), error_no=0,
+                                              all_cost=-1, timestamp=-1)
+                self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
+                continue
+
+            records = []
+            task = autotvm.task.create(layout_transform, args=args, target=self._target,
+                                       target_host=target_host)
+            task.workload = ltf_workload
+            tuner = autotvm.tuner.GridSearchTuner(task)
+            tuner.tune(n_trial=1, measure_option=measure_option,
+                       callbacks=[_log_to_list(records)])
+            if not isinstance(records[0][1].costs[0], float):
+                records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
+            self._layout_transform_perf_records[ltf_workload] = records[0]
+
+        self._iterate_layout_transform(self._create_matrix_callback)
+        self._logger.info("Benchmarking layout transformation successful.")
+
+    @property
+    def layout_transform_perf_records(self):
+        """Get layout transformation dictionary for input graph.
+
+        Returns
+        -------
+        layout_transform_perf_records : dict of tuple to (MeasureInput, MeasureResult)
+            Layout transformation dictionary for input graph.
+        """
+        return self._layout_transform_perf_records
+
+
+    def get_optimal_records(self):
+        """Convert optimal record dictionary to a list of records
+        with ascending order of node index in graph.
+
+        Returns
+        -------
+        sch_list : list of tuple
+            List of records with ascending order of node index in graph.
+        """
+        ordered_index_list = sorted(self._optimal_record_dict.keys())
+        ret = []
+        for index in ordered_index_list:
+            node_entry = self._node_list[index]
+            if node_entry["op"] not in self._target_ops:
+                continue
+            ret.append(node_entry["record_candidates"][self._optimal_record_dict[index]])
+        return ret
+
+    def write_opt_sch2record_file(self, record_file="graph_opt_schedule.log"):
+        """Write graph level optimal schedules into file.
+
+        Parameters
+        ----------
+        record_file : str, optional
+            Output schedule file.
+        """
+        with open(record_file, "a") as out_file:
+            records = self.get_optimal_records()
+            for record in records:
+                out_file.write(encode(record[0], record[1]) + "\n")
+        msg = "Writing optimal schedules to %s successfully." % record_file
+        self._logger.info(msg)
+
+    @abstractmethod
+    def run(self, **kwargs):
+        """Run graph tuning."""
+        pass
diff --git a/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py b/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py
new file mode 100644
index 0000000..4a512c2
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/dynamic_programming_stage.py
@@ -0,0 +1,358 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=too-many-instance-attributes,too-many-branches,too-many-statements,too-many-arguments,too-many-locals,invalid-name
+"""Stage class for dynamic programming tuner"""
+import numpy as np
+
+from .utils import is_input_node
+
+
+class DPStage(object):
+    """Class to represent node in Markov decision process. A stage has states
+    to represent different schedules of the current node. Since in this problem
+    the action is the schedule selected for current node, action can be fully
+    represented by states. No extra attribute is needed for action.
+
+    In most cases, instance of this class should be created through DPTuner.
+    """
+    def __init__(self, idx, input_shapes, node_list,
+                 counted_nodes_set, layout_transform_interlayer_cost,
+                 stage_dict, in_nodes_dict, out_nodes_dict,
+                 dep_dict, target_ops, dtype="float32"):
+        """Bind the shared graph data to this stage, then build its states.
+
+        The containers are stored by reference, and state creation runs as
+        the last step of construction.
+
+        Parameters
+        ----------
+        idx : int
+            Index of the node this stage stands for.
+
+        input_shapes : dict of string to tuple of int
+            Input shapes of the graph; its keys are kept as the input names.
+
+        node_list : list of dict
+            All nodes of the current graph.
+
+        counted_nodes_set : set of int
+            Global set of nodes whose execution time has already been counted.
+
+        layout_transform_interlayer_cost : dict of tuple to list
+            Maps a node index pair to the layout transformation time between them.
+
+        stage_dict : dict of int to Stage
+            Global dictionary mapping node index to its stage.
+
+        in_nodes_dict : dict of int to list of int
+            Maps a node index to the indices of its input nodes.
+
+        out_nodes_dict : dict of int to list of int
+            Maps a node index to the indices of its output nodes.
+
+        dep_dict : dict of int to set of int
+            Maps a node index to the indices of the nodes depending on it.
+
+        target_ops : list of str
+            Target operators.
+
+        dtype : str, optional
+            Data type.
+        """
+        self._idx = idx
+        self._dtype = dtype
+        self._target_ops = target_ops
+        self._dep = []
+        self._states = self._full_states = self._full_states_idx = None
+        self._global_input_shapes = input_shapes
+        self._global_input_names = input_shapes.keys()
+        self._global_node_list = node_list
+        self._global_counted_nodes_set = counted_nodes_set
+        self._global_layout_transform_interlayer_cost = layout_transform_interlayer_cost
+        self._global_stage_dict = stage_dict
+        self._global_in_nodes_dict = in_nodes_dict
+        self._global_out_nodes_dict = out_nodes_dict
+        self._global_dep_dict = dep_dict
+        self._node_entry = node_list[idx]
+        self._wkl = self._node_entry["workloads"][0]
+        self._record_list = self._node_entry["record_candidates"]
+        self._create_states()
+
+    def _create_states(self):
+        """Build states: op routine for target ops, multi-input otherwise."""
+        current_op = self._global_node_list[self._idx]["op"]
+        if current_op not in self._target_ops:
+            self._create_multi_inputs_states()
+            return
+        self._create_op_states()
+
+    def _create_op_states(self):
+        """State creation routine for nodes with target_op.
+
+        The producer is the first incoming node that is not a graph input. If
+        every incoming node is a graph input, the states are just the costs of
+        this node's record candidates. Otherwise the full states have one axis
+        for this node's schedules, one for the producer's schedules and one per
+        dependency of the producer; each entry is the schedule cost plus the
+        layout transformation cost, plus the producer state unless the
+        producer's time has already been counted.
+        """
+        node_list = self._global_node_list
+        input_names = self._global_input_names
+        layout_cost = self._global_layout_transform_interlayer_cost
+        input_idx = -1
+        for input_idx in self._global_in_nodes_dict[self._idx]:
+            if not is_input_node(node_list[input_idx], input_names):
+                break
+
+        if is_input_node(node_list[input_idx], input_names):
+            self._full_states = np.array([record[1].costs[0]
+                                          for record in self._record_list])
+            self._states = self._full_states
+        else:
+            input_stage = self._global_stage_dict[input_idx]
+            input_dep = input_stage.dep
+            input_flatten_states = input_stage.states.flatten()
+            num_schedules = len(self._record_list)
+            num_input_states = input_flatten_states.shape[0]
+            edge = (input_idx, self._idx)
+
+            # Axis 0: schedules of this node, axis 1: schedules of the producer,
+            # remaining axes: one per dependency of the producer.
+            full_states_shape = tuple(
+                [num_schedules, len(node_list[input_idx]["record_candidates"])] +
+                [len(node_list[dep_idx]["record_candidates"]) for dep_idx in input_dep])
+            self._full_states = np.zeros(full_states_shape).flatten().astype("float32")
+            self._full_states_idx = [self._idx, input_idx] + input_dep
+            # Number of flattened producer states sharing one producer schedule.
+            dep_multiplier = 1
+            for dim in full_states_shape[2:]:
+                dep_multiplier *= dim
+            input_node_time_counted = input_idx in self._global_counted_nodes_set
+
+            # Enumerate every (own schedule, flattened producer state) pair.
+            for i, record in enumerate(self._record_list):
+                current_sch_time = float(record[1].costs[0])
+                for j in range(num_input_states):
+                    total_time = current_sch_time + layout_cost[edge][j // dep_multiplier][i]
+                    if not input_node_time_counted:
+                        total_time += input_flatten_states[j]
+                    self._full_states[i * num_input_states + j] = total_time
+
+            if not input_node_time_counted:
+                self._global_counted_nodes_set.add(input_idx)
+            self._full_states = self._full_states.reshape(full_states_shape)
+
+            # A producer with a single consumer is not needed by any later
+            # stage, so its axis is minimized away. Otherwise the producer is
+            # kept as a dependency of this stage.
+            if len(self._global_out_nodes_dict[input_idx]) == 1:
+                self._states = np.amin(self._full_states, axis=1)
+                self._dep = list(input_dep)
+            else:
+                self._states = self._full_states
+                self._dep = [input_idx,] + input_dep
+
+        # Update global dependency dictionary: this node hands each of its
+        # dependencies over to its consumers. Tracking this tells later stages
+        # when a dependency can be eliminated, which keeps the number of
+        # states down.
+        children = self._global_out_nodes_dict[self._idx]
+        for dep_idx in self._dep:
+            self._global_dep_dict[dep_idx].remove(self._idx)
+            self._global_dep_dict[dep_idx].update(children)
+        if len(children) > 1:
+            self._global_dep_dict[self._idx] = set(children)
+
+    def _create_multi_inputs_states(self):
+        """State creation routine for multi_input operator
+
+        In tvm, layout transformation for an elemwise-like operator follows the rule
+        that all input operators transform their layouts to the layout of the leftmost
+        input operator. For example:
+                            elemwise-sum
+                            |    |    |
+                            |    |    |
+                           op0  op1  op2
+        In this block, the possible layout transformations are: op1 -> op0 and op2 -> op0.
+        In graph tuning, a 3-D array with shape (k0, k1, k2) can represent the layout
+        transformations between these three nodes. Earlier states that belong to other
+        nodes (named dependencies) may also be required for dynamic programming, so the
+        final states array for this elemwise-sum can have shape (k0, k1, k2, e0, e1).
+        To iterate through all states, we first align the states of op0, op1 and op2 to
+        this shape by broadcasting the original states. We also record the axis of each
+        input node in the states array, together with its multiplier. For example, the
+        axis index for op1 is 1, and its multiplier is k2 * e0 * e1. If the current index
+        in the flattened array is i, the schedule index of op1 can be computed as:
+        i % (k1 * k2 * e0 * e1) // (k2 * e0 * e1).
+        """
+        # Remove input and parameter nodes
+        input_index_list = [
+            input_idx for input_idx in self._global_in_nodes_dict[self._idx]
+            if not is_input_node(self._global_node_list[input_idx],
+                                 self._global_input_names)]
+
+        # Generate new states
+        states_list, aligned_node_list = DPStage.align_states(input_index_list,
+                                                              self._global_stage_dict,
+                                                              self._global_node_list)
+        target_node_idx, target_major_axis, target_multiplier, target_states = states_list[0]
+        aligned_shape = target_states.shape
+        self._full_states = np.zeros(aligned_shape).astype("float32").flatten()
+        self._full_states_idx = list(aligned_node_list)
+        num_states = self._full_states.shape[0]
+        node_time_counted = [item[0] in self._global_counted_nodes_set for item in states_list]
+        target_states = target_states.flatten()
+        src_states_list = [states_list[i][3].flatten() for i in range(1, len(states_list))]
+
+        for i in range(num_states):
+            target_sch_idx = (i % (target_multiplier *
+                                   aligned_shape[target_major_axis])) // target_multiplier
+            if node_time_counted[0]:
+                new_state = 0
+            else:
+                new_state = target_states[i]
+
+            for j in range(1, len(states_list)):
+                src_states = src_states_list[j - 1]
+                src_node_idx, src_major_axis, src_multiplier, _ = states_list[j]
+                src_sch_idx = (i % (src_multiplier *
+                                    aligned_shape[src_major_axis])) // src_multiplier
+                layout_transform_time = \
+                    self._global_layout_transform_interlayer_cost\
+                        [(src_node_idx, target_node_idx)][src_sch_idx][target_sch_idx]
+
+                if node_time_counted[j]:
+                    new_state += layout_transform_time
+                else:
+                    new_state += layout_transform_time + src_states[i]
+            # Store once per state, after all sources are accumulated, so the
+            # value is kept even when there is no source input to iterate over.
+            self._full_states[i] = new_state
+
+        for i, node_counted in enumerate(node_time_counted):
+            if not node_counted:
+                self._global_counted_nodes_set.add(states_list[i][0])
+        self._full_states = self._full_states.reshape(aligned_shape)
+
+        # Remove dependency to reduce states
+        # (the target axis is moved to the front and is always kept).
+        reduced_states = np.array(self._full_states)
+        reduced_states_transpose = [states_list[0][1]]
+        reduced_states_dep_list = []
+        self._dep = []
+        for i in range(len(reduced_states.shape)):
+            if i != states_list[0][1]:
+                reduced_states_transpose.append(i)
+                reduced_states_dep_list.append(aligned_node_list[i])
+        reduced_states = np.transpose(reduced_states, reduced_states_transpose)
+        shift = 0
+        for i, dep in enumerate(reduced_states_dep_list):
+            if dep not in self._global_dep_dict or len(self._global_dep_dict[dep]) == 1:
+                # Not tracked, or a single node left depending on it: drop its axis.
+                self._global_dep_dict.pop(dep, None)
+                reduced_states = np.amin(reduced_states, axis=i+1-shift)
+                shift += 1
+            else:
+                self._dep.append(dep)
+        self._states = reduced_states
+
+        # Update dependency
+        children = self._global_out_nodes_dict[self._idx]
+        for dep in self._dep:
+            self._global_dep_dict[dep].remove(self._idx)
+            self._global_dep_dict[dep].update(children)
+        if len(children) > 1:
+            self._global_dep_dict[self._idx] = set(children)
+
+    @property
+    def dep(self):
+        """Get dependency list (node indices the states still depend on)."""
+        return self._dep
+
+    @property
+    def states(self):
+        """Get states (the full states after removing reducible axes)."""
+        return self._states
+
+    @property
+    def full_states(self):
+        """Get complete states, i.e. before any axis is minimized away."""
+        return self._full_states
+
+    @property
+    def full_states_idx(self):
+        """Get node index of each axis of the complete states (may be None)."""
+        return self._full_states_idx
+
+    @staticmethod
+    def align_states(input_index_list, stage_dict, node_list):
+        """Bring the states of all input nodes to one common shape.
+
+        The common axes are the input nodes followed by their dependencies,
+        each listed once. Every input's states are transposed to that axis
+        order and broadcast over the axes they do not have.
+
+        This is used in creating multi_input operator states.
+
+        Parameters
+        ----------
+        input_index_list : list of int
+            List of input node index.
+
+        stage_dict : dict of int to Stage
+            Global dictionary of node index to stage.
+
+        node_list : list of dict
+            List of all nodes for current graph.
+
+        Returns
+        -------
+        states_list : list of tuple
+            One (node index, major axis, multiplier, aligned states) per input.
+
+        aligned_node_list : list of int
+            List of node index for aligned states.
+        """
+        aligned_node_list = list(input_index_list)
+        for input_idx in input_index_list:
+            for dep_idx in stage_dict[input_idx].dep:
+                if dep_idx not in aligned_node_list:
+                    aligned_node_list.append(dep_idx)
+        aligned_shape = tuple(len(node_list[idx]["record_candidates"])
+                              for idx in aligned_node_list)
+        states_list = []
+        for input_idx in input_index_list:
+            stage = stage_dict[input_idx]
+            # Axis order of the stage's own states: the node, then its deps.
+            own_axes = [input_idx] + stage.dep
+            kept_axes = [idx for idx in aligned_node_list if idx in own_axes]
+            expanded_shape = [dim if idx in own_axes else 1
+                              for idx, dim in zip(aligned_node_list, aligned_shape)]
+            major_axis = -1
+            for i, idx in enumerate(aligned_node_list):
+                if input_idx == idx:
+                    major_axis = i
+            aligned = np.transpose(stage.states,
+                                   tuple(own_axes.index(idx) for idx in kept_axes))
+            aligned = np.broadcast_to(np.reshape(aligned, tuple(expanded_shape)),
+                                      aligned_shape)
+            multiplier = 1
+            for dim in aligned_shape[major_axis + 1:]:
+                multiplier *= dim
+            states_list.append((input_idx, major_axis, multiplier, aligned))
+        return states_list, aligned_node_list
diff --git a/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py b/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
new file mode 100644
index 0000000..11571f2
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/dynamic_programming_tuner.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=import-error,too-many-locals,too-many-statements,too-many-branches,unused-variable
+"""Dynamic programming tuner."""
+import sys
+import numpy as np
+
+from .base_graph_tuner import BaseGraphTuner
+from .dynamic_programming_stage import DPStage
+from .utils import has_multiple_inputs, is_input_node
+
+if sys.version_info[0] == 3:
+    import queue
+else:
+    import Queue as queue
+
+class DPTuner(BaseGraphTuner):
+    """Tuner which uses dynamic programming to solve MDP problem.
+
+    Note: currently dynamic programming is used to solve this MDP problem. However,
+    this problem is intrinsically non-polynomial. DP can't apply for more complicated
+    models, such as networks with many element-wise sum operators. In this case, switch
+    to heuristic algorithm such as PBQP tuner.
+    """
+    def __init__(self, *args, **kwargs):
+        """Create a dynamic programming tuner.
+
+        All arguments are forwarded to the base class constructor. The keyword
+        arguments shared by every DPStage are collected in ``_global_data_dict``.
+        """
+        super(DPTuner, self).__init__(*args, **kwargs)
+        self._num_states = None
+        self._max_num_states = None
+        self._stage_dict, self._dep_dict = {}, {}
+        self._counted_nodes_set = set()
+        self._global_data_dict = dict(
+            dtype=self._dtype,
+            counted_nodes_set=self._counted_nodes_set,
+            stage_dict=self._stage_dict,
+            in_nodes_dict=self._in_nodes_dict, out_nodes_dict=self._out_nodes_dict,
+            dep_dict=self._dep_dict,
+            node_list=self._node_list,
+            input_shapes=self._input_shapes,
+            layout_transform_interlayer_cost=self._layout_transform_interlayer_cost)
+
+    def _check_num_states(self, num_states):
+        """Add to the running state count; raise RuntimeError over the limit."""
+        self._num_states += num_states
+        if self._max_num_states is None or self._num_states <= self._max_num_states:
+            return
+        raise RuntimeError("Too many states detected while running dynamic "
+                           "programming: got %d states but upper limit is %d." %
+                           (self._num_states, self._max_num_states))
+
+    def _forward(self):
+        """Forward pass in DP: build every stage in ascending node order.
+        """
+        self._logger.info("Start forward pass...")
+        shared = self._global_data_dict
+        for node_idx in sorted(self._in_nodes_dict):
+            new_stage = DPStage(idx=node_idx, target_ops=self._target_ops, **shared)
+            self._check_num_states(new_stage.full_states.size)
+            self._stage_dict[node_idx] = new_stage
+        self._logger.info("Finished forward pass.")
+
+    def _backward(self):
+        """Backward pass in DP to generate optimal solution.
+
+        First the states of all output nodes (nodes without consumers) are
+        aligned and summed, and the position with the minimal total time fixes
+        the schedules of the output nodes and of their dependencies. Then the
+        graph is walked from the outputs towards the inputs and the schedule
+        of every remaining node is read from the full states of its consumer.
+        The result is stored in ``self._optimal_record_dict``, which maps a
+        node index to the index of its selected record candidate.
+        """
+        self._logger.info("Start backward pass...")
+        input_names = self._input_shapes.keys()
+        optimal_record_dict = {}
+        # Pick optimal schedule for output nodes
+        output_idx_list = [key for key, val in self._out_nodes_dict.items() if not val]
+        states_list, aligned_node_list = DPStage.align_states(output_idx_list, self._stage_dict,
+                                                              self._node_list)
+        num_states = states_list[0][3].size
+        self._check_num_states(num_states * len(output_idx_list))
+        aligned_node_shape = states_list[0][3].shape
+        # Sum the aligned states of all outputs and take the first minimum.
+        # argmin always yields a valid flat position, so no sentinel is needed.
+        total_time = np.zeros(aligned_node_shape, dtype="float64")
+        for states in states_list:
+            total_time += states[3]
+        min_pos = int(np.argmin(total_time))
+        for i, states in enumerate(states_list):
+            current_major_axis = states[1]
+            current_sch_idx = (min_pos % (states[2] *
+                                          aligned_node_shape[current_major_axis])) // states[2]
+            optimal_record_dict[aligned_node_list[i]] = current_sch_idx
+        # Pick optimal schedule for dependencies of output nodes
+        for i in range(len(states_list), len(aligned_node_list)):
+            multiplier = 1
+            for j in range(i + 1, len(aligned_node_list)):
+                multiplier *= aligned_node_shape[j]
+            optimal_record_dict[aligned_node_list[i]] = \
+                min_pos // multiplier % aligned_node_shape[i]
+
+        # Backward pass to get optimal schedules for other nodes
+        bfs_q = queue.Queue()
+        visited = set()
+        for out_idx in output_idx_list:
+            bfs_q.put(out_idx)
+        while not bfs_q.empty():
+            node_idx = bfs_q.get()
+            visited.add(node_idx)
+            if is_input_node(self._node_list[node_idx], input_names):
+                continue
+            optimal_sch_idx = optimal_record_dict[node_idx]
+            full_states = self._stage_dict[node_idx].full_states
+            if not has_multiple_inputs(self._node_list, node_idx, input_names):
+                input_idx = self._in_nodes_dict[node_idx][0]
+                if is_input_node(self._node_list[input_idx], input_names):
+                    continue
+                if input_idx not in visited:
+                    bfs_q.put(input_idx)
+                    if input_idx not in optimal_record_dict:
+                        # Fix this node's schedule and its dependencies, then
+                        # take the best schedule of the input (axis 1).
+                        dep_list = self._stage_dict[node_idx].dep
+                        dep_idx = tuple([optimal_record_dict[item] for item in dep_list])
+                        tmp = np.argmin(full_states, axis=1)
+                        optimal_input_sch_idx = tmp[(optimal_sch_idx,) + dep_idx]
+                        optimal_record_dict[input_idx] = optimal_input_sch_idx
+            else:
+                input_idx_list = self._in_nodes_dict[node_idx]
+                # The first input shares the schedule index chosen for this node.
+                optimal_record_dict[input_idx_list[0]] = optimal_sch_idx
+                full_states_idx = self._stage_dict[node_idx].full_states_idx
+                tmp = full_states[optimal_sch_idx]
+                new_states_idx, new_states_pos = [], []
+                visited_states_idx, visited_states_pos = [], []
+                for i in range(1, len(full_states_idx)):
+                    if full_states_idx[i] in optimal_record_dict:
+                        visited_states_idx.append(full_states_idx[i])
+                        visited_states_pos.append(i - 1)
+                    else:
+                        new_states_idx.append(full_states_idx[i])
+                        new_states_pos.append(i - 1)
+                if visited_states_idx:
+                    tmp = np.transpose(tmp, tuple(visited_states_pos + new_states_pos))
+                    tmp = tmp[tuple([optimal_record_dict[idx] for idx in visited_states_idx])]
+                min_pos = np.argmin(tmp)
+                # Decode the flat position into one index per still-undecided node.
+                multiplier = 1
+                for i in range(len(new_states_idx)):
+                    multiplier *= full_states.shape[new_states_pos[i] + 1]
+                for pos, idx in zip(new_states_pos, new_states_idx):
+                    multiplier //= full_states.shape[pos + 1]
+                    optimal_record_dict[idx] = min_pos // multiplier
+                    min_pos %= multiplier
+                for input_idx in input_idx_list:
+                    if input_idx not in visited:
+                        bfs_q.put(input_idx)
+
+        self._optimal_record_dict = optimal_record_dict
+        self._logger.info("Finished backward pass...")
+
+    def run(self, **kwargs):
+        """Run dynamic programming solver: forward pass, then backward pass.
+        An optional "max_num_states" keyword argument caps the state count.
+        """
+        self._num_states = 0
+        self._max_num_states = kwargs.get("max_num_states")
+        self._logger.info("Start to run dynamic programming algorithm...")
+        self._forward()
+        self._backward()
+        self._logger.info("Finished DPExecutor run.")
diff --git a/python/tvm/autotvm/graph_tuner/pbqp_tuner.py b/python/tvm/autotvm/graph_tuner/pbqp_tuner.py
new file mode 100644
index 0000000..1d7089e
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/pbqp_tuner.py
@@ -0,0 +1,288 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,too-many-locals
+"""Partitioned Boolean Quadratic Programming Tuner"""
+from ._base import INVALID_LAYOUT_TIME
+from .base_graph_tuner import BaseGraphTuner
+from .utils import is_input_node, has_multiple_inputs
+
+
+class PBQPTuner(BaseGraphTuner):
+    """An approximation method to deal with intractably
+    large size of graph tuning problem.
+
+    This graph coloring algorithm mainly comes from:
+
+    Lang Hames and Bernhard Scholz.
+    Nearly optimal register allocation with PBQP. JMLC 2006.
+    LNCS, vol. 4228, pp. 346-361, 2006
+    """
+    def __init__(self, *args, **kwargs):
+        """Create a PBQP tuner and build its adjacency, cost and degree-bucket tables.
+        """
+        super(PBQPTuner, self).__init__(*args, **kwargs)
+
+        # Remove graph input nodes from the in-node lists of their consumers
+        input_names = self._input_shapes.keys()
+        for node_idx in self._out_nodes_dict:
+            if is_input_node(self._node_list[node_idx], input_names):
+                for out_node_idx in self._out_nodes_dict[node_idx]:
+                    self._in_nodes_dict[out_node_idx].remove(node_idx)
+
+        self._adj_dict = {}  # node index -> its in-nodes followed by its out-nodes
+        for node_idx in self._in_nodes_dict:
+            self._adj_dict[node_idx] = list(self._in_nodes_dict[node_idx]) + \
+                                       list(self._out_nodes_dict[node_idx])
+
+        self._record_cost_dict = {}  # node index -> costs[0] of each record candidate
+        for key in self._in_nodes_dict:
+            self._record_cost_dict[key] = []
+            for record in self._node_list[key]["record_candidates"]:
+                self._record_cost_dict[key].append(record[1].costs[0])
+
+        self._max_degree = -1
+        self._node_degree_dict = {}  # node index -> degree last used for bucketing
+        for node_idx in self._in_nodes_dict:
+            node_degree = self._get_degree(node_idx)
+            self._node_degree_dict[node_idx] = node_degree
+            self._max_degree = max(self._max_degree, node_degree)
+
+        self._stack = []  # reduced nodes, in reduction order
+        self._buckets = [[] for _ in range(self._max_degree + 2)]  # bucket d: nodes of degree d
+        for node_idx in sorted(self._in_nodes_dict):
+            node_degree = self._get_degree(node_idx)
+            self._buckets[node_degree].append(node_idx)
+
+        self._is_optimal = True
+
+    def _get_degree(self, node_idx):
+        """Return the current length of the adjacency list of node_idx.
+        """
+        return len(self._adj_dict[node_idx])
+
+    def _reorder_adj_nodes(self, node_idx):
+        """Move each neighbor of node_idx to the bucket matching its current degree.
+        """
+        for adj_node in self._adj_dict[node_idx]:
+            current_degree = self._get_degree(adj_node)
+            prev_degree = self._node_degree_dict[adj_node]  # degree it is bucketed under
+            if prev_degree != current_degree:
+                self._buckets[prev_degree].remove(adj_node)
+                self._buckets[current_degree].insert(0, adj_node)  # goes to the bucket front
+                self._node_degree_dict[adj_node] = current_degree
+
+    def _remove_node(self, node_idx):
+        """Remove node from its degree bucket and from its neighbors' adjacency lists.
+        """
+        node_degree = self._get_degree(node_idx)
+        self._buckets[node_degree].remove(node_idx)
+        for adj_node in self._adj_dict[node_idx]:  # the node's own list is left untouched
+            self._adj_dict[adj_node].remove(node_idx)
+
+    def _insert_edge(self, node_x, node_y, adj_cost_matrix):
+        """Insert an edge between two nodes, storing the cost matrix and its transpose.
+        """
+        self._layout_transform_interlayer_cost[(node_x, node_y)] = adj_cost_matrix
+        self._layout_transform_interlayer_cost[(node_y, node_x)] = []  # transposed copy
+        for i in range(len(adj_cost_matrix[0])):
+            self._layout_transform_interlayer_cost[(node_y, node_x)].append([])
+            for cost_vec in adj_cost_matrix:
+                self._layout_transform_interlayer_cost[(node_y, node_x)][i] \
+                    .append(cost_vec[i])
+
+        self._adj_dict[node_x].append(node_y)
+        self._adj_dict[node_y].append(node_x)
+
+    def _backward_insert_node(self, node_idx):
+        """Re-add node_idx to the adjacency list of each of its neighbors (backward pass).
+        """
+        for adj_node in self._adj_dict[node_idx]:
+            self._adj_dict[adj_node].append(node_idx)
+
+    def _RI_reduction(self, node_idx):
+        """Reduce a degree-1 node by folding its minimal cost into its first neighbor.
+        """
+        adj_node = self._adj_dict[node_idx][0]
+        ltf_matrix = self._layout_transform_interlayer_cost[(adj_node, node_idx)]
+        for i, cost_vec in enumerate(ltf_matrix):  # i: record index of adj_node
+            min_cost = INVALID_LAYOUT_TIME
+            for j, cost in enumerate(cost_vec):  # j: record index of node_idx
+                min_cost = min(min_cost, cost + self._record_cost_dict[node_idx][j])
+            self._record_cost_dict[adj_node][i] += min_cost
+        self._remove_node(node_idx)
+        self._reorder_adj_nodes(node_idx)
+        self._stack.append(node_idx)
+
+    def _RII_reduction(self, node_idx):
+        """Reduce a degree-2 node by moving its minimal cost onto its two neighbors.
+        """
+        adj_node_x, adj_node_y = self._adj_dict[node_idx]
+        ltf_matrix_x = self._layout_transform_interlayer_cost[(adj_node_x, node_idx)]
+        ltf_matrix_y = self._layout_transform_interlayer_cost[(adj_node_y, node_idx)]
+        delta_matrix = [[] for _ in range(len(ltf_matrix_x))]  # rows: x records, cols: y records
+        for i, cost_vec_x in enumerate(ltf_matrix_x):
+            for j, cost_vec_y in enumerate(ltf_matrix_y):
+                min_cost = INVALID_LAYOUT_TIME
+                for k in range(len(self._record_cost_dict[node_idx])):
+                    min_cost = min(min_cost, cost_vec_x[k] + cost_vec_y[k]
+                                   + self._record_cost_dict[node_idx][k])
+                delta_matrix[i].append(min_cost)
+
+        if adj_node_x == adj_node_y:  # both edges lead to the same neighbor
+            for i, delta_row in enumerate(delta_matrix):
+                self._record_cost_dict[adj_node_x][i] += delta_row[i]
+        elif adj_node_x in self._adj_dict[adj_node_y]:  # neighbors already share an edge
+            for i, _ in enumerate(delta_matrix):
+                for j, delta in enumerate(delta_matrix[i]):
+                    self._layout_transform_interlayer_cost[(adj_node_x, adj_node_y)][i][j] \
+                        += delta
+                    self._layout_transform_interlayer_cost[(adj_node_y, adj_node_x)][j][i] \
+                        += delta
+        else:
+            self._insert_edge(adj_node_x, adj_node_y, delta_matrix)
+
+        self._remove_node(node_idx)
+        self._reorder_adj_nodes(node_idx)
+        self._stack.append(node_idx)
+
+    def _RN_reduction(self, node_idx):
+        """Reduce a node with degree greater than 2 by greedily fixing its record.
+        """
+        min_cost = INVALID_LAYOUT_TIME
+        record_idx = -1
+
+        for i, record_cost in enumerate(self._record_cost_dict[node_idx]):
+            current_cost = record_cost
+            for adj_node in self._adj_dict[node_idx]:
+                ltf_matrix = self._layout_transform_interlayer_cost[(node_idx, adj_node)]
+                adj_record_cost = list(self._record_cost_dict[adj_node])  # copy, not mutated
+                for j, ltf_cost in enumerate(ltf_matrix[i]):
+                    adj_record_cost[j] += ltf_cost
+                current_cost += min(adj_record_cost)
+            if current_cost < min_cost:
+                min_cost = current_cost
+                record_idx = i
+
+        if record_idx < 0:
+            raise RuntimeError("Can't find a solution for node %d when "
+                               "applying RN reduction" % node_idx)
+        self._optimal_record_dict[node_idx] = record_idx
+        self._is_optimal = False  # run() reports the result as sub-optimal from here on
+
+        for adj_node in self._adj_dict[node_idx]:
+            ltf_matrix = self._layout_transform_interlayer_cost[(node_idx, adj_node)]
+            for i, ltf_cost in enumerate(ltf_matrix[record_idx]):
+                self._record_cost_dict[adj_node][i] += ltf_cost
+
+        self._remove_node(node_idx)
+        self._reorder_adj_nodes(node_idx)
+        self._stack.append(node_idx)
+
+    def _forward(self):
+        """Forward pass in PBQP: repeatedly apply RI, RII or RN reduction to the graph.
+        """
+        while True:
+            if self._buckets[1]:
+                node_idx = self._buckets[1][0]
+                self._RI_reduction(node_idx)
+            elif self._max_degree >= 2 and self._buckets[2]:
+                node_idx = self._buckets[2][0]
+                self._RII_reduction(node_idx)
+            elif self._max_degree >= 3:
+                max_degree_node = -1
+                for i in range(self._max_degree, 2, -1):  # highest non-empty bucket first
+                    if self._buckets[i]:
+                        max_degree_node = self._buckets[i][0]
+                        self._RN_reduction(max_degree_node)
+                        break
+                if max_degree_node < 0:  # no node of degree >= 3 is left
+                    break
+            else:
+                break
+
+    def _backward(self):
+        """Backward pass in PBQP: pick a record index for every node.
+        """
+        # Solve nodes left in the forward graph
+        for node_idx in self._buckets[0]:
+            record_costs = self._record_cost_dict[node_idx]
+            min_cost = min(record_costs)
+            self._optimal_record_dict[node_idx] = record_costs.index(min_cost)
+
+        # Replay reduced nodes in reverse order; RN nodes already have a record chosen
+        for node_idx in reversed(self._stack):
+            self._backward_insert_node(node_idx)
+            if node_idx not in self._optimal_record_dict:
+                record_costs = list(self._record_cost_dict[node_idx])
+                for adj_node in self._adj_dict[node_idx]:
+                    adj_optimal_idx = self._optimal_record_dict[adj_node]
+                    for i, _ in enumerate(record_costs):
+                        record_costs[i] += \
+                            self._layout_transform_interlayer_cost \
+                                [(node_idx, adj_node)][i][adj_optimal_idx]
+                min_cost = min(record_costs)
+                self._optimal_record_dict[node_idx] = record_costs.index(min_cost)
+
+    def run(self, **kwargs):
+        """Run PBQP tuner: prepare cost matrices, then do the forward and backward passes.
+        """
+        self._logger.info("Start to run PBQP algorithm...")
+        # Define virtual record lists and layout transformation matrices
+        # for multi-input nodes.
+        input_names = self._input_shapes.keys()
+        temp = {}
+        for key, val in self._in_nodes_dict.items():
+            target_input_idx = -1
+            target_input_pos = -1
+            if has_multiple_inputs(self._node_list, key, input_names):
+                for i, item in enumerate(val):  # first in-node that is not a graph input
+                    if not is_input_node(self._node_list[item], input_names):
+                        target_input_idx = item
+                        target_input_pos = i
+                        break
+                temp[(target_input_idx, key)] = []  # 0 on diagonal, INVALID_LAYOUT_TIME elsewhere
+                record_candidates = self._node_list[target_input_idx]["record_candidates"]
+                for j in range(len(record_candidates)):
+                    temp[(target_input_idx, key)].append([])
+                    for k in range(len(record_candidates)):
+                        temp[(target_input_idx, key)][j].append(0 if j == k
+                                                                else INVALID_LAYOUT_TIME)
+
+                for j in range(target_input_pos + 1, len(val)):
+                    input_idx = val[j]
+                    if is_input_node(self._node_list[input_idx], input_names):
+                        continue
+                    temp[(input_idx, key)] = \
+                        self._layout_transform_interlayer_cost[(input_idx, target_input_idx)]
+        self._layout_transform_interlayer_cost.update(temp)
+
+        # Create reverse (transposed) layout transformation matrices
+        temp = {}
+        for idx_pair, ltf_matrix in self._layout_transform_interlayer_cost.items():
+            reverse_key = (idx_pair[1], idx_pair[0])
+            reverse_matrix = [[] for _ in range(len(ltf_matrix[0]))]
+            for i, _ in enumerate(ltf_matrix):
+                for j, ltf in enumerate(ltf_matrix[i]):
+                    reverse_matrix[j].append(ltf)
+            temp[reverse_key] = reverse_matrix
+        self._layout_transform_interlayer_cost.update(temp)
+
+        self._forward()
+        self._backward()
+        is_optimal = "optimal" if self._is_optimal else "sub-optimal"
+        msg = "Finished PBQPExecutor run. Got %s solution." % is_optimal
+        self._logger.info(msg)
diff --git a/python/tvm/autotvm/graph_tuner/utils/__init__.py b/python/tvm/autotvm/graph_tuner/utils/__init__.py
new file mode 100644
index 0000000..8b36e75
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/utils/__init__.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wildcard-import
+"""Graph tuner utility functions"""
+from __future__ import absolute_import
+
+from . import traverse_graph
+from . import utils
+
+from .traverse_graph import expr2graph, get_direct_ancestor, get_in_nodes, \
+    get_out_nodes
+from .utils import has_multiple_inputs, is_input_node, bind_inputs
diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
new file mode 100644
index 0000000..08f1017
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py
@@ -0,0 +1,312 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=too-many-locals,too-many-statements,too-many-branches,protected-access
+"""API for graph traversing."""
+import threading
+
+import topi
+
+from tvm import relay, autotvm
+from tvm.relay.expr import Call, Function, TupleGetItem, Var, Constant, Tuple
+from tvm.relay.ty import TupleType, TensorType
+from tvm.autotvm.task import TaskExtractEnv
+
+from .._base import RULE_OUT_NODE_NAMES
+from .utils import has_multiple_inputs, is_input_node
+
+
+# Map relay op base name -> list of topi compute functions to trace for that op
+# NOTE: To add more ops, change the following dictionary.
+OP2COMPUTE = {
+    "conv2d" : [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
+}
+
+
+def expr2graph(expr, target_ops, node_dict, node_list):
+    """Convert relay expr to graph data structure
+    and fetch workloads of target operators.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr.Function
+        Input relay function expression.
+
+    target_ops: List of str
+        List of target relay base op names; each must be a key of OP2COMPUTE.
+
+    node_dict : dictionary from tvm.relay.Expr to int
+        Dictionary to record node index. Filled in place.
+
+    node_list : list of dictionary
+        List of nodes which contains all expr in the input relay function. Filled in place.
+        Each node will be stored as a dictionary in the format of
+        {"op": str, "node": tvm.relay.expr, "inputs": [[int, int, int]],
+         "types": [tvm.relay.Type], "name": str, "workloads": [tuple], "topi_op": [task name]}
+    """
+    env = TaskExtractEnv.get(allow_duplicate=True)
+    topi_funcs = []
+    for op_name in target_ops:
+        if op_name not in OP2COMPUTE:
+            raise RuntimeError("Not supported relay op in graph tuner: %s"
+                               % op_name)
+        topi_funcs += OP2COMPUTE[op_name]
+    env.reset(topi_funcs)
+    _expr2graph_impl(expr, target_ops, node_dict, node_list)
+    task_pos = 0  # index into env.task_collection, advanced once per target-op node
+    for node_entry in node_list:
+        if node_entry["op"] in target_ops:
+            task_name, args = env.task_collection[task_pos]
+            task = autotvm.task.create(task_name, args,
+                                       target="llvm",
+                                       target_host=None,
+                                       template_key='direct')
+            node_entry["workloads"] = [task.workload]
+            node_entry["topi_op"] = [task_name]
+            task_pos += 1
+
+
+def _expr2graph_impl(expr, target_ops, node_dict, node_list):
+    """Post-order visit expr and record each supported node in node_dict and node_list.
+    """
+    def _traverse_expr(node):
+        if node in node_dict:
+            return
+        node_index = len(node_list)
+        node_entry = {"node": node, "inputs": [], "types": [],
+                      "op": "null", "name": None}
+
+        if isinstance(node, Call):
+            op_name = node.op.name.split(".")[-1]  # base op name: text after the last "."
+            node_entry["op"] = op_name
+            for arg in node.args:
+                in_node_idx = node_dict[arg]
+                if isinstance(arg, (Tuple, TupleGetItem)):
+                    node_entry["inputs"] += node_list[in_node_idx]["inputs"]
+                else:
+                    node_entry["inputs"].append([in_node_idx, 0, 0])
+            infer_out = relay.ir_pass.infer_type(node)
+            out_type = infer_out._checked_type_
+            if isinstance(out_type, TensorType):
+                node_entry["types"].append(out_type)
+            elif isinstance(out_type, TupleType):
+                for tupe_type in out_type.fields:
+                    node_entry["types"].append(tupe_type)
+            else:
+                raise RuntimeError("Unsupported output type %s in operator %s"
+                                   % (type(out_type), op_name))
+
+            # Utilize tracing target to fetch workload with topo-order.
+            # Since we only need workload, dummy target can be used to
+            # create task.
+            if op_name in target_ops:
+                params = []
+                for i, input_idx in enumerate(node_entry["inputs"]):
+                    input_node_entry = node_list[input_idx[0]]
+                    input_type = input_node_entry["types"][input_idx[1]]
+                    if not isinstance(input_node_entry["node"], (Var, Call)):
+                        raise RuntimeError("Graph tuner can only tune target "
+                                           "operators with input node of type "
+                                           "relay.expr.Var or relay.expr.Call. Now "
+                                           "find a target op %s with input type %s"
+                                           % (op_name, str(type(input_node_entry["node"]))))
+                    free_var = relay.Var("var_%d" % i, input_type)
+                    params.append(free_var)
+                call = relay.Call(node.op, params, node.attrs)
+                func = relay.Function(params, call)  # single-op function over fresh variables
+                relay.backend.compile_engine.get().clear()
+                build_thread = threading.Thread(target=relay.build,
+                                                args=(func,
+                                                      "llvm -device=tracing",
+                                                      None,
+                                                      None))
+                build_thread.start()
+                build_thread.join()  # block until the build thread has finished
+        elif isinstance(node, Var):
+            node_entry["name"] = node.name_hint
+            node_entry["types"] = [node.type_annotation]
+        elif isinstance(node, Function):
+            # Ignore root node since it equals to input function expression
+            if node != expr:
+                _expr2graph_impl(node, target_ops, node_dict, node_list)
+            return
+        elif isinstance(node, TupleGetItem):
+            node_entry["op"] = "TupleGetItem"
+            in_node_idx = node_dict[node.tuple_value]
+            node_entry["inputs"].append([in_node_idx, node.index, 0])
+        elif isinstance(node, Tuple):
+            node_entry["op"] = "Tuple"
+            for tuple_item in node:
+                in_node_idx = node_dict[tuple_item]
+                if isinstance(tuple_item, TupleGetItem):
+                    node_entry["inputs"] += node_list[in_node_idx]["inputs"]
+                elif isinstance(tuple_item, Tuple):
+                    raise RuntimeError("Graph tuner doesn't support nested tuple.")
+                else:
+                    node_entry["inputs"].append([in_node_idx, 0, 0])
+        elif isinstance(node, Constant):
+            pass  # recorded with the default "null" op entry
+        elif isinstance(node, relay.op.op.Op):
+            return  # operator objects are not recorded as graph nodes
+        else:
+            raise RuntimeError("Not supported relay node type in graph tuning: %s"
+                               % str(type(node)))
+        node_dict[node] = node_index
+        node_list.append(node_entry)
+
+    relay.ir_pass.post_order_visit(expr, _traverse_expr)
+
+
+def get_direct_ancestor(node_list, visited_dict, target_ops, node_idx, input_names):
+    """Given a node_list in relay function and a node index, return the
+    closest ancestors whose op is in target_ops or which are multi-input operators.
+
+    If node has multiple inputs, multiple ancestor nodes will be returned.
+
+    Parameters
+    ----------
+    node_list : list of dict of str to object
+        List of all nodes in a graph.
+
+    visited_dict : dict of int to list of int
+        Nodes and corresponding ancestors which have been visited. Updated in place.
+
+    target_ops: List of str
+        List of target relay base op name
+
+    node_idx : int
+        Input node index.
+
+    input_names : list of str
+        Names of graph input nodes.
+
+    Returns
+    -------
+    out : list of int
+        List of ancestor node index.
+    """
+    if node_idx in visited_dict:
+        return visited_dict[node_idx]
+    if is_input_node(node_list[node_idx], input_names):
+        return [node_idx]  # a graph input is its own ancestor; not cached in visited_dict
+    node = node_list[node_idx]
+    # Rule out nodes that directly consume an op listed in RULE_OUT_NODE_NAMES
+    is_rule_out = False
+    for item_idx in node["inputs"]:
+        item = node_list[item_idx[0]]
+        if item["op"] in RULE_OUT_NODE_NAMES:
+            is_rule_out = True
+            break
+    if is_rule_out:
+        visited_dict[node_idx] = []
+        return []
+
+    node_direct_ancestor = []
+    for item_idx in node["inputs"]:
+        item = node_list[item_idx[0]]
+        is_multiple_inputs = has_multiple_inputs(node_list, item_idx[0], input_names)
+        if item["op"] in target_ops or is_multiple_inputs:
+            node_direct_ancestor.append(item_idx[0])
+        else:
+            tmp = get_direct_ancestor(node_list, visited_dict, target_ops,
+                                      item_idx[0], input_names)
+            for tmp_item in tmp:
+                node_direct_ancestor.append(tmp_item)
+    if not has_multiple_inputs(node_list, node_idx, input_names) and node_direct_ancestor:
+        node_direct_ancestor = [node_direct_ancestor[0]]  # keep only the first ancestor
+    visited_dict[node_idx] = node_direct_ancestor
+    return node_direct_ancestor
+
+
+def get_in_nodes(node_list, target_ops, input_names):
+    """Create a dictionary mapping from target-op nodes or multi-input
+    nodes to closest input ancestors.
+
+    Parameters
+    ----------
+    node_list : list of dict of str to object
+        List of all nodes in a graph.
+
+    target_ops: List of str
+        List of target relay op
+
+    input_names : list of str
+        Names of graph input nodes.
+
+    Returns
+    -------
+    out : dict of int to list of int
+        Dictionary maps node index to closest input ancestors.
+    """
+
+    visited_dict = {}
+    in_node_dict = {}
+    for i, node in enumerate(node_list):
+        if node["op"] in RULE_OUT_NODE_NAMES:
+            continue
+        get_direct_ancestor(node_list, visited_dict, target_ops, i, input_names)
+    for key, val in visited_dict.items():
+        node = node_list[key]
+        is_multiple_inputs = has_multiple_inputs(node_list, key, input_names)
+        if node["op"] in target_ops or is_multiple_inputs:
+            in_node_dict[key] = val
+
+    # Remove nodes without ancestors, repeating until no empty entry is left
+    has_empty_node = True
+    out_node_dict = get_out_nodes(in_node_dict)  # computed once, before any removal
+    while has_empty_node:
+        empty_nodes = []
+        for key, val in in_node_dict.items():
+            if not val:
+                empty_nodes.append(key)
+        if empty_nodes:
+            has_empty_node = True
+            for node in empty_nodes:
+                del in_node_dict[node]
+                if node in out_node_dict:
+                    for out_node in out_node_dict[node]:
+                        in_node_dict[out_node].remove(node)
+        else:
+            has_empty_node = False
+
+    return in_node_dict
+
+
+def get_out_nodes(in_node_dict):
+    """Create output dictionary by reversing every edge of the input dictionary.
+
+    Parameters
+    ----------
+    in_node_dict : dict of int to list of int
+        Dictionary maps node index to closest input ancestors.
+        It can be created with get_in_nodes.
+
+    Returns
+    -------
+    out : dict of int to list of int
+        Dictionary maps node index to closest output nodes.
+    """
+    out_node_dict = {}
+    for key in in_node_dict:
+        out_node_dict[key] = []  # every key of in_node_dict gets an entry, possibly empty
+    for key, val in in_node_dict.items():
+        for item in val:
+            if item in out_node_dict:
+                out_node_dict[item].append(key)
+            else:
+                out_node_dict[item] = [key]  # ancestor that is not itself a key of in_node_dict
+
+    return out_node_dict
diff --git a/python/tvm/autotvm/graph_tuner/utils/utils.py b/python/tvm/autotvm/graph_tuner/utils/utils.py
new file mode 100644
index 0000000..6151734
--- /dev/null
+++ b/python/tvm/autotvm/graph_tuner/utils/utils.py
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=eval-used,invalid-name,too-many-arguments
+"""Utility functions"""
+from tvm import relay
+
+
+def has_multiple_inputs(node_list, node_idx, input_names):
+    """Check whether a node has more than one input node,
+    not counting "null"-op inputs that are not graph inputs.
+
+    Parameters
+    ----------
+    node_list : list of dict of str to object
+        List of all nodes in a graph.
+
+    node_idx : int
+        Node index to be checked.
+
+    input_names : list of str
+        List of input names of graph.
+
+    Returns
+    -------
+    out : bool
+        Whether the specified node has multiple input nodes
+    """
+    num_inputs = 0
+    node = node_list[node_idx]
+    for in_idx in node["inputs"]:
+        in_idx = in_idx[0]  # first element of an input entry is the node index
+        in_node = node_list[in_idx]
+        # Exclude parameter nodes
+        if in_node["op"] != "null" or is_input_node(in_node,
+                                                    input_names):
+            num_inputs += 1
+    return num_inputs > 1
+
+
+def is_input_node(node_entry, input_names):
+    """Whether a node entry has a "name" that is one of the graph input names.
+
+    Parameters
+    ----------
+    node_entry : dict
+        Node entry.
+
+    input_names : list of str
+        List of input names of graph.
+
+    Returns
+    -------
+    out : bool
+        Whether the node is an input node.
+    """
+    return "name" in node_entry and node_entry["name"] in input_names
+
+
+def bind_inputs(expr, input_shapes=None, input_dtypes="float32"):
+    """Bind input variables of a relay function expression
+    to new shapes and/or dtypes.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr.Function
+        Input relay function expression.
+
+    input_shapes : dict of str to tuple of int, optional
+        Input shapes. If None, expr is returned unchanged.
+
+    input_dtypes : str or dict of str to str, optional
+        Input dtypes. A single str is applied to every name in input_shapes.
+
+    Returns
+    -------
+    out : tvm.relay.Expr.Function
+        Relay function expression with rebound inputs, after type inference.
+    """
+    if input_shapes is None:
+        return expr
+    if isinstance(input_dtypes, str):
+        input_dtypes = {key : input_dtypes for key in input_shapes.keys()}
+
+    updated_input_dict = {}
+    for input_name in input_shapes.keys():
+        updated_input = relay.var(input_name, shape=input_shapes[input_name],
+                                  dtype=input_dtypes[input_name])
+        updated_input_dict[input_name] = updated_input
+
+    rebind_dict = {}
+    for var in expr.params:
+        if var.name_hint in updated_input_dict:  # params without a new shape stay as they are
+            rebind_dict[var] = updated_input_dict[var.name_hint]
+    updated_expr = relay.expr.bind(expr, rebind_dict)
+
+    return relay.ir_pass.infer_type(updated_expr)
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index ff50a4e..0a0e6e1 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -28,6 +28,7 @@ from .code_hash import attach_code_hash, attach_code_hash_to_arg
 from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \
     FallbackContext, clear_fallback_cache, ApplyGraphBest
 
-from .topi_integration import register_topi_compute, register_topi_schedule
+from .topi_integration import register_topi_compute, register_topi_schedule, \
+    TaskExtractEnv
 from .nnvm_integration import extract_from_graph, extract_from_multiple_graph
 from .relay_integration import extract_from_program, extract_from_multiple_program
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 3c98376..ef0cb56 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -74,7 +74,7 @@ class TaskExtractEnv:
     """Global environment for extracting tuning tasks from nnvm graph"""
     current = None
 
-    def __init__(self):
+    def __init__(self, allow_duplicate=False):
         import topi
 
         # topi compute -> autotvm task name
@@ -106,6 +106,7 @@ class TaskExtractEnv:
             topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw],
         }
 
+        self.allow_duplicate = allow_duplicate
         self._register_tracing()
         self._register_topi_task()
         self.task_collection = []
@@ -123,10 +124,9 @@ class TaskExtractEnv:
                     assert not kwargs, "Do not support extracting tuning tasks when" \
                                        "kwargs is used in TOPI function call." \
                                        "Please modify it to use only positional args."
-
                     if compute_func in self.wanted_topi_funcs:  # record this call
                         key = (self.topi_to_task[compute_func], serialize_args(args))
-                        if key not in self.task_collection:
+                        if self.allow_duplicate or key not in self.task_collection:
                             self.task_collection.append(key)
                     return compute_func.fdefault(*args)
             _local_scope(topi_compute)
@@ -262,16 +262,25 @@ class TaskExtractEnv:
         return self.task_collection
 
     @staticmethod
-    def get():
+    def get(allow_duplicate=False):
         """Get the single instance of TaskExtractEnv
 
+        Parameters
+        ----------
+        allow_duplicate : boolean
+            Whether to record every workload in the network, even
+            repeated ones (useful for graph tuning). The flag is also
+            applied to an already created instance on each call.
+
         Returns
         -------
         env: TaskExtractEnv
             The single instance of TaskExtractEnv
         """
         if not TaskExtractEnv.current:
-            TaskExtractEnv.current = TaskExtractEnv()
+            TaskExtractEnv.current = TaskExtractEnv(allow_duplicate)
+        else:
+            TaskExtractEnv.current.allow_duplicate = allow_duplicate
         return TaskExtractEnv.current
 
 
diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py
new file mode 100644
index 0000000..240da7f
--- /dev/null
+++ b/tests/python/unittest/test_graph_tuner_core.py
@@ -0,0 +1,254 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: We name this test file to start with test_graph_tuner
+# to make it execute after zero_rank tensor test cases. This
+# helps avoid topi arithmetic operator overloading issue:
+# https://github.com/dmlc/tvm/issues/3240.
+# TODO: restore the file name after this issue is resolved.
+import os
+import copy
+import numpy as np
+import tvm
+import tvm.relay.testing
+
+from tvm import autotvm
+from tvm import relay
+from tvm.autotvm.task import ConfigEntity
+from tvm.autotvm.measure import MeasureResult, MeasureInput
+from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
+from test_graph_tuner_utils import create_workload
+
+
+def _create_data(target, dshape, dtype, layout):
+    data = relay.var("data", shape=dshape, dtype=dtype)
+    w0 = relay.var("w0_weight")
+    conv0 = relay.nn.conv2d(data, w0, channels=16, kernel_size=(3, 3), padding=(1, 1))
+    w1 = relay.var("w1_weight")
+    conv1 = relay.nn.conv2d(conv0, w1, channels=32, kernel_size=(1, 1))
+    w2 = relay.var("w2_weight")
+    conv2 = relay.nn.conv2d(conv1, w2, channels=32, kernel_size=(3, 3), padding=(1, 1))
+    out = relay.add(conv1, conv2)
+    net = relay.Function(relay.ir_pass.free_vars(out), out)
+    net, params = relay.testing.create_workload(net)
+    tasks = autotvm.task.extract_from_program(net,
+                                              target=target,
+                                              params=params,
+                                              ops=(relay.op.nn.conv2d,))
+    wkl_list = [
+        create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1), (1, 1), layout, layout, dtype, dtype),
+        create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0), (1, 1), layout, layout, dtype, dtype),
+        create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1), (1, 1), layout, layout, dtype, dtype),
+    ]
+    costs = [0.04, 0.012, 0.03]
+    config_list = []
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [3, 1]],
+                      ["tile_oc", "sp", [4, 4]],
+                      ["tile_ow", "sp", [4, 2]],
+                      ["unroll_kw", "ot", True]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [2, 8]],
+                      ["tile_oc", "sp", [1, 32]],
+                      ["tile_oh", "ot", 1],
+                      ["tile_ow", "sp", [4, 2]]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [8, 4]],
+                      ["tile_oc", "sp", [4, 8]],
+                      ["tile_ow", "sp", [2, 4]],
+                      ["unroll_kw", "ot", False]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+
+    records = []
+    for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks):
+        task.workload = wkl
+        ms_input = MeasureInput(target=target, task=task, config=config)
+        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
+        records.append((ms_input, ms_output))
+
+    ltf_records = []
+    ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
+    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
+    ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg)
+    ltf_task = copy.deepcopy(tasks[0])
+    ltf_task.workload = ltf_wkl
+    ms_input = MeasureInput(target=target, task=ltf_task, config=None)
+    ms_output =  MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
+    ltf_records.append((ms_input, ms_output))
+
+    ltf_keys = []
+    ltf_arg = [tvm.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"]
+    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
+    ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg)
+    ltf_keys.append(ltf_wkl)
+    ltf_arg = [tvm.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"]
+    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
+    ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg)
+    ltf_keys.append(ltf_wkl)
+    ltf_arg = [tvm.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"]
+    ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg)
+    ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg)
+    ltf_keys.append(ltf_wkl)
+
+    return net, records, ltf_records, ltf_keys, tasks
+
+
+def test_graph_tuner_layout_transform():
+    log_file = "%s/test_tuner.log" % (os.getcwd())
+    target = "llvm"
+    dshape = (1, 3, 8, 8)
+    dtype = "float32"
+    layout = "NCHW"
+    target_ops = [relay.nn.conv2d]
+
+    g, records, ltf_records, ltf_keys, _ = _create_data(target, dshape, dtype, layout)
+    executor = DPTuner(g, {"data": dshape}, records, target_ops, target=target, log_file=log_file)
+    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
+    out = executor._layout_transform_perf_records
+
+    num_flops = 0
+    total_time = 0
+    for record in ltf_records:
+        ltf_wkl = record[0].task.workload
+        input_shape = ltf_wkl[1][1]
+        flops = np.prod(input_shape)
+        num_flops += flops
+        total_time += record[1].costs[0]
+    avg_time = total_time / num_flops
+
+    for ltf_workload in out:
+        input_shape = ltf_workload[1][1]
+        flops = 1
+        for i in input_shape:
+            flops *= i
+        expected_time = flops * avg_time
+        out_time = out[ltf_workload][1].costs[0]
+        assert expected_time == out_time, "Inferred layout transformation time mismatch for %s: " \
+                                          "expecting %f but got %f" % (str(ltf_workload), expected_time,
+                                                                       out_time)
+
+
+def test_DPTuner_run():
+    log_file = "%s/test_tuner.log" % (os.getcwd())
+    target = "llvm"
+    dtype = "float32"
+    layout = "NCHW"
+    dshape = (1, 3, 8, 8)
+    target_ops = [relay.nn.conv2d]
+
+    g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout)
+    costs = [0.02, 0.02, 0.045]
+    config_list = []
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [1, 3]],
+                      ["tile_oc", "sp", [2, 8]],
+                      ["tile_ow", "sp", [4, 2]],
+                      ["unroll_kw", "ot", True]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [4, 4]],
+                      ["tile_oc", "sp", [2, 16]],
+                      ["tile_oh", "ot", 1],
+                      ["tile_ow", "sp", [4, 2]]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [16, 2]],
+                      ["tile_oc", "sp", [8, 4]],
+                      ["tile_ow", "sp", [2, 4]],
+                      ["unroll_kw", "ot", False]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    for cost, config, task in zip(costs, config_list, tasks):
+        ms_input = MeasureInput(target=target, task=task, config=config)
+        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
+        records.append((ms_input, ms_output))
+
+    executor = DPTuner(g, {"data": dshape}, records, target_ops, target, log_file=log_file)
+    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
+    executor.run()
+    out = [record[0].config for record in executor.get_optimal_records()]
+    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
+    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
+                                % (str(expected_out), str(out))
+    assert os.path.isfile(log_file), "No log file with name %s exists." % log_file
+
+
+def test_PBQPTuner_run():
+    target = "llvm"
+    dtype = "float32"
+    layout = "NCHW"
+    dshape = (1, 3, 8, 8)
+    target_ops = [relay.nn.conv2d]
+
+    g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout)
+    costs = [0.02, 0.02, 0.045]
+    config_list = []
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [1, 3]],
+                      ["tile_oc", "sp", [2, 8]],
+                      ["tile_ow", "sp", [4, 2]],
+                      ["unroll_kw", "ot", True]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [4, 4]],
+                      ["tile_oc", "sp", [2, 16]],
+                      ["tile_oh", "ot", 1],
+                      ["tile_ow", "sp", [4, 2]]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    cfg_dict = {"i": -1,
+                "c": None,
+                "e": [["tile_ic", "sp", [16, 2]],
+                      ["tile_oc", "sp", [8, 4]],
+                      ["tile_ow", "sp", [2, 4]],
+                      ["unroll_kw", "ot", False]],
+                "t": ""}
+    config_list.append(ConfigEntity.from_json_dict(cfg_dict))
+    for cost, config, task in zip(costs, config_list, tasks):
+        ms_input = MeasureInput(target=target, task=task, config=config)
+        ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1)
+        records.append((ms_input, ms_output))
+
+    executor = PBQPTuner(g, {"data": dshape}, records, target_ops, target)
+    executor.benchmark_layout_transform(layout_records=ltf_records, infer_layout=True)
+    executor.run()
+    out = [record[0].config for record in executor.get_optimal_records()]
+    expected_out = [records[3][0].config, records[1][0].config, records[2][0].config]
+    assert expected_out == out, "Output mismatch: expecting %s but got %s" \
+                           % (str(expected_out), str(out))
+
+
+if __name__=="__main__":
+    test_graph_tuner_layout_transform()
+    test_DPTuner_run()
+    test_PBQPTuner_run()
diff --git a/tests/python/unittest/test_graph_tuner_utils.py b/tests/python/unittest/test_graph_tuner_utils.py
new file mode 100644
index 0000000..0847166
--- /dev/null
+++ b/tests/python/unittest/test_graph_tuner_utils.py
@@ -0,0 +1,149 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: We name this test file to start with test_graph_tuner
+# to make it execute after zero_rank tensor test cases. This
+# helps avoid topi arithmetic operator overloading issue:
+# https://github.com/dmlc/tvm/issues/3240
+# TODO: restore the file name after this issue is resolved.
+import tvm
+
+from tvm import autotvm, relay
+from tvm.relay.testing import resnet
+from tvm.autotvm.graph_tuner.utils import has_multiple_inputs, get_direct_ancestor, get_in_nodes, \
+    get_out_nodes, expr2graph, bind_inputs
+from tvm.relay.expr import Call, TupleGetItem, Tuple
+from topi.nn.conv2d import conv2d
+
+
+def create_workload(dshape, kshape, strides, padding, dilation, layout,
+                    out_layout, dtype, out_dtype):
+    """Build a topi conv2d autotvm workload from placeholders (out_layout is unused)."""
+    data_ph = tvm.placeholder(dshape, dtype=dtype)
+    kernel_ph = tvm.placeholder(kshape, dtype=dtype)
+    conv_args = [data_ph, kernel_ph, strides, padding, dilation, layout, out_dtype]
+    return autotvm.task.args_to_workload(conv_args, conv2d)
+
+
+def verify_has_multiple_inputs(node_list, node_idx, input_names, expected_result):
+    out = has_multiple_inputs(node_list, node_idx, input_names)
+    assert out == expected_result, "Output mismatch: expecting checking %s to be %s but got %s." \
+                                   % (node_list[node_idx]["op"], str(expected_result), str(out))
+
+
+def test_has_multiple_inputs():
+    """Check has_multiple_inputs on entries 2, 4 and 5 of a multiply/conv2d/add graph."""
+    data = relay.var("data")
+    scaled = data * relay.expr.const(3.0)
+    w0 = relay.var("w0")
+    conv = relay.nn.conv2d(data, w0)
+    total = relay.add(scaled, conv)
+    net = relay.Function(relay.ir_pass.free_vars(total), total)
+    net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1)})
+    target_ops = ["conv2d"]
+    node_list, node_dict = [], {}
+    expr2graph(net, target_ops, node_dict, node_list)
+    input_names = ["data"]
+    # (node index, expected result) pairs, verified in this order.
+    for node_idx, expected in ((2, False), (4, False), (5, True)):
+        verify_has_multiple_inputs(node_list, node_idx, input_names, expected)
+
+
+def test_expr2graph():
+    """Check that expr2graph records the same operator sequence as a post-order visit."""
+    net, _ = resnet.get_workload(num_layers=50, batch_size=1)
+    node_dict, node_list = {}, []
+    target_ops = ["conv2d"]
+    op_name_list = []
+    def _count_node(node):
+        # Skip bare operator objects so the branches below classify expression nodes.
+        if isinstance(node, relay.op.op.Op):
+            return
+        if isinstance(node, Call):
+            op_name_list.append(node.op.name.split(".")[-1])
+        elif isinstance(node, TupleGetItem):
+            op_name_list.append("TupleGetItem")
+        elif isinstance(node, Tuple):
+            op_name_list.append("Tuple")
+        else:
+            op_name_list.append("null")
+    relay.ir_pass.post_order_visit(net, _count_node)
+
+    expr2graph(net, target_ops, node_dict, node_list)
+    for i, (op_name, node) in enumerate(zip(op_name_list, node_list)):
+        assert op_name == node["op"], "%dth Node operator mismatch: expecting %s but got %s" \
+                                      % (i, str(op_name), str(node["op"]))
+
+
+def test_get_direct_ancestor():
+    """Check get_direct_ancestor returns [2, 0] when queried with index 5."""
+    data = relay.var("data")
+    w0 = relay.var("w0")
+    conv0 = relay.nn.conv2d(data, w0)
+    merged = relay.add(conv0, data * relay.expr.const(5.0))
+    shifted = merged + relay.expr.const(2.5)
+    w1 = relay.var("w1")
+    conv1 = relay.nn.conv2d(shifted, w1)
+    net = relay.Function(relay.ir_pass.free_vars(conv1), conv1)
+    shapes = {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)}
+    net = bind_inputs(net, shapes)
+    target_ops = ["conv2d"]
+    node_list, node_dict = [], {}
+    expr2graph(net, target_ops, node_dict, node_list)
+    # Pass a fresh visited_dict and ["data"] as the input_names.
+    ancestors = get_direct_ancestor(node_list, {}, target_ops, 5, ["data"])
+    assert ancestors == [2, 0], "Output mismatch: expecting [2, 0] but got %s." % str(ancestors)
+
+
+def test_get_in_nodes():
+    """Check the node indices get_in_nodes reports for a two-conv2d graph (keys only)."""
+    data = relay.var("data")
+    w0 = relay.var("w0")
+    conv0 = relay.nn.conv2d(data, w0)
+    residual = relay.add(conv0, data)
+    shifted = residual + relay.expr.const(2.5)
+    w1 = relay.var("w1")
+    conv1 = relay.nn.conv2d(shifted, w1)
+    net = relay.Function(relay.ir_pass.free_vars(conv1), conv1)
+    shapes = {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)}
+    net = bind_inputs(net, shapes)
+    target_ops, input_names = ["conv2d"], ["data"]
+    node_list, node_dict = [], {}
+    expr2graph(net, target_ops, node_dict, node_list)
+    out = get_in_nodes(node_list, target_ops, input_names)
+    expected_out = {7: [3], 3: [2, 0], 2: [0]}
+    # Only the key sets are compared; the per-node input lists are not checked.
+    if set(out) != set(expected_out):
+        raise RuntimeError("Output mismatch: expecting %s but got %s." % (str(expected_out), str(out)))
+
+
+def test_get_out_nodes():
+    """Check that get_out_nodes yields the expected node indices as keys (keys only)."""
+    in_nodes = {8: [4], 4: [3, 0], 3: [0]}
+    expected_out = {0: [3, 4], 3: [4], 4: [8], 8: []}
+    out_nodes = get_out_nodes(in_nodes)
+    if set(out_nodes) != set(expected_out):
+        raise RuntimeError("Output mismatch: expecting %s but got %s." % (str(expected_out), str(out_nodes)))
+
+
+
+if __name__ == "__main__":
+    test_has_multiple_inputs()
+    test_expr2graph()
+    test_get_direct_ancestor()
+    test_get_in_nodes()
+    test_get_out_nodes()
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 83e0274..57c1d20 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -94,6 +94,26 @@ def conv2d_alter_layout(attrs, inputs, tinfos, F):
     # not to change by default
     return None
 
+@tvm.target.generic_func
+def conv2d_infer_layout(workload, cfg):
+    """Infer input/output shapes and layouts from a workload and cfg.
+
+    Parameters
+    ----------
+    workload : tuple
+        conv2d workload
+    cfg : tvm.autotvm config
+        Tuning config; target overrides look up its knobs by name
+
+    Returns
+    -------
+    Output : [tuple of tuple and str, tuple of tuple and str]
+        Input shapes and layouts, and output shapes and layouts.
+        The generic default raises ValueError until a target registers one.
+    """
+    raise ValueError("missing register for topi.nn.conv2d_infer_layout")
+
+
 
 def _get_workload(data, kernel, stride, padding, out_dtype, data_layout='NCHW'):
     """ Get the workload structure. """
diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py
index 460f4fe..e703bec 100644
--- a/topi/python/topi/nn/depthwise_conv2d.py
+++ b/topi/python/topi/nn/depthwise_conv2d.py
@@ -336,3 +336,22 @@ def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation,
         5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
     """
     raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc")
+
+@tvm.target.generic_func
+def depthwise_conv2d_infer_layout(workload, cfg):
+    """Infer input/output shapes and layouts from a workload and cfg.
+
+    Parameters
+    ----------
+    workload : tuple
+        depthwise_conv2d workload
+    cfg : tvm.autotvm config
+        Tuning config; target overrides look up its knobs by name
+
+    Returns
+    -------
+    Output : [tuple of tuple and str, tuple of tuple and str]
+        Input shapes and layouts, and output shapes and layouts.
+        The generic default raises ValueError until a target registers one.
+    """
+    raise ValueError("missing register for topi.nn.depthwise_conv2d_infer_layout")
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index de18abd..d0894ad 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -28,7 +28,7 @@ from .. import generic, tag
 from .. import nn
 from ..util import get_const_tuple
 from ..nn.conv2d import conv2d, conv2d_NCHWc, \
-    conv2d_alter_layout, _get_workload as _get_conv2d_workload
+    conv2d_alter_layout, conv2d_infer_layout, _get_workload as _get_conv2d_workload
 from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
 from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw
 from ..nn.pad import pad
@@ -475,6 +475,21 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F):
         return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs)
 
 
+@conv2d_infer_layout.register("cpu")
+def _conv2d_infer_layout(workload, cfg):
+    """Infer NCHW[x]c input/output shapes and layouts of a conv2d workload from cfg."""
+    _, data, kernel, strides, padding, dilation, layout, dtype = workload
+    batch_size, in_channel, in_height, in_width = data[:-1]
+    out_channel, _, k_height, k_width = kernel[:-1]
+    # A dilated kernel spans (k - 1) * dilation + 1 input elements along each axis.
+    out_height = (in_height + 2 * padding[0] - (k_height - 1) * dilation[0] - 1) // strides[0] + 1
+    out_width = (in_width + 2 * padding[1] - (k_width - 1) * dilation[1] - 1) // strides[1] + 1
+    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic)
+    out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc)
+    return ((in_shape, "NCHW%dc" % tile_ic),), ((out_shape, "NCHW%dc" % tile_oc),)
+
+
 @autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct')
 def _declaration_conv_NCHWc(cfg, data, kernel, strides,
                             padding, dilation, layout, out_layout, out_dtype):
diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py
index f570aaf..6ea11f2 100644
--- a/topi/python/topi/x86/depthwise_conv2d.py
+++ b/topi/python/topi/x86/depthwise_conv2d.py
@@ -25,7 +25,8 @@ from .. import generic, tag
 from ..nn.pad import pad
 from ..util import get_const_tuple
 from ..nn.util import get_pad_tuple
-from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, _get_workload
+from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, _get_workload, \
+    depthwise_conv2d_infer_layout
 
 from .util import get_fp32_len
 
@@ -206,7 +207,7 @@ def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
     # change shape with the value in config
     ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
     new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn)
-    new_kernel_shape = (out_channel // oc_bn, kh, kw, oc_bn)
+    new_kernel_shape = (out_channel // oc_bn, 1, kh, kw, 1, oc_bn)
     new_data = tvm.placeholder(new_data_shape, data.dtype)
     new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)
 
@@ -217,3 +218,18 @@ def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
                                     data_layout, out_layout, dtype)
     s = schedule_depthwise_conv2d_NCHWc(cfg, [C])
     return s, [new_data, new_kernel, C]
+
+@depthwise_conv2d_infer_layout.register("cpu")
+def _depthwise_conv2d_infer_layout(workload, cfg):
+    """Infer NCHW[x]c input/output shapes and layouts of a depthwise conv2d workload."""
+    _, data, kernel, strides, padding, dilation, dtype = workload
+    batch_size, in_channel, in_height, in_width = data[:-1]
+    filter_channel, channel_multiplier, k_height, k_width = kernel[:-1]
+    out_channel = filter_channel * channel_multiplier
+    # A dilated kernel spans (k - 1) * dilation + 1 input elements along each axis.
+    out_height = (in_height + 2 * padding[0] - (k_height - 1) * dilation[0] - 1) // strides[0] + 1
+    out_width = (in_width + 2 * padding[1] - (k_width - 1) * dilation[1] - 1) // strides[1] + 1
+    tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic)
+    out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc)
+    return ((in_shape, "NCHW%dc" % tile_ic),), ((out_shape, "NCHW%dc" % tile_oc),)
diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py
index f100a35..ad35c19 100644
--- a/tutorials/autotvm/tune_relay_x86.py
+++ b/tutorials/autotvm/tune_relay_x86.py
@@ -30,6 +30,7 @@ from tvm import autotvm
 from tvm import relay
 from tvm.relay import testing
 from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
 import tvm.contrib.graph_runtime as runtime
 
 #################################################################
@@ -81,6 +82,7 @@ batch_size = 1
 dtype = "float32"
 model_name = "resnet-18"
 log_file = "%s.log" % model_name
+graph_opt_sch_file = "%s_graph_opt.log" % model_name
 
 # Set number of threads used for tuning based on the number of
 # physical CPU cores on your machine.
@@ -157,6 +159,16 @@ def tune_kernels(tasks,
                            autotvm.callback.progress_bar(n_trial, prefix=prefix),
                            autotvm.callback.log_to_file(log_filename)])
 
+# Use the graph tuner to achieve graph-level optimal schedules.
+# Set use_DP=False to use PBQPTuner instead if DPTuner takes too long to finish.
+def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
+    target_op = [relay.nn.conv2d]
+    Tuner = DPTuner if use_DP else PBQPTuner
+    executor = Tuner(graph, {"data": dshape}, records, target_op, target)
+    executor.benchmark_layout_transform(min_exec_num=2000)
+    executor.run()
+    executor.write_opt_sch2record_file(opt_sch_file)
+
 
 ########################################################################
 # Finally, we launch tuning jobs and evaluate the end-to-end performance.
@@ -171,9 +183,10 @@ def tune_and_evaluate(tuning_opt):
     # run tuning tasks
     print("Tuning...")
     tune_kernels(tasks, **tuning_opt)
+    tune_graph(net, data_shape, log_file, graph_opt_sch_file)
 
-    # compile kernels with history best records
-    with autotvm.apply_history_best(log_file):
+    # compile kernels with graph-level best records
+    with autotvm.apply_graph_best(graph_opt_sch_file):
         print("Compile...")
         with relay.build_config(opt_level=3):
             graph, lib, params = relay.build_module.build(
-- 
2.7.4