From f1f31b634d73eb5e260564cb3a4964d29aeecaac Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Wed, 17 Apr 2019 09:37:37 -0700
Subject: [PATCH] Eliminate AdjustBatch ops (#19083)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/19083

As we have discussed, there are too many AdjustBatch ops; they incur reallocation overhead and hurt performance. We eliminate these ops by
- inlining the input adjust-batch op into Glow
- inlining the output adjust-batch op into OnnxifiOp, and doing that only conditionally.

This is the C2 part of the change and requires a change on the Glow side to work e2e.

Reviewed By: rdzhabarov

Differential Revision: D14860582

fbshipit-source-id: ac2588b894bac25735babb62b1924acc559face6
---
 caffe2/operators/adjust_batch_op.cc                |  20 --
 caffe2/operators/adjust_batch_op.h                 |  76 ------
 caffe2/operators/onnxifi_op.cc                     |  74 +++++-
 caffe2/operators/onnxifi_op.h                      |  26 ++
 caffe2/opt/onnxifi_transformer.cc                  | 293 +++++++--------------
 caffe2/opt/onnxifi_transformer.h                   |  22 +-
 .../python/operator_test/adjust_batch_op_test.py   |  75 ------
 7 files changed, 202 insertions(+), 384 deletions(-)
 delete mode 100644 caffe2/operators/adjust_batch_op.cc
 delete mode 100644 caffe2/operators/adjust_batch_op.h
 delete mode 100644 caffe2/python/operator_test/adjust_batch_op_test.py

diff --git a/caffe2/operators/adjust_batch_op.cc b/caffe2/operators/adjust_batch_op.cc
deleted file mode 100644
index 1e29f5c..0000000
--- a/caffe2/operators/adjust_batch_op.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "caffe2/operators/adjust_batch_op.h"
-
-namespace caffe2 {
-REGISTER_CPU_OPERATOR(AdjustBatch, AdjustBatchOp<CPUContext>);
-OPERATOR_SCHEMA(AdjustBatch)
-    .NumInputs(1, 2)
-    .NumOutputs(1, 2)
-    .Input(0, "Input", "Input data")
-    .Input(1, "RealBatchSizeIn", "[Optional] Real batch size")
-    .Output(0, "Output", "Data with adjusted batch size")
-    .Output(1, "RealBatchSizeOut", "[Optional] Real batch size")
-    .Arg("max_batch_size", "(*int*): max batch size")
-    .SetDoc(R"DOC(
-Adjust the batch size of `input` tensor. When there is only 1 input, it adjusts the batch size according to the `max_batch_size` argument. In addition, if the op has two outputs, it records the input's real batch size in the second output. When there are 2 inputs, it expects the second input to contain the batch size to adjust to, and it truncates the input data accordingly.
-
-Github Links:
-- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/adjust_batch_op.cc
-
-  )DOC");
-} // namespace caffe2
diff --git a/caffe2/operators/adjust_batch_op.h b/caffe2/operators/adjust_batch_op.h
deleted file mode 100644
index a1044b1..0000000
--- a/caffe2/operators/adjust_batch_op.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#pragma once
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/operator.h"
-
-namespace caffe2 {
-
-template <class Context>
-class AdjustBatchOp final : public Operator<Context> {
- public:
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-  template <class... Args>
-  explicit AdjustBatchOp(Args&&... args)
-      : Operator<Context>(std::forward<Args>(args)...),
-        max_batch_size_(
-            this->template GetSingleArgument<int64_t>("max_batch_size", -1)) {}
-
-  bool RunOnDevice() override {
-    auto& input = Input(0);
-    vector<int64_t> output_dims(input.sizes().vec());
-    CAFFE_ENFORCE(!output_dims.empty());
-    if (InputSize() > 1) {
-      // TODO: if we have a second input and we have max_batch_size set, check
-      // the batch size of the two inputs for consistency
-      auto& batch_size = Input(1);
-      int64_t real_batch_size = *batch_size.template data<int64_t>();
-      int64_t max_batch_size = output_dims[0];
-      CAFFE_ENFORCE_GE(max_batch_size, real_batch_size);
-      output_dims[0] = real_batch_size;
-      auto* output = Output(0, output_dims, input.dtype());
-      this->context_.template CopyItems<Context, Context>(
-          input.dtype(),
-          input.numel() * real_batch_size / max_batch_size,
-          input.raw_data(),
-          output->raw_mutable_data(input.dtype()));
-    } else {
-      // Pad to max batch size
-      CAFFE_ENFORCE_GT(
-          max_batch_size_,
-          0,
-          "max_batch_size should be larger than 0. Got ",
-          max_batch_size_);
-
-      // TODO: ideally we can support the case when input batch is larger than
-      // the max_batch_size, as we can just pad to the multiple of
-      // max_batch_size.
-      CAFFE_ENFORCE_GE(max_batch_size_, output_dims.front());
-
-      int64_t real_batch_size = output_dims[0];
-      output_dims[0] = max_batch_size_;
-      auto* output = Output(0, output_dims, input.dtype());
-      math::Set<char, Context>(
-          output->nbytes(),
-          static_cast<char>(0),
-          static_cast<char*>(output->raw_data()),
-          &context_);
-      this->context_.template CopyItems<Context, Context>(
-          input.dtype(),
-          input.numel(),
-          input.raw_data(),
-          output->raw_mutable_data(input.dtype()));
-
-      if (OutputSize() > 1) {
-        auto* real_batch_tensor = Output(1, {1}, at::dtype<int64_t>());
-        real_batch_tensor->template mutable_data<int64_t>()[0] =
-            real_batch_size;
-      }
-    }
-
-    return true;
-  }
-
- private:
-  int64_t max_batch_size_;
-};
-} // namespace caffe2
diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc
index c686d4c..a375f85 100644
--- a/caffe2/operators/onnxifi_op.cc
+++ b/caffe2/operators/onnxifi_op.cc
@@ -167,6 +167,70 @@ OnnxifiOp<CPUContext>::buildInitializationList(
 }
 
 template <>
+std::vector<int64_t> OnnxifiOp<CPUContext>::extractOutputBatchSizes() const {
+  CAFFE_ENFORCE_EQ(
+      input_shapes_.size(),
+      InputSize(),
+      "Input shapes and input size don't match. ",
+      input_shapes_.size(),
+      " vs ",
+      InputSize());
+  CAFFE_ENFORCE_EQ(
+      output_shapes_.size(),
+      OutputSize(),
+      "Output shapes and output size don't match. ",
+      output_shapes_.size(),
+      " vs ",
+      OutputSize());
+
+  std::vector<int64_t> adjusted_output_batch;
+  for (const auto& shape : output_shapes_) {
+    if (shape.empty()) {
+      adjusted_output_batch.push_back(0);
+    } else {
+      const auto max_output_batch_size = shape.front();
+      const auto it = batch_pos_map_.find(max_output_batch_size);
+      if (it == batch_pos_map_.end()) {
+        if (use_onnx_) {
+          // For the ONNX path, it's possible that we have an output batch size
+          // that is unknown, because we handle the second output of Concat and
+          // Split in ONNX. But for the C2 path, we should never hit this
+          // condition.
+          adjusted_output_batch.push_back(0);
+          continue;
+        } else {
+          CAFFE_THROW("Unknown output max batch size: ", max_output_batch_size);
+        }
+      }
+      auto idx = it->second;
+      CAFFE_ENFORCE_LT(idx, input_shapes_.size(), "index out of bound");
+      const auto& input_shape = input_shapes_[idx];
+      // If the input's real batch size and the output's max batch size are the
+      // same, we don't need to adjust the max batch size of the output
+      if (input_shape.empty() || input_shape.front() == max_output_batch_size) {
+        adjusted_output_batch.push_back(0);
+      } else {
+        adjusted_output_batch.push_back(input_shape.front());
+      }
+    }
+  }
+
+  return adjusted_output_batch;
+}
+
+template <>
+void OnnxifiOp<CPUContext>::maybeAdjustOutputBatchSizes(
+    const std::vector<int64_t>& real_output_batch_sizes) {
+  CAFFE_ENFORCE_EQ(real_output_batch_sizes.size(), output_shapes_.size());
+  for (int i = 0; i < real_output_batch_sizes.size(); ++i) {
+    if (!real_output_batch_sizes[i]) {
+      continue;
+    }
+    auto* output_tensor = Output(i);
+    output_tensor->ShrinkTo(real_output_batch_sizes[i]);
+  }
+}
+
+template <>
 bool OnnxifiOp<CPUContext>::RunOnDevice() {
   CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
   input_shapes_.clear();
@@ -209,6 +273,7 @@ bool OnnxifiOp<CPUContext>::RunOnDevice() {
   bool ext_supported = false;
   onnxMemoryFenceV1 input_fence;
   onnxMemoryFenceV1 output_fence;
+  std::vector<int64_t> output_batch_sizes;
 #ifdef ONNXIFI_ENABLE_EXT
   /**
    * If onnxifi extension mode is enabled,
@@ -230,6 +295,7 @@ bool OnnxifiOp<CPUContext>::RunOnDevice() {
             &output_fence,
             /* traceEvents */ nullptr),
         ONNXIFI_STATUS_SUCCESS);
+    output_batch_sizes = extractOutputBatchSizes();
     CAFFE_ENFORCE_EQ(
         lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
     CAFFE_ENFORCE_EQ(
@@ -261,6 +327,7 @@ bool OnnxifiOp<CPUContext>::RunOnDevice() {
         ONNXIFI_STATUS_SUCCESS);
     CAFFE_ENFORCE_EQ(
         lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
+    output_batch_sizes = extractOutputBatchSizes();
     CAFFE_ENFORCE_EQ(
         lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
 
@@ -270,6 +337,8 @@ bool OnnxifiOp<CPUContext>::RunOnDevice() {
     CAFFE_ENFORCE_EQ(
         lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
   }
+
+  maybeAdjustOutputBatchSizes(output_batch_sizes);
   return true;
 }
 
@@ -285,5 +354,8 @@ OPERATOR_SCHEMA(Onnxifi)
         "(string default=\"\") Serialized ONNX model to be converted to backend representation")
     .Arg(
         "initializers",
-        "Initialization pair indicating the mapping of the name between NetDef and ONNX model");
+        "Initialization pair indicating the mapping of the name between NetDef and ONNX model")
+    .Arg(
+        "output_resize_hints",
+        "A list of key/value pairs indicating which input index to look up for the real batch size for the given max output batch size");
 } // namespace caffe2
diff --git a/caffe2/operators/onnxifi_op.h b/caffe2/operators/onnxifi_op.h
index 3965aaf..a8d0bf4 100644
--- a/caffe2/operators/onnxifi_op.h
+++ b/caffe2/operators/onnxifi_op.h
@@ -30,6 +30,7 @@ class OnnxifiOp final : public Operator<Context> {
     lib_ = onnx::initOnnxifiLibrary();
     backend_graph_map_ptr_ = onnx::getOnnxBackendGraphMap();
     CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library");
+    use_onnx_ = this->template GetSingleArgument<int>("use_onnx", 0);
     auto onnx_model_str =
         this->template GetSingleArgument<std::string>("onnx_model", "");
     CAFFE_ENFORCE(!onnx_model_str.empty(), "onnx_model cannot be empty");
@@ -64,6 +65,19 @@ class OnnxifiOp final : public Operator<Context> {
       ++output_idx;
     }
 
+    // Get output resizing hints
+    auto output_resize_hints =
+        this->template GetRepeatedArgument<int64_t>("output_resize_hints");
+    CAFFE_ENFORCE_EQ(
+        output_resize_hints.size() % 2,
+        0,
"output_resize_hints must have even size: ", + output_resize_hints.size()); + for (int i = 0; i < output_resize_hints.size(); ++i) { + auto k = output_resize_hints[i++]; + batch_pos_map_.emplace(k, output_resize_hints[i]); + } + // Encode arguments starting with "custom_" to backend std::vector property_pointers; std::vector int_args; @@ -202,6 +216,11 @@ class OnnxifiOp final : public Operator { #endif } + std::vector extractOutputBatchSizes() const; + + void maybeAdjustOutputBatchSizes( + const std::vector& real_output_batch_sizes); + std::vector buildInitializationList( Workspace* ws, const std::vector& initializers, @@ -233,6 +252,7 @@ class OnnxifiOp final : public Operator { onnxMemoryFenceV1*, onnxTraceEventList*); #endif + bool use_onnx_{false}; // We bind the op input/output by position while ONNXIFI binds input/output by // names. In addition, op input/output names can be writtten by, for example, @@ -246,6 +266,12 @@ class OnnxifiOp final : public Operator { // output shape hints std::unordered_map output_shape_hints_; + + // Output resizing hint map + // key: max batch size + // value: position of the input where the real batch size can be extracted + // from its first dimension + std::unordered_map batch_pos_map_; }; } // namespace caffe2 diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 7a1ff8b..0c0e8fe 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -167,47 +167,12 @@ void fillModelInfo(::ONNX_NAMESPACE::ModelProto* model) { opset_id->set_version(7); } -std::string MakeSeqSizeBlob(const std::string& blob_name) { - return blob_name + "_real_seq_size"; -} - -std::string MakeOutputForAdjustBatchOp(const std::string& input) { - return input + "_post_adjust_batch"; -} - -std::string MakeInputForAdjustBatchOp(const std::string& output) { - return output + "_pre_adjust_batch"; -} - -OperatorDef MakeAdjustBatchOp( - const std::string& input_blob, - const std::string& output_blob, - int max_batch_size, - const std::string& real_batch_size_blob, - bool adjust_to_max_batch_size) { - OperatorDef adjust_batch_op; - adjust_batch_op.set_type("AdjustBatch"); - auto* arg = adjust_batch_op.add_arg(); - arg->set_name("max_batch_size"); - arg->set_i(max_batch_size); - adjust_batch_op.add_input(input_blob); - adjust_batch_op.add_output(output_blob); - if (adjust_to_max_batch_size) { - if (!real_batch_size_blob.empty()) { - adjust_batch_op.add_output(real_batch_size_blob); - } - } else { - adjust_batch_op.add_input(real_batch_size_blob); - } - return adjust_batch_op; -} - -std::unordered_set ToHashSet( +std::unordered_set toHashSet( const ::google::protobuf::RepeatedPtrField& strs) { return std::unordered_set(strs.begin(), strs.end()); } -int64_t GetBlob1stDimSize( +int64_t getBlob1stDimSize( const ShapeInfo& shape_info, const string& blob_name) { if (shape_info.shape.dims_size() == 0) { @@ -217,125 +182,9 @@ int64_t GetBlob1stDimSize( } } -// Generates AdjustBatchOps for external inputs/outputs with type BATCH or -// SEQ and adds them to input_ops and output_ops. -// Meanwhile, modifies inputs/outputs of corresponding operators in the -// onnxifi_net to use the new inputs/outputs of AdjustBatchOps. 
-std::unordered_map<std::string, std::string> AddAdjustBatchOps(
-    const ShapeInfoMap& shape_hints,
-    NetDef* onnxifi_net,
-    vector<OperatorDef>* input_ops,
-    vector<OperatorDef>* output_ops) {
-  std::unordered_map<std::string, std::string> renaming_map;
-  const auto external_inputs = ToHashSet(onnxifi_net->external_input());
-  const auto external_outputs = ToHashSet(onnxifi_net->external_output());
-  std::unordered_set<std::string> real_batch_size_blobs;
-  std::unordered_set<std::string> post_adjust_inputs;
-
-  for (auto& op : *(onnxifi_net->mutable_op())) {
-    // Add AdjustBatchOp for all external inputs with type BATCH or SEQ.
-    // This will adjust the batch/seq size to the batch/seq size inferred by
-    // bound_shape_inference. Note that we only produce real batch size tensor
-    // once to avoid data race. In addition, for each input we only create one
-    // AdjustBatch op for the same reason.
-    for (auto& input_blob : *(op.mutable_input())) {
-      if (external_inputs.count(input_blob)) {
-        auto shape_info_it = shape_hints.find(input_blob);
-        if (shape_info_it == shape_hints.end()) {
-          LOG(WARNING) << "Cannot find shape_info for external input blob: "
-                       << input_blob;
-          continue;
-        }
-        std::string real_batch_size_blob = "";
-        auto max_batch_size = 0;
-        if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
-          max_batch_size = GetBlob1stDimSize(shape_info_it->second, input_blob);
-          real_batch_size_blob =
-              kRealBatchSizeBlob + "_" + c10::to_string(max_batch_size);
-        } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
-          max_batch_size = GetBlob1stDimSize(shape_info_it->second, input_blob);
-          real_batch_size_blob = MakeSeqSizeBlob(input_blob);
-        } else {
-          continue;
-        }
-
-        auto output_blob = MakeOutputForAdjustBatchOp(input_blob);
-        auto ret = real_batch_size_blobs.emplace(real_batch_size_blob);
-        if (post_adjust_inputs.emplace(output_blob).second) {
-          input_ops->push_back(MakeAdjustBatchOp(
-              input_blob,
-              output_blob,
-              max_batch_size,
-              ret.second ? real_batch_size_blob : "",
-              true /* adjust_to_max_batch_size */));
-        }
-        renaming_map[input_blob] = output_blob;
-        input_blob = output_blob;
-      } else if (renaming_map.count(input_blob)) {
-        // It is possible that input of a certain op is the output of its
-        // predecessor op, which happens to be an external_output. In this case,
-        // the tensor would have been renamed to X_pre_batch_adjust. Therefore,
-        // we need to rename input X to X_pre_batch_adjust too.
-        input_blob = renaming_map[input_blob];
-      }
-    }
-    // Add AdjustBatchOp for all external outputs with type BATCH if the real
-    // batch size is presented. This will adjust the batch size to the
-    // original batch size.
- for (auto& output_blob : *(op.mutable_output())) { - if (external_outputs.count(output_blob)) { - auto shape_info_it = shape_hints.find(output_blob); - CAFFE_ENFORCE( - shape_info_it != shape_hints.end(), - "Cannot find shape info for ", - output_blob, - " for AdjustBatchOp insertion"); - if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) { - auto max_batch_size = - GetBlob1stDimSize(shape_info_it->second, output_blob); - std::string real_size_blob = - kRealBatchSizeBlob + "_" + c10::to_string(max_batch_size); - CAFFE_ENFORCE( - real_batch_size_blobs.count(real_size_blob), - output_blob, - ": Cannot find ", - real_size_blob, - " to make AdjustBatchOp"); - auto input_blob = MakeInputForAdjustBatchOp(output_blob); - output_ops->push_back(MakeAdjustBatchOp( - input_blob, - output_blob, - max_batch_size, - real_size_blob, - false /* adjust_to_max_batch_size */)); - renaming_map[output_blob] = input_blob; - output_blob = input_blob; - } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) { - LOG(WARNING) << "It's unusual that output tesnor " << output_blob - << " is of dim_type SEQ. " - << "AdjustBatchOp won't attached " - << "and it might degrade the performance"; - } - } - } - } - - return renaming_map; -} - -NetDef ComposeResultNet( - const vector& input_ops, - const vector& output_ops, - const OperatorDef& onnxifi_op) { +NetDef composeResultNet(const OperatorDef& onnxifi_op) { NetDef net_opt; - for (const auto& op : input_ops) { - net_opt.add_op()->CopyFrom(op); - } net_opt.add_op()->CopyFrom(onnxifi_op); - // Add AdjustBatch ops for output blobs to the net. - for (const auto& op : output_ops) { - net_opt.add_op()->CopyFrom(op); - } return net_opt; } @@ -364,12 +213,66 @@ OnnxifiTransformer::~OnnxifiTransformer() { } } +std::unordered_map +OnnxifiTransformer::generateBatchPaddingHints( + const NetDef& onnxifi_net, + const ShapeInfoMap& shape_hints) { + std::unordered_map batch_pos_map; + const auto external_inputs = toHashSet(onnxifi_net.external_input()); + const auto external_outputs = toHashSet(onnxifi_net.external_output()); + for (const auto& op : onnxifi_net.op()) { + for (auto i = 0; i < op.input_size(); ++i) { + const auto& input_blob = op.input(i); + if (external_inputs.count(input_blob)) { + auto shape_info_it = shape_hints.find(input_blob); + if (shape_info_it == shape_hints.end()) { + LOG(WARNING) << "Cannot find shape_info for external input blob: " + << input_blob; + continue; + } + if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH || + shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) { + batch_pos_map.emplace( + getBlob1stDimSize(shape_info_it->second, input_blob), input_blob); + } + } + } + + // Correctness check on the output + for (const auto& output_blob : op.output()) { + if (external_outputs.count(output_blob)) { + auto shape_info_it = shape_hints.find(output_blob); + CAFFE_ENFORCE( + shape_info_it != shape_hints.end(), + "Cannot find shape info for ", + output_blob, + " to adjust output batch size"); + if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) { + auto max_batch_size = + getBlob1stDimSize(shape_info_it->second, output_blob); + CAFFE_ENFORCE( + batch_pos_map.count(max_batch_size), + "Cannot find input with max batch size", + max_batch_size); + } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) { + LOG(WARNING) << "It's unusual that output tensor " << output_blob + << " is of dim_type SEQ. 
" + << "AdjustBatchOp won't attached " + << "and it might degrade the performance"; + } + } + } + } + return batch_pos_map; +} + OperatorDef OnnxifiTransformer::BuildOnnxifiOp( const std::string& onnx_model_str, const std::unordered_map& output_shape_hints, const std::unordered_set& initialization_list, const std::vector& external_inputs, - const std::vector& external_outputs) { + const std::vector& external_outputs, + const std::unordered_map& batch_pos_map) { OperatorDef op; op.set_type("Onnxifi"); auto* onnx_model_arg = op.add_arg(); @@ -385,12 +288,15 @@ OperatorDef OnnxifiTransformer::BuildOnnxifiOp( } // Add the input/output + std::unordered_map input_pos_map; + int idx = 0; auto* input_names = op.add_arg(); input_names->set_name("input_names"); for (const auto& input : external_inputs) { if (!initialization_list.count(input)) { op.add_input(input); input_names->add_strings(input); + input_pos_map.emplace(input, idx++); } } auto* output_names = op.add_arg(); @@ -427,6 +333,19 @@ OperatorDef OnnxifiTransformer::BuildOnnxifiOp( AddArgument(kModelId, model_id_, &op); AddArgument(kNetPos, c10::to_string(onnxifi_op_id_++), &op); + // Add output resizing hints + auto* resize_arg = op.add_arg(); + resize_arg->set_name("output_resize_hints"); + for (const auto kv : batch_pos_map) { + const auto it = input_pos_map.find(kv.second); + CAFFE_ENFORCE( + it != input_pos_map.end(), + "Cannot find input in OnnxifiOp: ", + kv.second); + resize_arg->add_ints(kv.first); + resize_arg->add_ints(it->second); + } + return op; } @@ -465,16 +384,8 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2( } } - // Insert AdjustBatch ops, note that this step will possibly change the names - // of the input/output, so we need to create a mapping and use the renamed - // names for external_inputs/outputs and input_shape_info for the onnxifi_net. 
-  vector<OperatorDef> input_ops;
-  vector<OperatorDef> output_ops;
-  std::unordered_map<std::string, std::string> renaming_map;
-  if (opts_.add_adjust_batch_ops) {
-    renaming_map =
-        AddAdjustBatchOps(shape_hints, &onnxifi_net, &input_ops, &output_ops);
-  }
+  // Add batch padding hints
+  auto batch_pos_map = generateBatchPaddingHints(onnxifi_net, shape_hints);
 
   // Figure out weights and add them to external_inputs too
   std::unordered_set<std::string> initialization_list;
@@ -491,36 +402,25 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2(
   qshape_arg->set_name("input_qshape_info");
   onnxifi_net.clear_external_input();
   for (const auto& i : total_inputs_vec) {
-    auto input = i;
-    const auto it = renaming_map.find(i);
-    if (it != renaming_map.end()) {
-      input = it->second;
-    }
-    onnxifi_net.add_external_input(input);
+    onnxifi_net.add_external_input(i);
     auto info = shape_hints.at(i);
     if (!info.is_quantized) {
       shape_arg->mutable_tensors()->Add()->CopyFrom(
-          wrapShapeInfoIntoTensorProto(input, shape_hints.at(i)));
+          wrapShapeInfoIntoTensorProto(i, shape_hints.at(i)));
     } else {
       qshape_arg->mutable_qtensors()->Add()->CopyFrom(
-          wrapShapeInfoIntoQTensorProto(input, shape_hints.at(i)));
+          wrapShapeInfoIntoQTensorProto(i, shape_hints.at(i)));
     }
   }
 
   // Compute output shape hints
   std::unordered_map<std::string, TensorShape> output_shape_hints;
-  for (auto& o : *onnxifi_net.mutable_external_output()) {
-    auto output = o;
-    const auto rit = renaming_map.find(o);
-    if (rit != renaming_map.end()) {
-      output = rit->second;
-    }
+  for (const auto& o : onnxifi_net.external_output()) {
     const auto it = shape_hints.find(o);
     CAFFE_ENFORCE(
         it != shape_hints.end(), "Cannot find shape info for output ", o);
     const auto& shape = it->second.shape;
-    output_shape_hints.emplace(output, shape);
-    o = output;
+    output_shape_hints.emplace(o, shape);
   }
 
   // Build ONNXIFI Op
@@ -536,8 +436,9 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2(
       output_shape_hints,
       initialization_list,
       onnxifi_net_inputs,
-      onnxifi_net_outputs);
-  NetDef net_opt = ComposeResultNet(input_ops, output_ops, onnxifi_op);
+      onnxifi_net_outputs,
+      batch_pos_map);
+  NetDef net_opt = composeResultNet(onnxifi_op);
 
   // Debugging stuff
   if (opts_.debug) {
@@ -564,13 +465,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx(
   fillModelInfo(&onnx_model);
 
   caffe2::NetDef onnxifi_net(net);
-  vector<OperatorDef> input_ops;
-  vector<OperatorDef> output_ops;
-  auto renaming_map =
-      AddAdjustBatchOps(*shape_hints, &onnxifi_net, &input_ops, &output_ops);
-  for (const auto& kv : renaming_map) {
-    shape_hints_onnx_.emplace(kv.second, shape_hints_onnx_.at(kv.first));
-  }
+  auto batch_pos_map = generateBatchPaddingHints(onnxifi_net, *shape_hints);
 
   // Convert c2 ops to onnx ops, add const weights if there are any
   DeviceOption option;
@@ -625,12 +520,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx(
   // Convert outputs and compute output shape hints
   std::vector<std::string> onnxifi_net_outputs;
   for (const auto& o : net.external_output()) {
-    auto output = o;
-    const auto it = renaming_map.find(o);
-    if (it != renaming_map.end()) {
-      output = it->second;
-    }
-    onnxifi_net_outputs.emplace_back(output);
+    onnxifi_net_outputs.emplace_back(o);
   }
   auto io_vec = convertToValueInfo(
       onnxifi_net_outputs,
@@ -657,12 +547,6 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx(
       extra_weights,
       &initialization_list,
       &onnxifi_net_inputs);
-  for (auto& i : onnxifi_net_inputs) {
-    const auto it = renaming_map.find(i);
-    if (it != renaming_map.end()) {
-      i = it->second;
-    }
-  }
   io_vec = convertToValueInfo(
       onnxifi_net_inputs,
       shape_hints_onnx_,
@@ -679,8 +563,9 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaOnnx(
       output_shape_hints,
       initialization_list,
       onnxifi_net_inputs,
-      onnxifi_net_outputs);
-  NetDef net_opt = ComposeResultNet(input_ops, output_ops, onnxifi_op);
+      onnxifi_net_outputs,
+      batch_pos_map);
+  NetDef net_opt = composeResultNet(onnxifi_op);
 
   // Debugging stuff
   if (opts_.debug) {
diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h
index c93fd7f..48a2d9b 100644
--- a/caffe2/opt/onnxifi_transformer.h
+++ b/caffe2/opt/onnxifi_transformer.h
@@ -23,13 +23,7 @@ struct OnnxifiTransformerOptions {
   bool debug{false};
 
   // Pass serialized onnx model if true, otherwise pass serialized c2 model
-  bool use_onnx{true};
-
-  // Whether to attach AdjustBatch ops or not. In order to maintain static
-  // shapes to the backend, most of the time, we need to add AdjustBatch ops to
-  // the inputs/outputs of the Onnxifi op. But if the backend itself supports
-  // max batch size, we don't need to do it.
-  bool add_adjust_batch_ops{true};
+  bool use_onnx{false};
 
   // Minimum number of ops to create an onnxifi op. If the subgraph is too
   // small, it doesn't make sense to lower it to backend.
@@ -76,7 +70,8 @@ class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase {
       const std::unordered_map<std::string, TensorShape>& output_size_hints,
       const std::unordered_set<std::string>& initialization_list,
       const std::vector<std::string>& external_inputs,
-      const std::vector<std::string>& external_outputs);
+      const std::vector<std::string>& external_outputs,
+      const std::unordered_map<int64_t, std::string>& batch_pos_map);
 
   // Transform by passing C2 proto to backend
   NetDef TransformViaC2(
@@ -107,6 +102,17 @@ class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase {
       const std::unordered_set<std::string>& blacklisted_ops,
      onnxBackendID backend_id) const;
 
+  // Go through the inputs of the onnxifi subgraph and extract their max batch
+  // sizes. This info hints how we can shrink the output batch size from
+  // max_batch_size. The returned key/value is max_batch_size/input_name. For
+  // example, when OnnxifiOp sees an output with max batch size 100, it looks
+  // up the map with key=100, reads the real batch size from the first
+  // dimension of the corresponding input, say 50, and uses that to shrink the
+  // output tensor to the real batch size.
+  std::unordered_map<int64_t, std::string> generateBatchPaddingHints(
+      const NetDef& onnxifi_net,
+      const ShapeInfoMap& shape_hints);
+
   // Tie the output of Gather to the scalar weight input of the
   // SparseLengthsWeighted* op. If the latter is disabled, disable the former
   // too.
diff --git a/caffe2/python/operator_test/adjust_batch_op_test.py b/caffe2/python/operator_test/adjust_batch_op_test.py deleted file mode 100644 index f4dffd3..0000000 --- a/caffe2/python/operator_test/adjust_batch_op_test.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from caffe2.python import core, workspace -from hypothesis import given, assume -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - -import unittest -import os - - -class TestAdjustBatchOp(hu.HypothesisTestCase): - @given(d=st.integers(1, 4), n=st.integers(1, 20), - seed=st.integers(0, 1000), **hu.gcs_cpu_only) - def test_pad(self, d, n, gc, dc, seed): - for dtype in [np.float32, np.int8, np.int64]: - np.random.seed(seed) - dims = [n] * d - X = np.random.rand(*dims).astype(dtype) - max_batch_size = n + 8 - - def ref_op(X): - shape = list(X.shape) - out = np.zeros((1), dtype=np.int64) - out[0] = shape[0] - shape[0] = max_batch_size - Y = np.zeros(shape, dtype=dtype) - Y[:n] = X - return [Y, out] - - op = core.CreateOperator( - "AdjustBatch", - ["X"], - ["Y", "RealBatch"], - max_batch_size=max_batch_size, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=ref_op, - ) - - @given(d=st.integers(1, 4), n=st.integers(8, 20), - seed=st.integers(0, 1000), **hu.gcs_cpu_only) - def test_truncate(self, d, n, gc, dc, seed): - for dtype in [np.float32, np.int8, np.int64]: - np.random.seed(seed) - dims = [n] * d - X = np.random.rand(*dims).astype(dtype) - real_batch_size = n - 8 - R = np.zeros((1), dtype=np.int64) - R[0] = real_batch_size - - def ref_op(X, R): - r = R[0] - return [X[:r]] - - op = core.CreateOperator( - "AdjustBatch", - ["X", "RealBatch"], - ["Y"], - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, R], - reference=ref_op, - ) -- 2.7.4
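Note on the "output_resize_hints" encoding. The hint argument added by this patch is a flat int list of pairs: [max_batch_size_0, input_pos_0, max_batch_size_1, input_pos_1, ...]. The sketch below is a minimal, self-contained model of the parsing loop in the OnnxifiOp constructor; parseResizeHints is a hypothetical helper, not part of the patch.

#include <cstdint>
#include <unordered_map>
#include <vector>

// Decode the flat pair list into the map the op keeps as batch_pos_map_:
// key = max batch size, value = position of the input carrying the real
// batch size in its first dimension.
std::unordered_map<int64_t, int> parseResizeHints(
    const std::vector<int64_t>& output_resize_hints) {
  std::unordered_map<int64_t, int> batch_pos_map;
  // An odd-sized list is malformed; the real op enforces size % 2 == 0.
  for (size_t i = 0; i + 1 < output_resize_hints.size(); i += 2) {
    batch_pos_map.emplace(
        output_resize_hints[i],
        static_cast<int>(output_resize_hints[i + 1]));
  }
  return batch_pos_map;
}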
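Runtime side, extractOutputBatchSizes() plus maybeAdjustOutputBatchSizes() replace the old output AdjustBatch op: instead of copying into a truncated tensor, the op shrinks each output in place via ShrinkTo. A standalone sketch of that lookup, with shapes simplified to plain dim vectors rather than the op's internal shape records:

#include <cstdint>
#include <unordered_map>
#include <vector>

// For each output: take its max batch size (dim 0), find the input it was
// padded from via batch_pos_map, and report that input's real batch size,
// or 0 when no shrinking is needed.
std::vector<int64_t> adjustedOutputBatches(
    const std::vector<std::vector<int64_t>>& input_shapes,
    const std::vector<std::vector<int64_t>>& output_shapes,
    const std::unordered_map<int64_t, int>& batch_pos_map) {
  std::vector<int64_t> adjusted;
  for (const auto& shape : output_shapes) {
    if (shape.empty()) {
      adjusted.push_back(0); // scalar output: nothing to shrink
      continue;
    }
    const int64_t max_batch = shape.front();
    const auto it = batch_pos_map.find(max_batch);
    if (it == batch_pos_map.end()) {
      adjusted.push_back(0); // unknown max batch size: leave output alone
      continue;
    }
    const auto& in = input_shapes.at(it->second);
    // Shrink only when the input's real batch differs from the padded max.
    adjusted.push_back(
        (in.empty() || in.front() == max_batch) ? 0 : in.front());
  }
  return adjusted;
}

For instance, with batch_pos_map = {{100, 0}}, input_shapes = {{50, 16}} and output_shapes = {{100, 8}}, this returns {50}; the op would then call ShrinkTo(50) on that output, while a returned 0 leaves the output untouched.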
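On the transformer side, generateBatchPaddingHints() keys each BATCH/SEQ input's max first-dim size to the input's name, and BuildOnnxifiOp() then translates names to runtime input positions and flattens the pairs into the argument. A simplified sketch of that pairing, under the same assumptions as above (buildResizeHints and its plain-container types are illustrative stand-ins for the caffe2 protos):

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

std::vector<int64_t> buildResizeHints(
    const std::unordered_map<int64_t, std::string>& batch_pos_map,
    const std::vector<std::string>& op_inputs) {
  // Recover each input's position among the op's runtime inputs.
  std::unordered_map<std::string, int> input_pos_map;
  for (int i = 0; i < static_cast<int>(op_inputs.size()); ++i) {
    input_pos_map.emplace(op_inputs[i], i);
  }
  std::vector<int64_t> hints;
  for (const auto& kv : batch_pos_map) {
    const auto it = input_pos_map.find(kv.second);
    if (it == input_pos_map.end()) {
      continue; // the real transformer hard-fails here via CAFFE_ENFORCE
    }
    hints.push_back(kv.first);   // key: max batch size
    hints.push_back(it->second); // value: input position
  }
  return hints;
}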