}
+// Computes, for each output, the real batch size it should be shrunk to.
+// Returns one entry per output: 0 means "leave the output at its max batch
+// size"; a non-zero value is the real (first-dim) batch size taken from the
+// input that shares the same max batch size (looked up via batch_pos_map_).
+// Must be called after input_shapes_/output_shapes_ are populated for the run.
template <>
+std::vector<int> OnnxifiOp<float, CPUContext>::extractOutputBatchSizes() const {
+ CAFFE_ENFORCE_EQ(
+ input_shapes_.size(),
+ InputSize(),
+ "Input shapes and input size don't match. ",
+ input_shapes_.size(),
+ " vs ",
+ InputSize());
+ CAFFE_ENFORCE_EQ(
+ output_shapes_.size(),
+ OutputSize(),
+ "Output shapes and output size don't match. ",
+ output_shapes_.size(),
+ " vs ",
+ OutputSize());
+
+ std::vector<int> adjusted_output_batch;
+ for (const auto& shape : output_shapes_) {
+ if (shape.empty()) {
+ // Scalar output: there is no batch dimension to adjust.
+ adjusted_output_batch.push_back(0);
+ } else {
+ const auto max_output_batch_size = shape.front();
+ const auto it = batch_pos_map_.find(max_output_batch_size);
+ if (it == batch_pos_map_.end()) {
+ if (use_onnx_) {
+ // For ONNX path, it's possible that we have output batch size that is
+ // unknown, because we handle the second output of Concat and Split in
+ // ONNX. But for C2 path, we should not meet with this condition.
+ adjusted_output_batch.push_back(0);
+ continue;
+ } else {
+ CAFFE_THROW("Unknown output max batch size: ", max_output_batch_size);
+ }
+ }
+ auto idx = it->second;
+ CAFFE_ENFORCE_LT(idx, input_shapes_.size(), "index out of bound");
+ const auto& input_shape = input_shapes_[idx];
+ // If input real batch size and output max size is the same, we don't need
+ // to adjust max batch size of the output
+ if (input_shape.empty() || input_shape.front() == max_output_batch_size) {
+ adjusted_output_batch.push_back(0);
+ } else {
+ adjusted_output_batch.push_back(input_shape.front());
+ }
+ }
+ }
+
+ return adjusted_output_batch;
+}
+
+// Shrinks each output tensor to its real batch size.
+// real_output_batch_sizes comes from extractOutputBatchSizes(); an entry of 0
+// means the corresponding output already has the right batch size and is left
+// untouched.
+template <>
+void OnnxifiOp<float, CPUContext>::maybeAdjustOutputBatchSizes(
+ const std::vector<int>& real_output_batch_sizes) {
+ CAFFE_ENFORCE_EQ(real_output_batch_sizes.size(), output_shapes_.size());
+ // size_t index avoids the signed/unsigned comparison against size().
+ for (size_t i = 0; i < real_output_batch_sizes.size(); ++i) {
+ if (!real_output_batch_sizes[i]) {
+ continue;
+ }
+ auto* output_tensor = Output(i);
+ output_tensor->ShrinkTo(real_output_batch_sizes[i]);
+ }
+}
+
+template <>
bool OnnxifiOp<float, CPUContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
input_shapes_.clear();
bool ext_supported = false;
onnxMemoryFenceV1 input_fence;
onnxMemoryFenceV1 output_fence;
+ // Per-output real batch sizes (0 = no adjustment needed); filled in after
+ // input_shapes_ has been populated for this run, applied once the run ends.
+ std::vector<int> output_batch_sizes;
#ifdef ONNXIFI_ENABLE_EXT
/**
* If onnxifi extension mode is enabled,
&output_fence,
/* traceEvents */ nullptr),
ONNXIFI_STATUS_SUCCESS);
+ // Extension path: capture real batch sizes before waiting on the run.
+ output_batch_sizes = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
+ // Non-extension path: same extraction before waiting on the output fence.
+ output_batch_sizes = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
+
+ // Shrink outputs to their real batch sizes now that the run has finished.
+ maybeAdjustOutputBatchSizes(output_batch_sizes);
return true;
}
"(string default=\"\") Serialized ONNX model to be converted to backend representation")
.Arg(
"initializers",
- "Initialization pair indicating the mapping of the name between NetDef and ONNX model");
+ "Initialization pair indicating the mapping of the name between NetDef and ONNX model")
+ .Arg(
+ "output_resize_hints",
+ "A list of key/value pairs indicating which input index to look up for real batch size for the given max output batch size");
} // namespace caffe2
opset_id->set_version(7);
}
-std::string MakeSeqSizeBlob(const std::string& blob_name) {
- return blob_name + "_real_seq_size";
-}
-
-std::string MakeOutputForAdjustBatchOp(const std::string& input) {
- return input + "_post_adjust_batch";
-}
-
-std::string MakeInputForAdjustBatchOp(const std::string& output) {
- return output + "_pre_adjust_batch";
-}
-
-OperatorDef MakeAdjustBatchOp(
- const std::string& input_blob,
- const std::string& output_blob,
- int max_batch_size,
- const std::string& real_batch_size_blob,
- bool adjust_to_max_batch_size) {
- OperatorDef adjust_batch_op;
- adjust_batch_op.set_type("AdjustBatch");
- auto* arg = adjust_batch_op.add_arg();
- arg->set_name("max_batch_size");
- arg->set_i(max_batch_size);
- adjust_batch_op.add_input(input_blob);
- adjust_batch_op.add_output(output_blob);
- if (adjust_to_max_batch_size) {
- if (!real_batch_size_blob.empty()) {
- adjust_batch_op.add_output(real_batch_size_blob);
- }
- } else {
- adjust_batch_op.add_input(real_batch_size_blob);
- }
- return adjust_batch_op;
-}
-
-std::unordered_set<string> ToHashSet(
+// Copies a protobuf repeated string field into an unordered_set for O(1)
+// membership checks.
+std::unordered_set<string> toHashSet(
const ::google::protobuf::RepeatedPtrField<string>& strs) {
return std::unordered_set<string>(strs.begin(), strs.end());
}
-int64_t GetBlob1stDimSize(
+// Returns the size of the first dimension recorded in shape_info for
+// blob_name (used as the blob's max batch size).
+int64_t getBlob1stDimSize(
const ShapeInfo& shape_info,
const string& blob_name) {
if (shape_info.shape.dims_size() == 0) {
}
}
-// Generates AdjustBatchOps for external inputs/outputs with type BATCH or
-// SEQ and adds them to input_ops and output_ops.
-// Meanwhile, modifies inputs/outputs of corresponding operators in the
-// onnxifi_net to use the new inputs/outputs of AdjustBatchOps.
-std::unordered_map<std::string, std::string> AddAdjustBatchOps(
- const ShapeInfoMap& shape_hints,
- NetDef* onnxifi_net,
- vector<OperatorDef>* input_ops,
- vector<OperatorDef>* output_ops) {
- std::unordered_map<std::string, std::string> renaming_map;
- const auto external_inputs = ToHashSet(onnxifi_net->external_input());
- const auto external_outputs = ToHashSet(onnxifi_net->external_output());
- std::unordered_set<std::string> real_batch_size_blobs;
- std::unordered_set<std::string> post_adjust_inputs;
-
- for (auto& op : *(onnxifi_net->mutable_op())) {
- // Add AdjustBatchOp for all external inputs with type BATCH or SEQ.
- // This will adjust the batch/seq size to the batch/seq size inferred by
- // bound_shape_inference. Note that we only produce real batch size tensor
- // once to avoid data race. In addition, for each input we only create one
- // AdjustBatch op for the same reason.
- for (auto& input_blob : *(op.mutable_input())) {
- if (external_inputs.count(input_blob)) {
- auto shape_info_it = shape_hints.find(input_blob);
- if (shape_info_it == shape_hints.end()) {
- LOG(WARNING) << "Cannot find shape_info for external input blob: "
- << input_blob;
- continue;
- }
- std::string real_batch_size_blob = "";
- auto max_batch_size = 0;
- if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
- max_batch_size = GetBlob1stDimSize(shape_info_it->second, input_blob);
- real_batch_size_blob =
- kRealBatchSizeBlob + "_" + c10::to_string(max_batch_size);
- } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
- max_batch_size = GetBlob1stDimSize(shape_info_it->second, input_blob);
- real_batch_size_blob = MakeSeqSizeBlob(input_blob);
- } else {
- continue;
- }
-
- auto output_blob = MakeOutputForAdjustBatchOp(input_blob);
- auto ret = real_batch_size_blobs.emplace(real_batch_size_blob);
- if (post_adjust_inputs.emplace(output_blob).second) {
- input_ops->push_back(MakeAdjustBatchOp(
- input_blob,
- output_blob,
- max_batch_size,
- ret.second ? real_batch_size_blob : "",
- true /* adjust_to_max_batch_size */));
- }
- renaming_map[input_blob] = output_blob;
- input_blob = output_blob;
- } else if (renaming_map.count(input_blob)) {
- // It is possible that input of a certain op is the output of its
- // predecessor op, which happens to be an external_output. In this case,
- // the tensor would have been renamed to X_pre_batch_adjust. Therefore,
- // we need to rename input X to X_pre_batch_adjust too.
- input_blob = renaming_map[input_blob];
- }
- }
- // Add AdjustBatchOp for all external outputs with type BATCH if the real
- // batch size is presented. This will adjust the batch size to the
- // original batch size.
- for (auto& output_blob : *(op.mutable_output())) {
- if (external_outputs.count(output_blob)) {
- auto shape_info_it = shape_hints.find(output_blob);
- CAFFE_ENFORCE(
- shape_info_it != shape_hints.end(),
- "Cannot find shape info for ",
- output_blob,
- " for AdjustBatchOp insertion");
- if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
- auto max_batch_size =
- GetBlob1stDimSize(shape_info_it->second, output_blob);
- std::string real_size_blob =
- kRealBatchSizeBlob + "_" + c10::to_string(max_batch_size);
- CAFFE_ENFORCE(
- real_batch_size_blobs.count(real_size_blob),
- output_blob,
- ": Cannot find ",
- real_size_blob,
- " to make AdjustBatchOp");
- auto input_blob = MakeInputForAdjustBatchOp(output_blob);
- output_ops->push_back(MakeAdjustBatchOp(
- input_blob,
- output_blob,
- max_batch_size,
- real_size_blob,
- false /* adjust_to_max_batch_size */));
- renaming_map[output_blob] = input_blob;
- output_blob = input_blob;
- } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
- LOG(WARNING) << "It's unusual that output tesnor " << output_blob
- << " is of dim_type SEQ. "
- << "AdjustBatchOp won't attached "
- << "and it might degrade the performance";
- }
- }
- }
- }
-
- return renaming_map;
-}
-
-NetDef ComposeResultNet(
- const vector<OperatorDef>& input_ops,
- const vector<OperatorDef>& output_ops,
- const OperatorDef& onnxifi_op) {
+// Wraps the single Onnxifi operator into a NetDef. Batch-size adjustment is
+// handled inside OnnxifiOp itself now, so no surrounding AdjustBatch ops are
+// added anymore.
+NetDef composeResultNet(const OperatorDef& onnxifi_op) {
NetDef net_opt;
- for (const auto& op : input_ops) {
- net_opt.add_op()->CopyFrom(op);
- }
net_opt.add_op()->CopyFrom(onnxifi_op);
- // Add AdjustBatch ops for output blobs to the net.
- for (const auto& op : output_ops) {
- net_opt.add_op()->CopyFrom(op);
- }
return net_opt;
}
}
}
+// Scans the net and builds a map from max (first-dim) batch size to the name
+// of an external input carrying that batch size. OnnxifiOp uses this map at
+// run time to look up the real batch size for each output (see
+// extractOutputBatchSizes). Also verifies that every external output of
+// dim_type BATCH has a matching input entry, and warns on SEQ outputs whose
+// batch size cannot be adjusted.
+std::unordered_map<int, std::string>
+OnnxifiTransformer::generateBatchPaddingHints(
+ const NetDef& onnxifi_net,
+ const ShapeInfoMap& shape_hints) {
+ std::unordered_map<int, std::string> batch_pos_map;
+ const auto external_inputs = toHashSet(onnxifi_net.external_input());
+ const auto external_outputs = toHashSet(onnxifi_net.external_output());
+ for (const auto& op : onnxifi_net.op()) {
+ for (const auto& input_blob : op.input()) {
+ if (external_inputs.count(input_blob)) {
+ auto shape_info_it = shape_hints.find(input_blob);
+ if (shape_info_it == shape_hints.end()) {
+ LOG(WARNING) << "Cannot find shape_info for external input blob: "
+ << input_blob;
+ continue;
+ }
+ if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH ||
+ shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
+ // emplace keeps the first input seen for a given batch size.
+ batch_pos_map.emplace(
+ getBlob1stDimSize(shape_info_it->second, input_blob), input_blob);
+ }
+ }
+ }
+
+ // Correctness check on the output
+ for (const auto& output_blob : op.output()) {
+ if (external_outputs.count(output_blob)) {
+ auto shape_info_it = shape_hints.find(output_blob);
+ CAFFE_ENFORCE(
+ shape_info_it != shape_hints.end(),
+ "Cannot find shape info for ",
+ output_blob,
+ " to adjust output batch size");
+ if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
+ auto max_batch_size =
+ getBlob1stDimSize(shape_info_it->second, output_blob);
+ CAFFE_ENFORCE(
+ batch_pos_map.count(max_batch_size),
+ "Cannot find input with max batch size ",
+ max_batch_size);
+ } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
+ LOG(WARNING) << "It's unusual that output tensor " << output_blob
+ << " is of dim_type SEQ. "
+ << "Its batch size won't be adjusted "
+ << "and it might degrade the performance";
+ }
+ }
+ }
+ }
+ return batch_pos_map;
+}
+
+// Builds the Onnxifi operator: wires inputs/outputs and attaches
+// "output_resize_hints" as flattened (max_batch_size, input_position) pairs
+// so OnnxifiOp can map an output's max batch size back to the input that
+// carries the real batch size.
OperatorDef OnnxifiTransformer::BuildOnnxifiOp(
const std::string& onnx_model_str,
const std::unordered_map<std::string, TensorShape>& output_shape_hints,
const std::unordered_set<std::string>& initialization_list,
const std::vector<std::string>& external_inputs,
- const std::vector<std::string>& external_outputs) {
+ const std::vector<std::string>& external_outputs,
+ const std::unordered_map<int, std::string>& batch_pos_map) {
OperatorDef op;
op.set_type("Onnxifi");
auto* onnx_model_arg = op.add_arg();
}
// Add the input/output
+ // Track the position of each real (non-initializer) input; needed below to
+ // translate the input names in batch_pos_map into input indices.
+ std::unordered_map<std::string, int> input_pos_map;
+ int idx = 0;
auto* input_names = op.add_arg();
input_names->set_name("input_names");
for (const auto& input : external_inputs) {
if (!initialization_list.count(input)) {
op.add_input(input);
input_names->add_strings(input);
+ input_pos_map.emplace(input, idx++);
}
}
auto* output_names = op.add_arg();
AddArgument(kModelId, model_id_, &op);
AddArgument(kNetPos, c10::to_string(onnxifi_op_id_++), &op);
+ // Add output resizing hints
+ auto* resize_arg = op.add_arg();
+ resize_arg->set_name("output_resize_hints");
+ // NOTE(review): "const auto kv" copies each pair per iteration;
+ // "const auto& kv" would avoid the copy.
+ for (const auto kv : batch_pos_map) {
+ const auto it = input_pos_map.find(kv.second);
+ CAFFE_ENFORCE(
+ it != input_pos_map.end(),
+ "Cannot find input in OnnxifiOp: ",
+ kv.second);
+ resize_arg->add_ints(kv.first);
+ resize_arg->add_ints(it->second);
+ }
+
return op;
}
}
}
- // Insert AdjustBatch ops, note that this step will possibly change the names
- // of the input/output, so we need to create a mapping and use the renamed
- // names for external_inputs/outputs and input_shape_info for the onnxifi_net.
- vector<OperatorDef> input_ops;
- vector<OperatorDef> output_ops;
- std::unordered_map<std::string, std::string> renaming_map;
- if (opts_.add_adjust_batch_ops) {
- renaming_map =
- AddAdjustBatchOps(shape_hints, &onnxifi_net, &input_ops, &output_ops);
- }
+ // Add batch padding hints
+ auto batch_pos_map = generateBatchPaddingHints(onnxifi_net, shape_hints);
// Figure out weights and add it to external_inputs too
std::unordered_set<std::string> initialization_list;
qshape_arg->set_name("input_qshape_info");
onnxifi_net.clear_external_input();
for (const auto& i : total_inputs_vec) {
- auto input = i;
- const auto it = renaming_map.find(i);
- if (it != renaming_map.end()) {
- input = it->second;
- }
- onnxifi_net.add_external_input(input);
+ onnxifi_net.add_external_input(i);
auto info = shape_hints.at(i);
if (!info.is_quantized) {
shape_arg->mutable_tensors()->Add()->CopyFrom(
- wrapShapeInfoIntoTensorProto(input, shape_hints.at(i)));
+ wrapShapeInfoIntoTensorProto(i, shape_hints.at(i)));
} else {
qshape_arg->mutable_qtensors()->Add()->CopyFrom(
- wrapShapeInfoIntoQTensorProto(input, shape_hints.at(i)));
+ wrapShapeInfoIntoQTensorProto(i, shape_hints.at(i)));
}
}
// Compute output shape hints
std::unordered_map<std::string, TensorShape> output_shape_hints;
- for (auto& o : *onnxifi_net.mutable_external_output()) {
- auto output = o;
- const auto rit = renaming_map.find(o);
- if (rit != renaming_map.end()) {
- output = rit->second;
- }
+ for (const auto& o : onnxifi_net.external_output()) {
const auto it = shape_hints.find(o);
CAFFE_ENFORCE(
it != shape_hints.end(), "Cannot find shape info for output ", o);
const auto& shape = it->second.shape;
- output_shape_hints.emplace(output, shape);
- o = output;
+ output_shape_hints.emplace(o, shape);
}
// Build ONNXIFI Op
output_shape_hints,
initialization_list,
onnxifi_net_inputs,
- onnxifi_net_outputs);
- NetDef net_opt = ComposeResultNet(input_ops, output_ops, onnxifi_op);
+ onnxifi_net_outputs,
+ batch_pos_map);
+ NetDef net_opt = composeResultNet(onnxifi_op);
// Debugging stuff
if (opts_.debug) {
fillModelInfo(&onnx_model);
caffe2::NetDef onnxifi_net(net);
- vector<OperatorDef> input_ops;
- vector<OperatorDef> output_ops;
- auto renaming_map =
- AddAdjustBatchOps(*shape_hints, &onnxifi_net, &input_ops, &output_ops);
- for (const auto& kv : renaming_map) {
- shape_hints_onnx_.emplace(kv.second, shape_hints_onnx_.at(kv.first));
- }
+ auto batch_pos_map = generateBatchPaddingHints(onnxifi_net, *shape_hints);
// Convert c2 ops to onnx ops, add const weights if there are any
DeviceOption option;
// Convert outputs and compute output shape hints
std::vector<std::string> onnxifi_net_outputs;
for (const auto& o : net.external_output()) {
- auto output = o;
- const auto it = renaming_map.find(o);
- if (it != renaming_map.end()) {
- output = it->second;
- }
- onnxifi_net_outputs.emplace_back(output);
+ onnxifi_net_outputs.emplace_back(o);
}
auto io_vec = convertToValueInfo(
onnxifi_net_outputs,
extra_weights,
&initialization_list,
&onnxifi_net_inputs);
- for (auto& i : onnxifi_net_inputs) {
- const auto it = renaming_map.find(i);
- if (it != renaming_map.end()) {
- i = it->second;
- }
- }
io_vec = convertToValueInfo(
onnxifi_net_inputs,
shape_hints_onnx_,
output_shape_hints,
initialization_list,
onnxifi_net_inputs,
- onnxifi_net_outputs);
- NetDef net_opt = ComposeResultNet(input_ops, output_ops, onnxifi_op);
+ onnxifi_net_outputs,
+ batch_pos_map);
+ NetDef net_opt = composeResultNet(onnxifi_op);
// Debugging stuff
if (opts_.debug) {