}
+// Computes, for each output, the real batch size it should be shrunk to.
+// Returns one entry per output: 0 means "leave the output at its max batch
+// size"; a non-zero value is the real (first-dim) batch size taken from the
+// input that shares the same max batch size (looked up via batch_pos_map_).
+// Must be called after input_shapes_/output_shapes_ are populated for the run.
template <>
+std::vector<int> OnnxifiOp<float, CPUContext>::extractOutputBatchSizes() const {
+ CAFFE_ENFORCE_EQ(
+ input_shapes_.size(),
+ InputSize(),
+ "Input shapes and input size don't match. ",
+ input_shapes_.size(),
+ " vs ",
+ InputSize());
+ CAFFE_ENFORCE_EQ(
+ output_shapes_.size(),
+ OutputSize(),
+ "Output shapes and output size don't match. ",
+ output_shapes_.size(),
+ " vs ",
+ OutputSize());
+
+ std::vector<int> adjusted_output_batch;
+ for (const auto& shape : output_shapes_) {
+ if (shape.empty()) {
+ // Scalar output: there is no batch dimension to adjust.
+ adjusted_output_batch.push_back(0);
+ } else {
+ const auto max_output_batch_size = shape.front();
+ const auto it = batch_pos_map_.find(max_output_batch_size);
+ if (it == batch_pos_map_.end()) {
+ if (use_onnx_) {
+ // For ONNX path, it's possible that we have output batch size that is
+ // unknown, because we handle the second output of Concat and Split in
+ // ONNX. But for C2 path, we should not meet with this condition.
+ adjusted_output_batch.push_back(0);
+ continue;
+ } else {
+ CAFFE_THROW("Unknown output max batch size: ", max_output_batch_size);
+ }
+ }
+ auto idx = it->second;
+ CAFFE_ENFORCE_LT(idx, input_shapes_.size(), "index out of bound");
+ const auto& input_shape = input_shapes_[idx];
+ // If input real batch size and output max size is the same, we don't need
+ // to adjust max batch size of the output
+ if (input_shape.empty() || input_shape.front() == max_output_batch_size) {
+ adjusted_output_batch.push_back(0);
+ } else {
+ adjusted_output_batch.push_back(input_shape.front());
+ }
+ }
+ }
+
+ return adjusted_output_batch;
+}
+
+// Shrinks each output tensor to its real batch size.
+// real_output_batch_sizes comes from extractOutputBatchSizes(); an entry of 0
+// means the corresponding output already has the right batch size and is left
+// untouched.
+template <>
+void OnnxifiOp<float, CPUContext>::maybeAdjustOutputBatchSizes(
+ const std::vector<int>& real_output_batch_sizes) {
+ CAFFE_ENFORCE_EQ(real_output_batch_sizes.size(), output_shapes_.size());
+ // size_t index avoids the signed/unsigned comparison against size().
+ for (size_t i = 0; i < real_output_batch_sizes.size(); ++i) {
+ if (!real_output_batch_sizes[i]) {
+ continue;
+ }
+ auto* output_tensor = Output(i);
+ output_tensor->ShrinkTo(real_output_batch_sizes[i]);
+ }
+}
+
+template <>
bool OnnxifiOp<float, CPUContext>::RunOnDevice() {
CAFFE_ENFORCE_EQ(input_desc_.size(), InputSize());
input_shapes_.clear();
bool ext_supported = false;
onnxMemoryFenceV1 input_fence;
onnxMemoryFenceV1 output_fence;
+ // Per-output real batch sizes (0 = no adjustment needed); filled in after
+ // input_shapes_ has been populated for this run, applied once the run ends.
+ std::vector<int> output_batch_sizes;
#ifdef ONNXIFI_ENABLE_EXT
/**
* If onnxifi extension mode is enabled,
&output_fence,
/* traceEvents */ nullptr),
ONNXIFI_STATUS_SUCCESS);
+ // Extension path: capture real batch sizes before waiting on the run.
+ output_batch_sizes = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxSignalEvent(input_fence.event), ONNXIFI_STATUS_SUCCESS);
+ // Non-extension path: same extraction before waiting on the output fence.
+ output_batch_sizes = extractOutputBatchSizes();
CAFFE_ENFORCE_EQ(
lib_->onnxWaitEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
CAFFE_ENFORCE_EQ(
lib_->onnxReleaseEvent(output_fence.event), ONNXIFI_STATUS_SUCCESS);
}
+
+ // Shrink outputs to their real batch sizes now that the run has finished.
+ maybeAdjustOutputBatchSizes(output_batch_sizes);
return true;
}
"(string default=\"\") Serialized ONNX model to be converted to backend representation")
.Arg(
"initializers",
- "Initialization pair indicating the mapping of the name between NetDef and ONNX model");
+ "Initialization pair indicating the mapping of the name between NetDef and ONNX model")
+ .Arg(
+ "output_resize_hints",
+ "A list of key/value pairs indicating which input index to look up for real batch size for the given max output batch size");
} // namespace caffe2
opset_id->set_version(7);
}
-std::string MakeSeqSizeBlob(const std::string& blob_name) {
- return blob_name + "_real_seq_size";
-}
-
-std::string MakeOutputForAdjustBatchOp(const std::string& input) {
- return input + "_post_adjust_batch";
-}
-
-std::string MakeInputForAdjustBatchOp(const std::string& output) {
- return output + "_pre_adjust_batch";
-}
-
-OperatorDef MakeAdjustBatchOp(
- const std::string& input_blob,
- const std::string& output_blob,
- int max_batch_size,
- const std::string& real_batch_size_blob,
- bool adjust_to_max_batch_size) {
- OperatorDef adjust_batch_op;
- adjust_batch_op.set_type("AdjustBatch");
- auto* arg = adjust_batch_op.add_arg();
- arg->set_name("max_batch_size");
- arg->set_i(max_batch_size);
- adjust_batch_op.add_input(input_blob);
- adjust_batch_op.add_output(output_blob);
- if (adjust_to_max_batch_size) {
- if (!real_batch_size_blob.empty()) {
- adjust_batch_op.add_output(real_batch_size_blob);
- }
- } else {
- adjust_batch_op.add_input(real_batch_size_blob);
- }
- return adjust_batch_op;
-}
-
-std::unordered_set<string> ToHashSet(
+// Copies a protobuf repeated string field into an unordered_set for O(1)
+// membership checks.
+std::unordered_set<string> toHashSet(
const ::google::protobuf::RepeatedPtrField<string>& strs) {
return std::unordered_set<string>(strs.begin(), strs.end());
}
-int64_t GetBlob1stDimSize(
+// Returns the size of the first dimension recorded in shape_info for
+// blob_name (used as the blob's max batch size).
+int64_t getBlob1stDimSize(
const ShapeInfo& shape_info,
const string& blob_name) {
if (shape_info.shape.dims_size() == 0) {
}
}
-// Generates AdjustBatchOps for external inputs/outputs with type BATCH or
-// SEQ and adds them to input_ops and output_ops.
-// Meanwhile, modifies inputs/outputs of corresponding operators in the
-// onnxifi_net to use the new inputs/outputs of AdjustBatchOps.
-std::unordered_map<std::string, std::string> AddAdjustBatchOps(
- const ShapeInfoMap& shape_hints,
- NetDef* onnxifi_net,
- vector<OperatorDef>* input_ops,
- vector<OperatorDef>* output_ops) {
- std::unordered_map<std::string, std::string> renaming_map;
- const auto external_inputs = ToHashSet(onnxifi_net->external_input());
- const auto external_outputs = ToHashSet(onnxifi_net->external_output());
- std::unordered_set<std::string> real_batch_size_blobs;
- std::unordered_set<std::string> post_adjust_inputs;
-
- for (auto& op : *(onnxifi_net->mutable_op())) {
- // Add AdjustBatchOp for all external inputs with type BATCH or SEQ.
- // This will adjust the batch/seq size to the batch/seq size inferred by
- // bound_shape_inference. Note that we only produce real batch size tensor
- // once to avoid data race. In addition, for each input we only create one
- // AdjustBatch op for the same reason.
- for (auto& input_blob : *(op.mutable_input())) {
- if (external_inputs.count(input_blob)) {
- auto shape_info_it = shape_hints.find(input_blob);
- if (shape_info_it == shape_hints.end()) {
- LOG(WARNING) << "Cannot find shape_info for external input blob: "
- << input_blob;
- continue;
- }
- std::string real_batch_size_blob = "";
- auto max_batch_size = 0;
- if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
- max_batch_size = GetBlob1stDimSize(shape_info_it->second, input_blob);
- real_batch_size_blob =
- kRealBatchSizeBlob + "_" + c10::to_string(max_batch_size);
- } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
- max_batch_size = GetBlob1stDimSize(shape_info_it->second, input_blob);
- real_batch_size_blob = MakeSeqSizeBlob(input_blob);
- } else {
- continue;
- }
-
- auto output_blob = MakeOutputForAdjustBatchOp(input_blob);
- auto ret = real_batch_size_blobs.emplace(real_batch_size_blob);
- if (post_adjust_inputs.emplace(output_blob).second) {
- input_ops->push_back(MakeAdjustBatchOp(
- input_blob,
- output_blob,
- max_batch_size,
- ret.second ? real_batch_size_blob : "",
- true /* adjust_to_max_batch_size */));
- }
- renaming_map[input_blob] = output_blob;
- input_blob = output_blob;
- } else if (renaming_map.count(input_blob)) {
- // It is possible that input of a certain op is the output of its
- // predecessor op, which happens to be an external_output. In this case,
- // the tensor would have been renamed to X_pre_batch_adjust. Therefore,
- // we need to rename input X to X_pre_batch_adjust too.
- input_blob = renaming_map[input_blob];
- }
- }
- // Add AdjustBatchOp for all external outputs with type BATCH if the real
- // batch size is presented. This will adjust the batch size to the
- // original batch size.
- for (auto& output_blob : *(op.mutable_output())) {
- if (external_outputs.count(output_blob)) {
- auto shape_info_it = shape_hints.find(output_blob);
- CAFFE_ENFORCE(
- shape_info_it != shape_hints.end(),
- "Cannot find shape info for ",
- output_blob,
- " for AdjustBatchOp insertion");
- if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
- auto max_batch_size =
- GetBlob1stDimSize(shape_info_it->second, output_blob);
- std::string real_size_blob =
- kRealBatchSizeBlob + "_" + c10::to_string(max_batch_size);
- CAFFE_ENFORCE(
- real_batch_size_blobs.count(real_size_blob),
- output_blob,
- ": Cannot find ",
- real_size_blob,
- " to make AdjustBatchOp");
- auto input_blob = MakeInputForAdjustBatchOp(output_blob);
- output_ops->push_back(MakeAdjustBatchOp(
- input_blob,
- output_blob,
- max_batch_size,
- real_size_blob,
- false /* adjust_to_max_batch_size */));
- renaming_map[output_blob] = input_blob;
- output_blob = input_blob;
- } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
- LOG(WARNING) << "It's unusual that output tesnor " << output_blob
- << " is of dim_type SEQ. "
- << "AdjustBatchOp won't attached "
- << "and it might degrade the performance";
- }
- }
- }
- }
-
- return renaming_map;
-}
-
-NetDef ComposeResultNet(
- const vector<OperatorDef>& input_ops,
- const vector<OperatorDef>& output_ops,
- const OperatorDef& onnxifi_op) {
+// Wraps the single Onnxifi operator into a NetDef. Batch-size adjustment is
+// handled inside OnnxifiOp itself now, so no surrounding AdjustBatch ops are
+// added anymore.
+NetDef composeResultNet(const OperatorDef& onnxifi_op) {
NetDef net_opt;
- for (const auto& op : input_ops) {
- net_opt.add_op()->CopyFrom(op);
- }
net_opt.add_op()->CopyFrom(onnxifi_op);
- // Add AdjustBatch ops for output blobs to the net.
- for (const auto& op : output_ops) {
- net_opt.add_op()->CopyFrom(op);
- }
return net_opt;
}
}
}
+// Scans the net and builds a map from max (first-dim) batch size to the name
+// of an external input carrying that batch size. OnnxifiOp uses this map at
+// run time to look up the real batch size for each output (see
+// extractOutputBatchSizes). Also verifies that every external output of
+// dim_type BATCH has a matching input entry, and warns on SEQ outputs whose
+// batch size cannot be adjusted.
+std::unordered_map<int, std::string>
+OnnxifiTransformer::generateBatchPaddingHints(
+ const NetDef& onnxifi_net,
+ const ShapeInfoMap& shape_hints) {
+ std::unordered_map<int, std::string> batch_pos_map;
+ const auto external_inputs = toHashSet(onnxifi_net.external_input());
+ const auto external_outputs = toHashSet(onnxifi_net.external_output());
+ for (const auto& op : onnxifi_net.op()) {
+ for (const auto& input_blob : op.input()) {
+ if (external_inputs.count(input_blob)) {
+ auto shape_info_it = shape_hints.find(input_blob);
+ if (shape_info_it == shape_hints.end()) {
+ LOG(WARNING) << "Cannot find shape_info for external input blob: "
+ << input_blob;
+ continue;
+ }
+ if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH ||
+ shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
+ // emplace keeps the first input seen for a given batch size.
+ batch_pos_map.emplace(
+ getBlob1stDimSize(shape_info_it->second, input_blob), input_blob);
+ }
+ }
+ }
+
+ // Correctness check on the output
+ for (const auto& output_blob : op.output()) {
+ if (external_outputs.count(output_blob)) {
+ auto shape_info_it = shape_hints.find(output_blob);
+ CAFFE_ENFORCE(
+ shape_info_it != shape_hints.end(),
+ "Cannot find shape info for ",
+ output_blob,
+ " to adjust output batch size");
+ if (shape_info_it->second.dim_type == ShapeInfo::DimType::BATCH) {
+ auto max_batch_size =
+ getBlob1stDimSize(shape_info_it->second, output_blob);
+ CAFFE_ENFORCE(
+ batch_pos_map.count(max_batch_size),
+ "Cannot find input with max batch size ",
+ max_batch_size);
+ } else if (shape_info_it->second.dim_type == ShapeInfo::DimType::SEQ) {
+ LOG(WARNING) << "It's unusual that output tensor " << output_blob
+ << " is of dim_type SEQ. "
+ << "Its batch size won't be adjusted "
+ << "and it might degrade the performance";
+ }
+ }
+ }
+ }
+ return batch_pos_map;
+}
+
+// Builds the Onnxifi operator: wires inputs/outputs and attaches
+// "output_resize_hints" as flattened (max_batch_size, input_position) pairs
+// so OnnxifiOp can map an output's max batch size back to the input that
+// carries the real batch size.
OperatorDef OnnxifiTransformer::BuildOnnxifiOp(
const std::string& onnx_model_str,
const std::unordered_map<std::string, TensorShape>& output_shape_hints,
const std::unordered_set<std::string>& initialization_list,
const std::vector<std::string>& external_inputs,
- const std::vector<std::string>& external_outputs) {
+ const std::vector<std::string>& external_outputs,
+ const std::unordered_map<int, std::string>& batch_pos_map) {
OperatorDef op;
op.set_type("Onnxifi");
auto* onnx_model_arg = op.add_arg();
}
// Add the input/output
+ // Track the position of each real (non-initializer) input; needed below to
+ // translate the input names in batch_pos_map into input indices.
+ std::unordered_map<std::string, int> input_pos_map;
+ int idx = 0;
auto* input_names = op.add_arg();
input_names->set_name("input_names");
for (const auto& input : external_inputs) {
if (!initialization_list.count(input)) {
op.add_input(input);
input_names->add_strings(input);
+ input_pos_map.emplace(input, idx++);
}
}
auto* output_names = op.add_arg();
AddArgument(kModelId, model_id_, &op);
AddArgument(kNetPos, c10::to_string(onnxifi_op_id_++), &op);
+ // Add output resizing hints
+ auto* resize_arg = op.add_arg();
+ resize_arg->set_name("output_resize_hints");
+ // NOTE(review): "const auto kv" copies each pair per iteration;
+ // "const auto& kv" would avoid the copy.
+ for (const auto kv : batch_pos_map) {
+ const auto it = input_pos_map.find(kv.second);
+ CAFFE_ENFORCE(
+ it != input_pos_map.end(),
+ "Cannot find input in OnnxifiOp: ",
+ kv.second);
+ resize_arg->add_ints(kv.first);
+ resize_arg->add_ints(it->second);
+ }
+
return op;
}
}
}
- // Insert AdjustBatch ops, note that this step will possibly change the names
- // of the input/output, so we need to create a mapping and use the renamed
- // names for external_inputs/outputs and input_shape_info for the onnxifi_net.
- vector<OperatorDef> input_ops;
- vector<OperatorDef> output_ops;
- std::unordered_map<std::string, std::string> renaming_map;
- if (opts_.add_adjust_batch_ops) {
- renaming_map =
- AddAdjustBatchOps(shape_hints, &onnxifi_net, &input_ops, &output_ops);
- }
+ // Add batch padding hints
+ auto batch_pos_map = generateBatchPaddingHints(onnxifi_net, shape_hints);
// Figure out weights and add it to external_inputs too
std::unordered_set<std::string> initialization_list;
qshape_arg->set_name("input_qshape_info");
onnxifi_net.clear_external_input();
for (const auto& i : total_inputs_vec) {
- auto input = i;
- const auto it = renaming_map.find(i);
- if (it != renaming_map.end()) {
- input = it->second;
- }
- onnxifi_net.add_external_input(input);
+ onnxifi_net.add_external_input(i);
auto info = shape_hints.at(i);
if (!info.is_quantized) {
shape_arg->mutable_tensors()->Add()->CopyFrom(
- wrapShapeInfoIntoTensorProto(input, shape_hints.at(i)));
+ wrapShapeInfoIntoTensorProto(i, shape_hints.at(i)));
} else {
qshape_arg->mutable_qtensors()->Add()->CopyFrom(
- wrapShapeInfoIntoQTensorProto(input, shape_hints.at(i)));
+ wrapShapeInfoIntoQTensorProto(i, shape_hints.at(i)));
}
}
// Compute output shape hints
std::unordered_map<std::string, TensorShape> output_shape_hints;
- for (auto& o : *onnxifi_net.mutable_external_output()) {
- auto output = o;
- const auto rit = renaming_map.find(o);
- if (rit != renaming_map.end()) {
- output = rit->second;
- }
+ for (const auto& o : onnxifi_net.external_output()) {
const auto it = shape_hints.find(o);
CAFFE_ENFORCE(
it != shape_hints.end(), "Cannot find shape info for output ", o);
const auto& shape = it->second.shape;
- output_shape_hints.emplace(output, shape);
- o = output;
+ output_shape_hints.emplace(o, shape);
}
// Build ONNXIFI Op
output_shape_hints,
initialization_list,
onnxifi_net_inputs,
- onnxifi_net_outputs);
- NetDef net_opt = ComposeResultNet(input_ops, output_ops, onnxifi_op);
+ onnxifi_net_outputs,
+ batch_pos_map);
+ NetDef net_opt = composeResultNet(onnxifi_op);
// Debugging stuff
if (opts_.debug) {
fillModelInfo(&onnx_model);
caffe2::NetDef onnxifi_net(net);
- vector<OperatorDef> input_ops;
- vector<OperatorDef> output_ops;
- auto renaming_map =
- AddAdjustBatchOps(*shape_hints, &onnxifi_net, &input_ops, &output_ops);
- for (const auto& kv : renaming_map) {
- shape_hints_onnx_.emplace(kv.second, shape_hints_onnx_.at(kv.first));
- }
+ auto batch_pos_map = generateBatchPaddingHints(onnxifi_net, *shape_hints);
// Convert c2 ops to onnx ops, add const weights if there are any
DeviceOption option;
// Convert outputs and compute output shape hints
std::vector<std::string> onnxifi_net_outputs;
for (const auto& o : net.external_output()) {
- auto output = o;
- const auto it = renaming_map.find(o);
- if (it != renaming_map.end()) {
- output = it->second;
- }
- onnxifi_net_outputs.emplace_back(output);
+ onnxifi_net_outputs.emplace_back(o);
}
auto io_vec = convertToValueInfo(
onnxifi_net_outputs,
extra_weights,
&initialization_list,
&onnxifi_net_inputs);
- for (auto& i : onnxifi_net_inputs) {
- const auto it = renaming_map.find(i);
- if (it != renaming_map.end()) {
- i = it->second;
- }
- }
io_vec = convertToValueInfo(
onnxifi_net_inputs,
shape_hints_onnx_,
output_shape_hints,
initialization_list,
onnxifi_net_inputs,
- onnxifi_net_outputs);
- NetDef net_opt = ComposeResultNet(input_ops, output_ops, onnxifi_op);
+ onnxifi_net_outputs,
+ batch_pos_map);
+ NetDef net_opt = composeResultNet(onnxifi_op);
// Debugging stuff
if (opts_.debug) {