* a tensor on CPU and then CopyFrom a CUDA tensor, that will trigger a
* CUDA-to-CPU transfer).
*
- * If the function is invoked without `context` the copy would be synchronous
+ * The 'async' parameter triggers an asynchronous copy for CUDA tensors;
+ * with the default (async = false) the copy is synchronous.
*/
- void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) {
+ void CopyFrom(const TensorImpl& src, bool async = false) {
AT_ASSERT(!is_variable());
AT_ASSERTM(
src.is_contiguous(),
src.device(),
new_data,
device(),
- context != nullptr);
+ async);
}
}
}
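// A minimal usage sketch of the new signature (hypothetical tensor names,
// not from this diff; `async` only affects CUDA copies):
//
//   Tensor dst(CUDA);
//   dst.CopyFrom(cpu_src, /*async=*/true); // enqueue a non-blocking copy
//   dst.CopyFrom(cpu_src);                 // async defaults to false: synchronous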
* elements, in which case this tensor's capacity is grown by a factor of
* growthPct. This ensures that Extend runs in amortized O(1) time.
+ *
+ * This op is auto-asynchronous if the underlying device (CUDA) supports it.
*/
- void Extend(int64_t num, float growthPct, at::BaseContext* context) {
+ void Extend(int64_t num, float growthPct) {
AT_ASSERT(sizes_.size() >= 1u);
AT_ASSERTM(num >= 0, "`num` must be non-negative for Extend");
AT_ASSERTM(
auto oldDims = sizes_;
Resize(newCapacity);
auto* newData = raw_mutable_data(data_type_);
- AT_ASSERTM(
- context != nullptr, "Context must be provided to Extend the tensor");
if (data_type_.copy()) {
AT_ASSERTM(
device_type() == ::at::DeviceType::CPU,
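// Sketch of the amortized growth with hypothetical numbers (the capacity
// formula is assumed from the growthPct contract above): starting from an
// outer dimension of 100, Extend(30, 50) needs 130 rows but grows capacity
// to roughly 100 * (1 + 50 / 100) = 150, so subsequent small Extend calls
// fill the slack without reallocating.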
Tensor* t,
at::TensorOptions options,
const Tensor& src,
- BaseContext* context) {
+ bool async) {
auto device_type = options.device().type();
CAFFE_ENFORCE(t != nullptr, "Target tensor ptr is null.");
if (!*t || device_type != t->GetDeviceType()) {
t->dtype(),
" to: ",
src.dtype());
- t->CopyFrom(src, context);
+ t->CopyFrom(src, async);
}
namespace {
return impl_.get()->GetDevice();
}
- void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const {
- impl_.get()->CopyFrom(*src.impl_.get(), context);
+ void CopyFrom(const Tensor& src, bool async = false) const {
+ impl_.get()->CopyFrom(*src.impl_.get(), async);
}
/**
* @brief Extend the outer-most dimension of this tensor
* to dimension of `num`.
*/
- void ExtendTo(int64_t num, float growthPct, BaseContext* context) const {
+ void ExtendTo(int64_t num, float growthPct) const {
CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1);
CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0);
- CAFFE_ENFORCE(context != nullptr, "Context must be provided.");
- Extend(num - impl_->size(0), growthPct, context);
+ Extend(num - impl_->size(0), growthPct);
}
- void Extend(int64_t num, float growthPct, BaseContext* context) const {
- impl_.get()->Extend(num, growthPct, context);
+ void Extend(int64_t num, float growthPct) const {
+ impl_.get()->Extend(num, growthPct);
}
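// ExtendTo is a thin wrapper over Extend (sketch; `t` is a hypothetical
// tensor whose outer dimension is currently 10):
//
//   t.ExtendTo(25, 50); // grows dim 0 to 25
//   t.Extend(15, 50);   // equivalent call: 25 - 10 = 15 extra rows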
/**
Tensor* t,
at::TensorOptions options,
const Tensor& src,
- BaseContext* context = nullptr);
+ bool async = false);
CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(12, Tensor)
int64_t padded_dim0 = (X_dim0 / scale_ + 1) * scale_;
auto dim0_diff = padded_dim0 - X_dim0;
// set growthPct to the upper bound percentage: (100 * scale_ / X_dim0)
- X_pad->Extend(dim0_diff, 100 * scale_ / X_dim0, &context_);
+ X_pad->Extend(dim0_diff, 100 * scale_ / X_dim0);
auto* X_pad_data = X_pad->template mutable_data<T>();
int64_t X_size = X_dim0 * X_dim1;
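// Worked example of the padding arithmetic (hypothetical values): with
// X_dim0 = 30 and scale_ = 8, padded_dim0 = (30 / 8 + 1) * 8 = 32, so
// dim0_diff = 2 and growthPct = 100 * 8 / 30 = 26 (integer arithmetic),
// an upper bound since each pad adds at most scale_ rows.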
int csz = im_i_boxes.rows();
int cur_start_idx = out_rois->dim(0);
- out_rois->Extend(csz, 50, &context_);
- out_rois_probs->Extend(csz, 50, &context_);
+ out_rois->Extend(csz, 50);
+ out_rois_probs->Extend(csz, 50);
// write rois
Eigen::Map<ERArrXXf> cur_rois(
// Write results
int cur_start_idx = out_scores->size(0);
- out_scores->Extend(total_keep_count, 50, &context_);
- out_boxes->Extend(total_keep_count, 50, &context_);
- out_classes->Extend(total_keep_count, 50, &context_);
+ out_scores->Extend(total_keep_count, 50);
+ out_boxes->Extend(total_keep_count, 50);
+ out_classes->Extend(total_keep_count, 50);
int cur_out_idx = 0;
for (int j = 1; j < num_classes; j++) {
}
if (out_keeps) {
- out_keeps->Extend(total_keep_count, 50, &context_);
+ out_keeps->Extend(total_keep_count, 50);
Eigen::Map<EArrXi> out_keeps_arr(
out_keeps->template mutable_data<int>() + cur_start_idx,
CAFFE_ENFORCE(a.sizes()[i] == b.sizes()[i]);
}
auto oldSize = c->numel();
- c->Extend(b.sizes()[0], kDatasetGrowthPct, &context_);
+ c->Extend(b.sizes()[0], kDatasetGrowthPct);
auto* dst = (char*)c->raw_mutable_data() + oldSize * b.dtype().itemsize();
context_.CopyItemsSameDevice(b.dtype(), b.numel(), b.raw_data(), dst);
return true;
continue;
}
auto oldSize = c->numel();
- c->Extend(b.sizes()[0], kDatasetGrowthPct, &context_);
+ c->Extend(b.sizes()[0], kDatasetGrowthPct);
auto* dst = (char*)c->raw_mutable_data() + oldSize * b.dtype().itemsize();
context_.CopyItemsSameDevice(b.dtype(), b.numel(), b.raw_data(), dst);
}
bool RunOnDevice() override {
auto& input = Input(0);
auto* output = Output(0);
- output->CopyFrom(input, &context_);
+ output->CopyFrom(input, true /*async*/);
if (dims_.empty()) {
return true;
}
bool RunOnDevice() override {
auto& input = Input(0);
auto* output = Output(0);
- output->CopyFrom(input, &context_);
+ output->CopyFrom(input, true /*async*/);
CAFFE_ENFORCE_GT(
input.dim(),
for (int i = 0; i < num_images; i++) {
roi_counts += im_boxes[i].rows();
}
- out_rois->Extend(roi_counts, 50, &context_);
- out_rois_probs->Extend(roi_counts, 50, &context_);
+ out_rois->Extend(roi_counts, 50);
+ out_rois_probs->Extend(roi_counts, 50);
float* out_rois_ptr = out_rois->template mutable_data<float>();
float* out_rois_probs_ptr = out_rois_probs->template mutable_data<float>();
for (int i = 0; i < num_images; i++) {
if (num_entries == 0) {
if (!output_initialized) {
// Get both shape and meta
- output->CopyFrom(input, &context_);
+ output->CopyFrom(input, true /*async*/);
}
return true;
}
// output_num is >= output_batch_size
if (output_num > output_batch_size) {
- output->ExtendTo(output_num, 50, &context_);
+ output->ExtendTo(output_num, 50);
}
auto* output_data =
auto* output = Output(0);
output->ResizeLike(input0);
- output->CopyFrom(input0, &context_);
+ output->CopyFrom(input0, true /*async*/);
if (InputSize() == 1) {
return true;
for (int i = 1; i < num_inputs; i++) {
auto* cur_dX = Output(i);
cur_dX->ResizeLike(dY);
- cur_dX->CopyFrom(*dX0, &context_);
+ cur_dX->CopyFrom(*dX0, true /*async*/);
}
return true;
scan_outputs_sizes[i],
"Size of scan output changed across iterations");
dims.insert(dims.begin(), itr);
- scan_output_target->Extend(1, 100, &context_);
+ scan_output_target->Extend(1, 100);
int64_t timestep_size = 1;
for (const int64_t t : scan_outputs_sizes[i]) {
auto output_num =
std::min<size_t>(numToCollect_, output_batch_size + num_to_copy);
// output_num is >= output_batch_size
- output->ExtendTo(output_num, 50, &context_);
+ output->ExtendTo(output_num, 50);
if (pos_to_object) {
- pos_to_object->ExtendTo(output_num, 50, &context_);
+ pos_to_object->ExtendTo(output_num, 50);
}
auto* output_data =
(l + Hd - 1 > 0) ? ((H - region_size) / (1.0 * (l + Hd - 1))) : 0;
int cur_rows = output->dim32(0);
- output->Extend((l + Wd) * (l + Hd), 50, &context_);
+ output->Extend((l + Wd) * (l + Hd), 50);
auto* outputData = output->template mutable_data<float>() + cur_rows * 5;
for (int i = 0; i < l + Wd; ++i) {
// Replicate regions for all items in batch
int num_rois = output->dim32(0);
- output->Extend((batch_size - 1) * num_rois, 50, &context_);
+ output->Extend((batch_size - 1) * num_rois, 50);
auto* outputData = output->template mutable_data<float>();
for (int b = 1; b < batch_size; ++b) {
// Copy all rois
bool RunOnDevice() override {
if (startPaddingWidth_ == 0 && endPaddingWidth_ == 0) {
- Output(0)->CopyFrom(Input(0), &context_);
+ Output(0)->CopyFrom(Input(0), true /*async*/);
if (OutputSize() == 2) {
- Output(1)->CopyFrom(Input(1), &context_);
+ Output(1)->CopyFrom(Input(1), true /*async*/);
}
return true;
}
bool RunOnDevice() override {
if (startPaddingWidth_ == 0 && endPaddingWidth_ == 0) {
- Output(0)->CopyFrom(Input(0), &context_);
+ Output(0)->CopyFrom(Input(0), true /*async*/);
if (OutputSize() == 2) {
- Output(1)->CopyFrom(Input(1), &context_);
+ Output(1)->CopyFrom(Input(1), true /*async*/);
}
return true;
}
}
if (dim == -1) {
if (!backward) {
- output->CopyFrom(data, context);
+ output->CopyFrom(data, true /*async*/);
} else {
- gdata->CopyFrom(*go, context);
+ gdata->CopyFrom(*go, true /*async*/);
}
return true;
}
}
if (dim == -1) {
if (!backward) {
- output->CopyFrom(data, context);
+ output->CopyFrom(data, true /*async*/);
} else {
- gdata->CopyFrom(*go, context);
+ gdata->CopyFrom(*go, true /*async*/);
}
return true;
}
const auto& in = Input(0);
auto* out = Output(0);
if (out != &in) {
- out->CopyFrom(in, &context_);
+ out->CopyFrom(in, true /*async*/);
}
return true;
}
// This op should act as an identity op if we don't find any NaNs/infs.
// Copy over the data if we are not doing this in-place.
if (&X != Y) {
- Y->CopyFrom(X, &context_);
+ Y->CopyFrom(X, true /*async*/);
}
return true;
}
// allow the output to be copied from the input
if (&input != output) {
output->ResizeLike(input);
- output->CopyFrom(input, &context_);
+ output->CopyFrom(input, true /*async*/);
}
return true;
}
auto& input0 = Input(0);
auto* output = Output(0);
if (InputSize() == 1) {
- output->CopyFrom(input0, &context_);
+ output->CopyFrom(input0, true /*async*/);
return true;
}
output->ResizeLike(input0);
size,
" total columns");
- out->Extend(in.sizes()[0], kTensorGrowthPct, &context_);
+ out->Extend(in.sizes()[0], kTensorGrowthPct);
auto* dst =
(char*)out->raw_mutable_data() + oldSize * in.dtype().itemsize();
context_.template CopyItems<Context, Context>(
// prefetch function as well.
if (!std::is_same<Context, CPUContext>::value) {
if (get_rgb_) {
- prefetched_clip_rgb_on_device_.CopyFrom(prefetched_clip_rgb_, &context_);
+ prefetched_clip_rgb_on_device_.CopyFrom(
+ prefetched_clip_rgb_, true /*async*/);
}
if (get_optical_flow_) {
- prefetched_clip_of_on_device_.CopyFrom(prefetched_clip_of_, &context_);
+ prefetched_clip_of_on_device_.CopyFrom(
+ prefetched_clip_of_, true /*async*/);
}
- prefetched_label_on_device_.CopyFrom(prefetched_label_, &context_);
+ prefetched_label_on_device_.CopyFrom(prefetched_label_, true /*async*/);
if (get_video_id_) {
- prefetched_video_id_on_device_.CopyFrom(prefetched_video_id_, &context_);
+ prefetched_video_id_on_device_.CopyFrom(
+ prefetched_video_id_, true /*async*/);
}
}
return true;
auto* clip_rgb_output =
OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
if (std::is_same<Context, CPUContext>::value) {
- clip_rgb_output->CopyFrom(prefetched_clip_rgb_, &context_);
+ clip_rgb_output->CopyFrom(prefetched_clip_rgb_, true /*async*/);
} else {
- clip_rgb_output->CopyFrom(prefetched_clip_rgb_on_device_, &context_);
+ clip_rgb_output->CopyFrom(prefetched_clip_rgb_on_device_, true /*async*/);
}
}
if (get_optical_flow_) {
auto* clip_of_output =
OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
if (std::is_same<Context, CPUContext>::value) {
- clip_of_output->CopyFrom(prefetched_clip_of_, &context_);
+ clip_of_output->CopyFrom(prefetched_clip_of_, true /*async*/);
} else {
- clip_of_output->CopyFrom(prefetched_clip_of_on_device_, &context_);
+ clip_of_output->CopyFrom(prefetched_clip_of_on_device_, true /*async*/);
}
}
auto* label_output =
OperatorBase::Output<Tensor>(index++, Context::GetDeviceType());
if (std::is_same<Context, CPUContext>::value) {
- label_output->CopyFrom(prefetched_label_, &context_);
+ label_output->CopyFrom(prefetched_label_, true /*async*/);
} else {
- label_output->CopyFrom(prefetched_label_on_device_, &context_);
+ label_output->CopyFrom(prefetched_label_on_device_, true /*async*/);
}
if (get_video_id_) {
auto* video_id_output =
OperatorBase::Output<Tensor>(index, Context::GetDeviceType());
if (std::is_same<Context, CPUContext>::value) {
- video_id_output->CopyFrom(prefetched_video_id_, &context_);
+ video_id_output->CopyFrom(prefetched_video_id_, true /*async*/);
} else {
- video_id_output->CopyFrom(prefetched_video_id_on_device_, &context_);
+ video_id_output->CopyFrom(prefetched_video_id_on_device_, true /*async*/);
}
}
return true;