}
};
+/**
+ * Reinitialize a Tensor to the given dims and options, if necessary.
+ * Note that this is a no-op when the Tensor already has the correct
+ * size and data type.
+ */
CAFFE2_API void ReinitializeTensor(Tensor* t, at::IntList dims, at::TensorOptions options);
CAFFE2_API void ReinitializeAndCopyFrom(
    Tensor* t,
    at::TensorOptions options,
    const Tensor& src);
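+// Illustrative usage (a minimal sketch; `buf_` and `N` are hypothetical):
+// the tensor is allocated on first use, and later calls with the same shape
+// and dtype are no-ops:
+//
+//   Tensor buf_;  // member; no device pinned at construction
+//   ReinitializeTensor(&buf_, {N}, at::dtype<float>().device(CPU));
+//   float* data = buf_.mutable_data<float>();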
bool RunOnDevice() override {
auto* output = Outputs()[0]->template GetMutable<Int8TensorCPU>();
- output->t.Resize(shape_);
+ ReinitializeTensor(&output->t, shape_, at::dtype<uint8_t>().device(CPU));
output->scale = scale_;
output->zero_point = zero_point_;
return Fill(output);
private:
void ExtractValues() {
auto source_values = this->template GetSingleArgument<string>("values", "");
- values_.Resize(source_values.size());
+ ReinitializeTensor(
+ &values_, {static_cast<int64_t>(source_values.size())}, at::dtype<uint8_t>().device(CPU));
uint8_t* values_data = values_.template mutable_data<uint8_t>();
for (int i = 0; i < source_values.size(); i++) {
values_data[i] = static_cast<uint8_t>(source_values[i]);
float scale_;
int32_t zero_point_;
vector<int64_t> shape_;
- Tensor values_{CPU};
+ Tensor values_;
};
class Int8GivenIntTensorFillOp final : public Operator<CPUContext> {
private:
void ExtractValues() {
auto source_values = this->template GetRepeatedArgument<int32_t>("values");
- values_.Resize(source_values.size());
+ ReinitializeTensor(
+ &values_, {static_cast<int64_t>(source_values.size())}, at::dtype<int32_t>().device(CPU));
auto* values_data = values_.template mutable_data<int32_t>();
for (int i = 0; i < source_values.size(); i++) {
values_data[i] = static_cast<int32_t>(source_values[i]);
float scale_;
int32_t zero_point_;
vector<int64_t> shape_;
- Tensor values_{CPU};
+ Tensor values_;
};
} // namespace int8
const int OW = IW * width_scale_;
const int OH = IH * height_scale_;
- Y->t.Resize(N, OH, OW, C);
+ ReinitializeTensor(&Y->t, {N, OH, OW, C}, at::dtype<uint8_t>().device(CPU));
Y->scale = X.scale;
Y->zero_point = X.zero_point;
assert(sampling_ratio_ >= 0);
// only supports NHWC now
- Y->t.Resize(R.dim32(0), pooled_height_, pooled_width_, X.t.dim32(3));
+ ReinitializeTensor(
+ &Y->t,
+ {R.dim32(0), pooled_height_, pooled_width_, X.t.dim32(3)},
+ at::dtype<uint8_t>().device(CPU));
int output_size = Y->t.numel();
ROIAlignForward(
template <typename SIndex>
bool DoRunWithType() {
if (InputSize() > 1) {
- starts_host_.CopyFrom(Input(1));
- ends_host_.CopyFrom(Input(2));
+ ReinitializeAndCopyFrom(&starts_host_, at::dtype<SIndex>().device(CPU), Input(1));
+ ReinitializeAndCopyFrom(&ends_host_, at::dtype<SIndex>().device(CPU), Input(2));
} else {
if (!statically_inited_) {
CAFFE_ENFORCE(HasArgument("starts"));
CAFFE_ENFORCE(HasArgument("ends"));
CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());
- starts_host_.Resize(starts_.size());
- ends_host_.Resize(ends_.size());
+ ReinitializeTensor(
+ &starts_host_, {static_cast<int64_t>(starts_.size())}, at::dtype<SIndex>().device(CPU));
+ ReinitializeTensor(
+ &ends_host_, {static_cast<int64_t>(ends_.size())}, at::dtype<SIndex>().device(CPU));
memcpy(
starts_host_.template mutable_data<SIndex>(),
auto r = caffe2::make_unique<int8::Int8TensorCPU>();
r->scale = 0.01;
r->zero_point = static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) / 2;
- r->t.Resize(dims);
+ ReinitializeTensor(&r->t, dims, at::dtype<uint8_t>().device(CPU));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<uint8_t> dis;
// Compute number of regions
int min_step = 1;
int max_step = 6;
- num_rois_.Resize(3); // num_rois, Wd, Hd
+ ReinitializeTensor(&num_rois_, {3}, at::dtype<int>().device(CUDA)); // num_rois, Wd, Hd
NumRMACRegionsKernel<<<
1,
CAFFE_CUDA_NUM_THREADS,
protected:
int scales_;
float overlap_;
- Tensor num_rois_{Context::GetDeviceType()};
+ Tensor num_rois_;
};
} // namespace caffe2
CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
int64_t slize_sz = data.size_from_dim(1);
- K_tensor_.Resize(1);
+ ReinitializeTensor(&K_tensor_, {1}, at::dtype<SIndex>().device(CUDA));
// Get maximum segment id so we can size the output.
// This must be done synchronously with the host.
if (segment_ids.size() > 4096) {
context_.cuda_stream());
// The second call does the real computation.
- buffer_tensor_.Resize(tmp_storage_bytes);
+ ReinitializeTensor(
+     &buffer_tensor_,
+     {static_cast<int64_t>(tmp_storage_bytes)},
+     at::dtype<char>().device(CUDA));
cub::DeviceReduce::Max(
static_cast<void*>(buffer_tensor_.mutable_data<char>()),
tmp_storage_bytes,
nullptr);
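+ // The cub size-query/compute idiom used above, sketched (argument names
+ // are illustrative):
+ //   size_t tmp_bytes = 0;
+ //   cub::DeviceReduce::Max(nullptr, tmp_bytes, d_in, d_out, n);  // query
+ //   ReinitializeTensor(&buffer_tensor_, {static_cast<int64_t>(tmp_bytes)},
+ //                      at::dtype<char>().device(CUDA));
+ //   cub::DeviceReduce::Max(buffer_tensor_.mutable_data<char>(), tmp_bytes,
+ //                          d_in, d_out, n);  // real reduction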
} else {
// For mean, we need to compute scaling factors
- scaling_factors_.Resize(K + 1);
+ ReinitializeTensor(&scaling_factors_, {K + 1}, at::dtype<int>().device(CUDA));
math::Set<int, CUDAContext>(
scaling_factors_.size(),
int(0),
}
private:
- Tensor buffer_tensor_{CUDA};
- Tensor K_tensor_{CUDA};
- Tensor scaling_factors_{CUDA}; // for mean
+ Tensor buffer_tensor_;
+ Tensor K_tensor_;
+ Tensor scaling_factors_; // for mean
};
template <typename SIndex>
K += 1;
if (segment_len_.size() != K) {
- segment_len_.Resize(K);
+ ReinitializeTensor(&segment_len_, {K}, at::dtype<SIndex>().device(CUDA));
}
math::Set<SIndex, CUDAContext>(
}
private:
- Tensor segment_len_{CUDA}; // for mean
+ Tensor segment_len_; // for mean
};
REGISTER_CUDA_OPERATOR_STR(
auto& data = Input(0);
if (InputSize() > 1) {
- starts_host_.CopyFrom(Input(1));
- ends_host_.CopyFrom(Input(2));
+ ReinitializeAndCopyFrom(&starts_host_, at::dtype<SIndex>().device(CPU), Input(1));
+ ReinitializeAndCopyFrom(&ends_host_, at::dtype<SIndex>().device(CPU), Input(2));
} else {
if (!statically_inited_) {
CAFFE_ENFORCE(HasArgument("starts"));
CAFFE_ENFORCE(HasArgument("ends"));
CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());
- starts_host_.Resize(starts_.size());
- ends_host_.Resize(ends_.size());
+ ReinitializeTensor(
+     &starts_host_, {static_cast<int64_t>(starts_.size())}, at::dtype<SIndex>().device(CPU));
+ ReinitializeTensor(
+     &ends_host_, {static_cast<int64_t>(ends_.size())}, at::dtype<SIndex>().device(CPU));
memcpy(
starts_host_.mutable_data<SIndex>(),
std::vector<int64_t> starts_;
std::vector<int64_t> ends_;
bool statically_inited_;
- Tensor starts_host_{CPU};
- Tensor ends_host_{CPU};
+ Tensor starts_host_;
+ Tensor ends_host_;
}; // class SliceOp<CUDAContext>
auto& data = Input(0);
if (InputSize() == 4) {
- starts_host_.CopyFrom(Input(1));
- ends_host_.CopyFrom(Input(2));
+ ReinitializeAndCopyFrom(&starts_host_, at::dtype<SIndex>().device(CPU), Input(1));
+ ReinitializeAndCopyFrom(&ends_host_, at::dtype<SIndex>().device(CPU), Input(2));
auto& go = Input(3);
CAFFE_ENFORCE(HasArgument("ends"));
CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());
- starts_host_.Resize(starts_.size());
- ends_host_.Resize(ends_.size());
+ ReinitializeTensor(
+     &starts_host_, {static_cast<int64_t>(starts_.size())}, at::dtype<SIndex>().device(CPU));
+ ReinitializeTensor(
+     &ends_host_, {static_cast<int64_t>(ends_.size())}, at::dtype<SIndex>().device(CPU));
memcpy(
starts_host_.mutable_data<SIndex>(),
std::vector<int64_t> starts_;
std::vector<int64_t> ends_;
bool statically_inited_;
- Tensor starts_host_{CPU};
- Tensor ends_host_{CPU};
+ Tensor starts_host_;
+ Tensor ends_host_;
}; // class SliceGradientOp<CUDAContext>
REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp<CUDAContext>);
} // namespace caffe2
template <typename SIndex>
bool DoRunWithType() {
if (InputSize() > 1) {
- starts_host_.CopyFrom(Input(1));
- ends_host_.CopyFrom(Input(2));
+ ReinitializeAndCopyFrom(&starts_host_, at::dtype<SIndex>().device(CPU), Input(1));
+ ReinitializeAndCopyFrom(&ends_host_, at::dtype<SIndex>().device(CPU), Input(2));
} else {
if (!statically_inited_) {
CAFFE_ENFORCE(HasArgument("starts"));
CAFFE_ENFORCE(HasArgument("ends"));
CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());
- starts_host_.Resize(starts_.size());
- ends_host_.Resize(ends_.size());
+ ReinitializeTensor(
+     &starts_host_, {static_cast<int64_t>(starts_.size())}, at::dtype<SIndex>().device(CPU));
+ ReinitializeTensor(
+     &ends_host_, {static_cast<int64_t>(ends_.size())}, at::dtype<SIndex>().device(CPU));
memcpy(
starts_host_.template mutable_data<SIndex>(),
std::vector<int64_t> starts_;
std::vector<int64_t> ends_;
bool statically_inited_;
- Tensor starts_host_{CPU};
- Tensor ends_host_{CPU};
+ Tensor starts_host_;
+ Tensor ends_host_;
};
template <class Context>
auto& data = Input(0);
if (InputSize() == 4) {
- starts_host_.CopyFrom(Input(1));
- ends_host_.CopyFrom(Input(2));
+ ReinitializeAndCopyFrom(&starts_host_, at::dtype<SIndex>().device(CPU), Input(1));
+ ReinitializeAndCopyFrom(&ends_host_, at::dtype<SIndex>().device(CPU), Input(2));
auto& go = Input(3);
CAFFE_ENFORCE(HasArgument("ends"));
CAFFE_ENFORCE_EQ(starts_.size(), ends_.size());
- starts_host_.Resize(starts_.size());
- ends_host_.Resize(ends_.size());
+ ReinitializeTensor(
+ &starts_host_, {static_cast<int64_t>(starts_.size())}, at::dtype<SIndex>().device(CPU));
+ ReinitializeTensor(
+ &ends_host_, {static_cast<int64_t>(ends_.size())}, at::dtype<SIndex>().device(CPU));
memcpy(
starts_host_.template mutable_data<SIndex>(),
std::vector<int64_t> starts_;
std::vector<int64_t> ends_;
bool statically_inited_;
- Tensor starts_host_{CPU};
- Tensor ends_host_{CPU};
+ Tensor starts_host_;
+ Tensor ends_host_;
};
} // namespace caffe2
const int D = X.size_from_dim(canonical_axis);
auto* Y = Output(0, X.sizes(), at::dtype<float>());
float* Ydata = Y->template mutable_data<float>();
+ // ReinitializeTensor is itself a no-op when the size and data type already
+ // match, so there is no need to check the Tensor's numel() first.
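+ // A minimal sketch of that check (assumed from the doc comment on
+ // ReinitializeTensor, not its verbatim implementation):
+ //   if (t->sizes() == dims && t->dtype() == options.dtype()) return;
+ //   /* otherwise resize and set the data type */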
// First, get scales
- if (scale_.numel() != N) {
- scale_.Resize(N);
- }
- if (rowmax_.numel() != N) {
- rowmax_.Resize(N);
- }
- if (sum_multiplier_.numel() != D) {
- sum_multiplier_.Resize(D);
- math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
- &context_);
- }
+ ReinitializeTensor(&scale_, {N}, at::dtype<float>().device(CPU));
+ ReinitializeTensor(&rowmax_, {N}, at::dtype<float>().device(CPU));
+ ReinitializeTensor(&sum_multiplier_, {D}, at::dtype<float>().device(CPU));
+ math::Set<float, CPUContext>(
+     D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
SoftmaxCPU(
context_,
auto& dY = Input(1);
const auto canonical_axis = Y.canonical_axis_index(axis_);
- const int N = Y.size_to_dim(canonical_axis);
- const int D = Y.size_from_dim(canonical_axis);
+ const int64_t N = Y.size_to_dim(canonical_axis);
+ const int64_t D = Y.size_from_dim(canonical_axis);
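+ // (int64_t guards against overflow here: size_to_dim/size_from_dim are
+ // products of dimensions and can exceed INT_MAX on large tensors.)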
// First, get scales
if (scale_.numel() != N) {
- scale_.Resize(N);
+ ReinitializeTensor(
+ &scale_, {N}, at::dtype<float>().device(CPU));
}
if (sum_multiplier_.numel() != D) {
- sum_multiplier_.Resize(D);
+ ReinitializeTensor(
+ &sum_multiplier_,
+ {D},
+ at::dtype<float>().device(CPU));
math::Set<float, CPUContext>(D, 1.f, sum_multiplier_.mutable_data<float>(),
&context_);
}
protected:
int axis_;
- Tensor scale_{Context::GetDeviceType()};
- Tensor rowmax_{Context::GetDeviceType()};
- Tensor sum_multiplier_{Context::GetDeviceType()};
+ Tensor scale_;
+ Tensor rowmax_;
+ Tensor sum_multiplier_;
};
template <typename T, class Context>
protected:
int axis_;
- Tensor scale_{Context::GetDeviceType()};
- Tensor sum_multiplier_{Context::GetDeviceType()};
+ Tensor scale_;
+ Tensor sum_multiplier_;
};
} // namespace caffe2
N = X.size_to_dim(canonical_axis); // batch size
D = X.size_from_dim(canonical_axis);
P->ResizeLike(X);
- total_weight_ptr_.Resize(1);
+ ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
if (label_prob_mode_) {
CAFFE_ENFORCE_GE(T.ndim(), 2);
auto* avg_loss =
Output(1, vector<int64_t>(), at::dtype<float>()); // Average loss
if (losses_.size() != N) {
- losses_.Resize(N);
+ ReinitializeTensor(&losses_, {N}, at::dtype<float>().device(CUDA));
}
if (rowmax_.size() != N) {
- rowmax_.Resize(N);
+ ReinitializeTensor(&rowmax_, {N}, at::dtype<float>().device(CUDA));
}
if (sum_multiplier_.size() != D) {
- sum_multiplier_.Resize(D);
+ ReinitializeTensor(&sum_multiplier_, {D}, at::dtype<float>().device(CUDA));
math::Set<float, CUDAContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
N = X.dim32(0);
D = X.dim32(1);
P->ResizeLike(X);
- total_weight_ptr_.Resize(1);
+ ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
CAFFE_ENFORCE_EQ(X.ndim(), 4);
CAFFE_ENFORCE_EQ(T.ndim(), 3);
CAFFE_ENFORCE_EQ(T.dim32(0), N);
int H = X.dim32(2);
int W = X.dim32(3);
if (losses_.size() != N * W * H) {
- losses_.Resize(N * W * H);
+ ReinitializeTensor(&losses_, {N * W * H}, at::dtype<float>().device(CUDA));
}
if (weights_.size() != N * W * H) {
- weights_.Resize(N * W * H);
+ ReinitializeTensor(&weights_, {N * W * H}, at::dtype<float>().device(CUDA));
}
const float* Xdata = X.data<float>();
N = X.size_to_dim(canonical_axis); // batch size
D = X.size_from_dim(canonical_axis);
- total_weight_ptr_.Resize(1);
+ ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
if (label_prob_mode_) {
CAFFE_ENFORCE_GE(T.ndim(), 2);
N = X.dim32(0);
D = X.dim32(1);
- total_weight_ptr_.Resize(1);
+ ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
// Spatial mode, compute softmax for each x, y location
CAFFE_ENFORCE_EQ(X.ndim(), 4);
CAFFE_ENFORCE_EQ(T.ndim(), 3);
int W = X.dim32(3);
dX->ResizeLike(X);
if (weights_.size() != N * W * H) {
- weights_.Resize(N * W * H);
+ ReinitializeTensor(&weights_, {N * W * H}, at::dtype<float>().device(CUDA));
}
const float* Pdata = P.data<float>();
return true;
}
if (sum_multiplier_.size() != D) {
- sum_multiplier_.Resize(D);
+ ReinitializeTensor(&sum_multiplier_, {D}, at::dtype<float>().device(CUDA));
math::Set<float, CUDAContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
if (scale_.size() != N) {
- scale_.Resize(N);
+ ReinitializeTensor(&scale_, {N}, at::dtype<float>().device(CUDA));
}
if (rowmax_.size() != N) {
- rowmax_.Resize(N);
+ ReinitializeTensor(&rowmax_, {N}, at::dtype<float>().device(CUDA));
}
Softmax(
N,
auto& T = Input(1); // Labels / targets
const auto canonical_axis = X.canonical_axis_index(axis_);
- int N, D;
+ int64_t N, D;
N = X.size_to_dim(canonical_axis); // batch size
D = X.size_from_dim(canonical_axis);
auto* P =
}
if (sum_multiplier_.numel() != D) {
- sum_multiplier_.Resize(D);
+ ReinitializeTensor(
+ &sum_multiplier_,
+ {D},
+ at::dtype<float>().device(CPU));
math::Set<float, CPUContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
- rowmax_.Resize(N);
- losses_.Resize(N);
+ ReinitializeTensor(
+ &rowmax_, {N}, at::dtype<float>().device(CPU));
+ ReinitializeTensor(
+ &losses_, {N}, at::dtype<float>().device(CPU));
SoftmaxCPU(
context_,
StorageOrder order_;
int axis_;
- Tensor losses_{Context::GetDeviceType()}; // Per example loss
- Tensor rowmax_{Context::GetDeviceType()}; // per example row max
+ Tensor losses_; // Per example loss
+ Tensor rowmax_; // per example row max
Tensor weights_{Context::GetDeviceType()}; // unignored weights
- Tensor sum_multiplier_{
- Context::GetDeviceType()}; // Vector of ones for summing via dot prod
- Tensor total_weight_ptr_{Context::GetDeviceType()};
+ Tensor sum_multiplier_; // Vector of ones for summing via dot prod
+ Tensor total_weight_ptr_;
Tensor scratch_{Context::GetDeviceType()};
};
int label_prob_mode_;
Tensor sum_multiplier_{Context::GetDeviceType()};
Tensor weights_{Context::GetDeviceType()}; // unignored weights
- Tensor total_weight_ptr_{Context::GetDeviceType()};
+ Tensor total_weight_ptr_;
StorageOrder order_;
bool only_loss_;
int axis_;
// Awkward way to get the max element to make it work with both CUDA
// and CPU.
- max_element_.Resize(1);
+ ReinitializeTensor(&max_element_, {1}, at::dtype<TInd>().device(Context::GetDeviceType()));
TInd* max_element_ptr = max_element_.template mutable_data<TInd>();
math::ReduceMax<TInd>(sparse_indices_len, sparse_indices_vec, max_element_ptr,
&scratch_, &context_);
int output_first_dim_;
Tensor scratch_{Context::GetDeviceType()};
Tensor max_element_host_{CPU};
- Tensor max_element_{Context::GetDeviceType()};
+ Tensor max_element_;
INPUT_TAGS(INDICES, VALUES, DATA_TO_INFER_DIM);
};
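// Sketch of the device-reduce-then-host-read pattern above (illustrative;
// assumes max_element_host_ is the CPU staging tensor declared above):
//   math::ReduceMax<TInd>(
//       n, indices, max_element_.template mutable_data<TInd>(),
//       &scratch_, &context_);
//   max_element_host_.CopyFrom(max_element_);  // device -> host copy
//   const TInd max_index = max_element_host_.template data<TInd>()[0];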
const T* scale_data = scale.template data<T>();
const T* bias_data = bias.template data<T>();
T* Y_data = Y->template mutable_data<T>();
- alpha_.Resize(C);
- beta_.Resize(C);
+ ReinitializeTensor(
+ &alpha_, {C}, at::dtype<T>().device(Context::GetDeviceType()));
+ ReinitializeTensor(
+ &beta_, {C}, at::dtype<T>().device(Context::GetDeviceType()));
T* alpha_data = alpha_.template mutable_data<T>();
T* beta_data = beta_.template mutable_data<T>();
if (is_test_) {
const StorageOrder order_;
const int num_batches_;
- Tensor alpha_{Context::GetDeviceType()};
- Tensor beta_{Context::GetDeviceType()};
+ Tensor alpha_;
+ Tensor beta_;
INPUT_TAGS(
INPUT,
math::Set<T, Context>(C, T(0), dbias_data, &context_);
return true;
}
- alpha_.Resize(C);
- beta_.Resize(C);
- gamma_.Resize(C);
+ ReinitializeTensor(
+ &alpha_, {C}, at::dtype<T>().device(Context::GetDeviceType()));
+ ReinitializeTensor(
+ &beta_, {C}, at::dtype<T>().device(Context::GetDeviceType()));
+ ReinitializeTensor(
+ &gamma_, {C}, at::dtype<T>().device(Context::GetDeviceType()));
T* alpha_data = alpha_.template mutable_data<T>();
T* beta_data = beta_.template mutable_data<T>();
T* gamma_data = gamma_.template mutable_data<T>();
const StorageOrder order_;
const int num_batches_;
- Tensor alpha_{Context::GetDeviceType()};
- Tensor beta_{Context::GetDeviceType()};
- Tensor gamma_{Context::GetDeviceType()};
+ Tensor alpha_;
+ Tensor beta_;
+ Tensor gamma_;
INPUT_TAGS(
INPUT,
Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
if (sum_multiplier_.numel() != D) {
- sum_multiplier_.Resize(D);
+ ReinitializeTensor(
+ &sum_multiplier_,
+ {D},
+ at::dtype<float>().device(CPU));
math::Set<float, CPUContext>(
D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
}
float scale_;
StorageOrder order_;
- Tensor losses_{Context::GetDeviceType()}; // Per example loss
+ Tensor losses_; // Per example loss
Tensor rowmax_{Context::GetDeviceType()}; // per example row max
- Tensor weights_{Context::GetDeviceType()}; // unignored weights
- Tensor sum_multiplier_{
- Context::GetDeviceType()}; // Vector of ones for summing via dot prod
- Tensor total_weight_ptr_{Context::GetDeviceType()};
+ Tensor weights_; // unignored weights
+ Tensor sum_multiplier_; // Vector of ones for summing via dot prod
+ Tensor total_weight_ptr_;
Tensor scratch_{Context::GetDeviceType()};
};
protected:
float scale_;
Tensor sum_multiplier_{Context::GetDeviceType()};
- Tensor weights_{Context::GetDeviceType()}; // unignored weights
- Tensor total_weight_ptr_{Context::GetDeviceType()};
+ Tensor weights_; // unignored weights
+ Tensor total_weight_ptr_;
StorageOrder order_;
bool only_loss_;
Tensor scratch_{Context::GetDeviceType()};
int axis_;
// Buffers for CUDAContext.
- Tensor input_transposed_buffer_{CUDA};
- Tensor values_transposed_buffer_{CUDA};
- Tensor indices_transposed_buffer_{CUDA};
+ Tensor input_transposed_buffer_;
+ Tensor values_transposed_buffer_;
+ Tensor indices_transposed_buffer_;
// Shape tensors on device for CUDAContext.
Tensor input_dims_device_{CUDA};
static_cast<int>(inner_size),
static_cast<int>(next_size)};
const std::array<int, 3> axes = {0, 2, 1};
- input_transposed_buffer_.Resize(
- std::vector<int64_t>{outer_size, inner_size});
- values_transposed_buffer_.Resize(std::vector<int64_t>{outer_size, k_});
- indices_transposed_buffer_.Resize(std::vector<int64_t>{outer_size, k_});
+ ReinitializeTensor(
+     &input_transposed_buffer_,
+     std::vector<int64_t>{outer_size, inner_size},
+     at::dtype<T>().device(CUDA));
+ ReinitializeTensor(
+     &values_transposed_buffer_,
+     std::vector<int64_t>{outer_size, k_},
+     at::dtype<T>().device(CUDA));
+ ReinitializeTensor(
+     &indices_transposed_buffer_,
+     std::vector<int64_t>{outer_size, k_},
+     at::dtype<int64_t>().device(CUDA));
math::Transpose(
3,
dims.data(),
// Add bias term
if (bias_multiplier_.numel() != batch_size) {
// If the helper bias multiplier does not have batch_size elements, reshape
// it and fill it with ones.
- bias_multiplier_.Resize(batch_size);
+ ReinitializeTensor(
+ &bias_multiplier_,
+ {batch_size},
+ at::dtype<T>().device(Context::GetDeviceType()));
math::Set<T, Context>(
batch_size,
static_cast<T>(1),
}
protected:
- Tensor bias_multiplier_{Context::GetDeviceType()};
+ Tensor bias_multiplier_;
std::vector<int> inp_sizes_;
std::vector<int> out_sizes_;
std::vector<int> tt_ranks_;
}
const T* input = inputTensor.template data<T>();
- thrust_unique_buffer_.Resize(N);
+ ReinitializeTensor(&thrust_unique_buffer_, {N}, at::dtype<T>().device(Context::GetDeviceType()));
auto* buffer = thrust_unique_buffer_.template mutable_data<T>();
context_.CopyItemsSameDevice(inputTensor.meta(), N, input, buffer);
private:
vector<int> order_;
- Tensor thrust_unique_buffer_{Context::GetDeviceType()};
+ Tensor thrust_unique_buffer_;
Tensor cuda_order_buffer_{Context::GetDeviceType()};
Tensor second_order_buffer_{Context::GetDeviceType()};
const size_t N = X.size();
const float* data_ptr = X.data<float>();
- scratch_.Resize(1);
+ ReinitializeTensor(&scratch_, {1}, at::dtype<bool>().device(CUDA));
math::Set<bool, CUDAContext>(
1, false, scratch_.mutable_data<bool>(), &context_);
NanCheckKernel<<<
// consecutively in device memory, copy pointers to a host vector and then
// copy back into a device array.
const int64_t B = (InputSize() - 3) / 2;
- x_data_host_.Resize(B);
- weights_host_.Resize(B);
- x_data_device_.Resize(B);
- weights_device_.Resize(B);
+ ReinitializeTensor(&x_data_host_, {B}, at::dtype<const float*>().device(CPU));
+ ReinitializeTensor(&weights_host_, {B}, at::dtype<const float*>().device(CPU));
+ ReinitializeTensor(&x_data_device_, {B}, at::dtype<const float*>().device(CUDA));
+ ReinitializeTensor(&weights_device_, {B}, at::dtype<const float*>().device(CUDA));
const float** x_data_host = x_data_host_.mutable_data<const float*>();
const float** weights_host = weights_host_.mutable_data<const float*>();
private:
TensorPrinter tensorPrinter_;
- Tensor scratch_{Context::GetDeviceType()};
+ Tensor scratch_;
};
struct GetNanCheckGradient : public GradientMakerBase {
}
return true;
}
- Tensor x_data_host_{CPU};
- Tensor weights_host_{CPU};
- Tensor x_data_device_{Context::GetDeviceType()};
- Tensor weights_device_{Context::GetDeviceType()};
+ Tensor x_data_host_;
+ Tensor weights_host_;
+ Tensor x_data_device_;
+ Tensor weights_device_;
};
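// Sketch of the pointer-staging pattern above (illustrative; how each device
// pointer is obtained is elided):
//   const float** xs = x_data_host_.mutable_data<const float*>();
//   for (int64_t i = 0; i < B; ++i) {
//     xs[i] = /* device pointer of the i-th input */;
//   }
//   context_.CopyFromCPU<const float*>(
//       B, xs, x_data_device_.mutable_data<const float*>());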
/**
if (batch_size > 0 && weights_dim > 0) {
auto* out_idx = Output(0, {batch_size, 1}, at::dtype<int>());
- unif_samples_.Resize(batch_size);
+ ReinitializeTensor(&unif_samples_, {batch_size}, at::dtype<float>().device(CUDA));
const float* in_weights_data = in_weights.data<float>();
const float* in_val_data = nullptr;
private:
vector<float> cum_mass_;
- Tensor unif_samples_{Context::GetDeviceType()};
+ Tensor unif_samples_;
};
} // namespace caffe2
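// The recurring migration pattern in this change, side by side (illustrative):
//
//   // before: device pinned at construction; dtype fixed only at the first
//   // mutable_data<T>() call
//   Tensor buf_{CPU};
//   buf_.Resize(n);
//
//   // after: no allocation until first use; shape, dtype, and device are all
//   // explicit at the reinitialization site
//   Tensor buf_;
//   ReinitializeTensor(&buf_, {n}, at::dtype<float>().device(CPU));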