bool RunOnDevice() override {
auto& input = Input(0);
- auto* output = Output(0);
- if (output->sizes() != input.sizes()) {
- LOG(INFO) << "Reshaping and initializing output.";
- output->ResizeLike(input);
- math::Set<T, Context>(
- output->numel(), 0, output->template mutable_data<T>(), &context_);
- }
+
+ // TODO: the operator depends on output being set to 0 before the run
+ auto* output = Output(0, input.sizes(), at::dtype<T>());
math::Axpby<T, T, Context>(
input.numel(),
static_cast<T>(1),
bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
const auto& dY = Input(0);
const auto& scale = is_learnable_ ? Input(2) : Input(1);
- auto* dX = Output(0);
- dX->ResizeLike(dY);
+
+ auto* dX = Output(0, dY.sizes(), at::dtype<float>());
const int N = dY.dim32(0);
const int C = dY.dim32(1);
const int HxW = dY.size() / (N * C);
if (is_learnable_) {
const auto& X = Input(1);
const float* X_data = X.data<float>();
- auto* dscale = Output(1);
- auto* dbias = Output(2);
- dscale->ResizeLike(scale);
- dbias->ResizeLike(scale);
+
+ auto* dscale = Output(1, scale.sizes(), at::dtype<float>());
+ auto* dbias = Output(2, scale.sizes(), at::dtype<float>());
const int outer_size = N * HxW;
AffineChannelScaleBiasBackwardCUDAKernel<float, StorageOrder::NCHW>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
bool AffineChannelGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
const auto& dY = Input(0);
const auto& scale = is_learnable_ ? Input(2) : Input(1);
- auto* dX = Output(0);
- dX->ResizeLike(dY);
+
+ auto* dX = Output(0, dY.sizes(), at::dtype<float>());
const int ndim = dY.ndim();
const int C = dY.dim32(ndim - 1);
const int rows = dY.size() / C;
const float* X_data = X.data<float>();
const int N = X.dim32(0);
const int HxW = rows / N;
- auto* dscale = Output(1);
- auto* dbias = Output(2);
- dscale->ResizeLike(scale);
- dbias->ResizeLike(scale);
+
+ auto* dscale = Output(1, scale.sizes(), at::dtype<float>());
+ auto* dbias = Output(2, scale.sizes(), at::dtype<float>());
AffineChannelScaleBiasBackwardCUDAKernel<float, StorageOrder::NHWC>
<<<std::min(rows, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
auto& data = Input(DATA);
auto& indices = Input(INDICES);
auto& grad = Input(GRAD);
- auto* output = Output(0);
+
// ONNX allows negative axis to index from the back, valid range: [-r, r].
int axis = axis_;
data.size(acheck), grad.size(acheck), "batch sizes should be the same");
}
- output->ResizeLike(data);
+ auto* output = Output(0, data.sizes(), at::dtype<float>());
auto* out_data = output->template mutable_data<float>();
math::Set<float, CUDAContext>(output->size(), 0, out_data, &context_);
auto* destData = (uint8_t*)dest->raw_mutable_data(src.meta());
const auto* srcData = (uint8_t*)src.raw_data();
if (OutputSize() == 2) {
-
+
auto* indicesOut = Output(1, {numOfOutput}, at::dtype<int64_t>());
indicesOut->template mutable_data<int64_t>();
}
window_centers = &Input(1);
}
- auto* output = Output(0);
- output->ResizeLike(*input);
+ auto* output = Output(0, input->sizes(), at::dtype<T>());
const auto canonical_axis = input->canonical_axis_index(axis_);
template <>
bool CeilOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE_GT(X.size(), 0);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
CeilKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
template <>
bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
const auto& X = Input(0);
- auto* Y = Output(0);
- Y->ResizeLike(X);
+
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
const int N = X.dim32(0);
const int C = X.dim32(1);
const int G = this->group_;
template <>
bool ChannelShuffleOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
const auto& X = Input(0);
- auto* Y = Output(0);
- Y->ResizeLike(X);
+
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
const int ndim = X.ndim();
const int N = X.dim32(0);
const int C = X.dim32(ndim - 1);
template <>
bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
const auto& dY = Input(0);
- auto* dX = Output(0);
- dX->ResizeLike(dY);
+
+ auto* dX = Output(0, dY.sizes(), at::dtype<float>());
const int N = dY.dim32(0);
const int C = dY.dim32(1);
const int G = this->group_;
template <>
bool ChannelShuffleGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
const auto& dY = Input(0);
- auto* dX = Output(0);
- dX->ResizeLike(dY);
+
+ auto* dX = Output(0, dY.sizes(), at::dtype<float>());
const int ndim = dY.ndim();
const int N = dY.dim32(0);
const int C = dY.dim32(ndim - 1);
template <>
bool ClipOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE_GE(X.size(), 0);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
ClipKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
bool ClipGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
- auto* dX = Output(0);
+
CAFFE_ENFORCE_GE(Y.size(), 0);
CAFFE_ENFORCE_EQ(dY.size(), Y.size());
- dX->ResizeLike(Y);
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
ClipGradientKernel<<<
CAFFE_GET_BLOCKS(Y.size()),
CAFFE_CUDA_NUM_THREADS,
bool CosineEmbeddingCriterionOp<CUDAContext>::RunOnDevice() {
auto& S = Input(0);
auto& Y = Input(1);
- auto* output = Output(0);
+
CAFFE_ENFORCE(S.size() == Y.size(),
"The embedding and label should have the same size.");
- output->ResizeLike(S);
+ auto* output = Output(0, S.sizes(), at::dtype<float>());
const float* Sdata = S.data<float>();
const int* Ydata = Y.data<int>();
auto& S = Input(0);
auto& Y = Input(1);
auto& dOutput = Input(2);
- auto* dS = Output(0);
- dS->ResizeLike(S);
+
+ auto* dS = Output(0, S.sizes(), at::dtype<float>());
const float* Sdata = S.data<float>();
const int* Ydata = Y.data<int>();
auto& X = Input(0);
auto& label = Input(1);
auto& dY = Input(2);
- auto* dX = Output(0);
+
int N, D;
if (X.ndim() > 1) {
N = X.dim32(0);
CAFFE_ENFORCE_EQ(label.dim32(0), N);
CAFFE_ENFORCE_EQ(dY.ndim(), 1);
CAFFE_ENFORCE_EQ(dY.dim32(0), N);
- dX->ResizeLike(X);
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
math::Set<float, CUDAContext>(
dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
LabelCrossEntropyGradientKernel<<<
const auto outer_size = logits.size() / inner_size;
CAFFE_ENFORCE(g.size() == outer_size);
- auto* out = Output(0);
- out->ResizeLike(logits);
+
+ auto* out = Output(0, logits.sizes(), at::dtype<float>());
auto* out_ptr = out->template mutable_data<float>();
auto* logits_ptr = logits.data<float>();
const auto outer_size = logits.size() / inner_size;
CAFFE_ENFORCE(g.size() == outer_size);
- auto* out = Output(0);
- out->ResizeLike(logits);
+
+ auto* out = Output(0, logits.sizes(), at::dtype<float>());
auto* out_ptr = out->template mutable_data<float>();
auto* logits_ptr = logits.data<float>();
auto& offset = Input(OFFSET);
auto& filter = Input(FILTER);
auto& dY = Input(OUTPUT_GRAD);
- auto* dfilter = Output(FILTER_GRAD);
- auto* doffset = Output(OFFSET_GRAD);
+
const int N = X.dim32(0), C = X.dim32(1);
const vector<int> input_dims = this->GetDims(X);
}
CAFFE_ENFORCE(M % group_ == 0);
- dfilter->ResizeLike(filter);
- doffset->ResizeLike(offset);
+ auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
+ auto* doffset = Output(OFFSET_GRAD, offset.sizes(), at::dtype<T>());
// The dimension of each kernel
const int kernel_dim = C / group_ * kernel_dims_size;
T* dXdata = nullptr;
if (OutputSize() == 4 || (no_bias_ && (OutputSize() == 3))) {
- auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
- dX->ResizeLike(X);
+
+ auto* dX = Output(
+     no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
dXdata = dX->template mutable_data<T>();
math::Set<T, Context>(dX->size(), 0, dXdata, &context_);
}
auto& X = Input(0);
auto& Y = Input(1);
auto& dDistance = Input(2);
- auto* dX = Output(0);
- auto* dY = Output(1);
+
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = N > 0 ? X.size() / N : 0;
CAFFE_ENFORCE(X.ndim() == Y.ndim());
}
CAFFE_ENFORCE_EQ(dDistance.ndim(), 1);
CAFFE_ENFORCE_EQ(dDistance.dim32(0), N);
- dX->ResizeLike(X);
- dY->ResizeLike(Y);
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
+ auto* dY = Output(1, Y.sizes(), at::dtype<float>());
math::Sub<float, CUDAContext>(
X.size(),
X.data<float>(),
auto& X = Input(0);
auto& Y = Input(1);
auto& dDistance = Input(2);
- auto* dX = Output(0);
- auto* dY = Output(1);
+
int N = X.ndim() > 0 ? X.dim32(0) : 1;
int D = N > 0 ? X.size() / N : 0;
CAFFE_ENFORCE(X.ndim() == Y.ndim());
}
CAFFE_ENFORCE_EQ(dDistance.ndim(), 1);
CAFFE_ENFORCE_EQ(dDistance.dim32(0), N);
- dX->ResizeLike(X);
- dY->ResizeLike(Y);
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
+ auto* dY = Output(1, Y.sizes(), at::dtype<float>());
L1DistanceGradientKernel<<<
CAFFE_GET_BLOCKS(N * D),
auto& X = Input(X_IN);
auto& Y = Input(Y_IN);
auto& dCos = Input(DER_COS_IN);
- auto* dX = Output(DER_X_OUT);
- auto* dY = Output(DER_Y_OUT);
+
const int N = X.ndim() > 0 ? X.dim32(0) : 1;
const int D = X.size_from_dim(1);
CAFFE_ENFORCE(X.ndim() == Y.ndim());
}
CAFFE_ENFORCE(dCos.ndim() == 1);
CAFFE_ENFORCE(dCos.dim32(0) == N);
- dX->ResizeLike(X);
- dY->ResizeLike(Y);
+ auto* dX = Output(DER_X_OUT, X.sizes(), at::dtype<float>());
+ auto* dY = Output(DER_Y_OUT, Y.sizes(), at::dtype<float>());
const auto* X_data = X.data<float>();
const auto* Y_data = Y.data<float>();
auto& X = Input(X_IN);
auto& Y = Input(Y_IN);
auto& dDot = Input(DER_DOT_IN);
- auto* dX = Output(DER_X_OUT);
- auto* dY = Output(DER_Y_OUT);
+
int N, D;
if (X.size() > 0) {
N = X.ndim() > 0 ? X.dim32(0) : 1;
}
CAFFE_ENFORCE(dDot.ndim() == 1);
CAFFE_ENFORCE(dDot.dim32(0) == N);
- dX->ResizeLike(X);
- dY->ResizeLike(Y);
+ auto* dX = Output(DER_X_OUT, X.sizes(), at::dtype<float>());
+ auto* dY = Output(DER_Y_OUT, Y.sizes(), at::dtype<float>());
DotProductGradientKernel<<<
CAFFE_GET_BLOCKS(N * D),
CAFFE_CUDA_NUM_THREADS,
const auto& X = Input(0);
const auto& a = Input(1);
const auto& b = Input(2);
- auto* Y = Output(0);
+
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
CAFFE_ENFORCE_EQ(b.ndim(), 1, b.ndim());
CAFFE_ENFORCE_EQ(b.dim(0), D, b.ndim());
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
ElementwiseLinearKernel<<<
CAFFE_GET_BLOCKS(N * D),
CAFFE_ENFORCE_EQ(a.ndim(), 1, a.ndim());
CAFFE_ENFORCE_EQ(a.dim(0), D, a.ndim());
- auto* g_X = Output(0);
- auto *g_a = Output(1);
- auto *g_b = Output(2);
- g_X->ResizeLike(X);
- g_a->ResizeLike(a);
- g_b->ResizeLike(a);
+
+ auto* g_X = Output(0, X.sizes(), at::dtype<float>());
+ auto* g_a = Output(1, a.sizes(), at::dtype<float>());
+ auto* g_b = Output(2, a.sizes(), at::dtype<float>());
float* g_a_data = g_a->template mutable_data<float>();
float* g_b_data = g_b->template mutable_data<float>();
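For reference, every hunk above applies the same migration: fetching an untyped output blob and then resizing it to match a source tensor is collapsed into a single Output(index, sizes, options) call that allocates the output up front with an explicit dtype. A minimal before/after sketch of the pattern (the names X and Y are illustrative, not taken from any one operator above):

    // Before: fetch the output, then reshape it to match X. The element
    // type is only fixed later, by the first mutable_data<T>() call.
    auto* Y = Output(0);
    Y->ResizeLike(X);

    // After: a single call allocates the output with X's shape and an
    // explicit element type.
    auto* Y = Output(0, X.sizes(), at::dtype<float>());

Note that the conditional-reshape path removed in the first hunk also zero-filled the output whenever it was reallocated; the single-call form does not zero-initialize, which is what the surviving TODO about the output being set to 0 before the run refers to.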