bool FindOp<CUDAContext>::DoRunWithType() {
auto& idx = Input(0);
auto& needles = Input(1);
- auto* res_indices = Output(0);
- res_indices->ResizeLike(needles);
+
+ auto* res_indices = Output(0, needles.sizes(), at::dtype<int>());
const T* idx_data = idx.data<T>();
const T* needles_data = needles.data<T>();
template <>
bool FloorOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE_GT(X.size(), 0);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
FloorKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
template <>
bool FloatToHalfOp<CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
- Y->ResizeLike(X);
+
+ auto* Y = Output(0, X.sizes(), at::dtype<at::Half>());
FloatToHalfKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
template <>
bool HalfToFloatOp<CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
- Y->ResizeLike(X);
+
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
HalfToFloatKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
const auto& input = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
- auto output = Output(OUTPUT);
+
auto mean = OutputSize() >= 2 ? Output(MEAN) : &mean_;
auto inv_stdev = OutputSize() >= 3 ? Output(INV_STDEV) : &inv_stdev_;
CAFFE_ENFORCE_EQ(4, input.ndim());
CAFFE_ENFORCE_EQ(C, scale.dim32(0));
CAFFE_ENFORCE_EQ(1, bias.ndim());
CAFFE_ENFORCE_EQ(C, bias.dim32(0));
- output->ResizeLike(input);
+ auto output = Output(OUTPUT, input.sizes(), at::dtype<float>());
mean->Resize(N, C);
inv_stdev->Resize(N, C);
const auto& input = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
- auto output = Output(OUTPUT);
+
auto mean = OutputSize() >= 2 ? Output(MEAN) : &mean_;
auto inv_stdev = OutputSize() >= 3 ? Output(INV_STDEV) : &inv_stdev_;
CAFFE_ENFORCE_EQ(4, input.ndim());
CAFFE_ENFORCE_EQ(C, scale.dim32(0));
CAFFE_ENFORCE_EQ(1, bias.ndim());
CAFFE_ENFORCE_EQ(C, bias.dim32(0));
- output->ResizeLike(input);
+ auto output = Output(OUTPUT, input.sizes(), at::dtype<float>());
mean->Resize(N, C);
inv_stdev->Resize(N, C);
const auto& output_grad = Input(OUTPUT_GRAD);
const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_;
const auto& inv_stdev = InputSize() >= 6 ? Input(INV_STDEV) : inv_stdev_;
- auto input_grad = Output(INPUT_GRAD);
- auto scale_grad = Output(SCALE_GRAD);
- auto bias_grad = Output(BIAS_GRAD);
+
CAFFE_ENFORCE_EQ(4, input.ndim());
const int N = input.dim32(0);
const int H = input.dim32(1);
CAFFE_ENFORCE_EQ(H, output_grad.dim32(1));
CAFFE_ENFORCE_EQ(W, output_grad.dim32(2));
CAFFE_ENFORCE_EQ(C, output_grad.dim32(3));
- input_grad->ResizeLike(input);
- scale_grad->ResizeLike(scale);
- bias_grad->ResizeLike(bias);
+ auto input_grad = Output(INPUT_GRAD, input.sizes(), at::dtype<float>());
+ auto scale_grad = Output(SCALE_GRAD, scale.sizes(), at::dtype<float>());
+ auto bias_grad = Output(BIAS_GRAD, bias.sizes(), at::dtype<float>());
const auto input_data = input.data<float>();
const auto scale_data = scale.data<float>();
const auto& output_grad = Input(OUTPUT_GRAD);
const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_;
const auto& inv_stdev = InputSize() >= 6 ? Input(INV_STDEV) : inv_stdev_;
- auto input_grad = Output(INPUT_GRAD);
- auto scale_grad = Output(SCALE_GRAD);
- auto bias_grad = Output(BIAS_GRAD);
+
CAFFE_ENFORCE_EQ(4, input.ndim());
const int N = input.dim32(0);
const int C = input.dim32(1);
CAFFE_ENFORCE_EQ(C, output_grad.dim32(1));
CAFFE_ENFORCE_EQ(H, output_grad.dim32(2));
CAFFE_ENFORCE_EQ(W, output_grad.dim32(3));
- input_grad->ResizeLike(input);
- scale_grad->ResizeLike(scale);
- bias_grad->ResizeLike(bias);
+ auto input_grad = Output(INPUT_GRAD, input.sizes(), at::dtype<float>());
+ auto scale_grad = Output(SCALE_GRAD, scale.sizes(), at::dtype<float>());
+ auto bias_grad = Output(BIAS_GRAD, bias.sizes(), at::dtype<float>());
const auto input_data = input.data<float>();
const auto scale_data = scale.data<float>();
template <>
bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
-
+
CAFFE_ENFORCE(X.ndim() == 4, "Only supports 4D tensors for the momement");
// Input is (N, C, H, W)
auto& X = Input(0); // Original input to "forward" op
auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op
// (aka "gradOutput")
- auto* dX = Output(0); // Gradient of net w.r.t. input to
- // "forward" op (aka "gradInput")
- dX->ResizeLike(X);
+ auto* dX = Output(
+ 0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
+ // "forward" op (aka "gradInput")
+
// Row pass reduces shape of dY from (N, C, H + 1, W + 1)
// to (N, C, H + 1, W)
// Col pass reduces shape to (N, C, H, W)
bool LeakyReluOp<float, CUDAContext>::RunOnDevice() {
const auto& X = Input(0);
CAFFE_ENFORCE_GT(X.size(), 0);
- auto* Y = Output(0);
- Y->ResizeLike(X);
+
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
LeakyReluKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
bool LeakyReluGradientOp<float, CUDAContext>::RunOnDevice() {
const auto& Y = Input(0);
const auto& dY = Input(1);
- auto* dX = Output(0);
- dX->ResizeLike(Y);
+
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
CAFFE_ENFORCE_EQ(Y.size(), dY.size());
LeakyReluGradientKernel<<<
CAFFE_GET_BLOCKS(Y.size()),
template<>
bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
- auto* Y = Output(0);
+
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim32(0);
const int C = X.dim32(1);
const int H = X.dim32(2);
const int W = X.dim32(3);
const float* Xdata = X.data<float>();
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
float* Ydata = Y->template mutable_data<float>();
if (OutputSize() > 1) {
scale_ = Output(1);
template<>
bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
- auto* Y = Output(0);
+
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim32(0);
const int H = X.dim32(1);
const int W = X.dim32(2);
const int C = X.dim32(3);
const float* Xdata = X.data<float>();
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
float* Ydata = Y->template mutable_data<float>();
if (OutputSize() > 1) {
scale_ = Output(1);
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
- auto* dX = Output(0);
+
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim32(0);
const int C = X.dim32(1);
// long as the sizes check out.
DCHECK_EQ(X.size(), Y.size());
DCHECK_EQ(X.size(), dY.size());
- dX->ResizeLike(X);
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
const float* Xdata = X.data<float>();
const float* Ydata = Y.data<float>();
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
- auto* dX = Output(0);
+
DCHECK_EQ(X.ndim(), 4);
const int N = X.dim32(0);
const int H = X.dim32(1);
// long as the sizes check out.
DCHECK_EQ(X.size(), Y.size());
DCHECK_EQ(X.size(), dY.size());
- dX->ResizeLike(X);
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
if (!scale_) {
scale_ = &local_scale_tensor_;
}
auto& Y = Input(1);
auto& dY = Input(2);
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
LpPoolBackwardNCHW<float>
<<<CAFFE_GET_BLOCKS(X.size()),
auto& Y = Input(1);
auto& dY = Input(2);
CAFFE_ENFORCE_EQ(dY.ndim(), 4);
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
LpPoolBackwardNHWC<float>
<<<CAFFE_GET_BLOCKS(X.size()),
auto& X1 = Input(0);
auto& X2 = Input(1);
auto& Y = Input(2);
- auto* loss = Output(0);
+
CAFFE_ENFORCE(
X1.size() == X2.size(),
"The two inputs for computing ranking loss should have the same size.");
CAFFE_ENFORCE(
X1.size() == Y.size(),
"The input and label should have the same size.");
- loss->ResizeLike(X1);
+ auto* loss = Output(0, X1.sizes(), at::dtype<float>());
const float* X1data = X1.data<float>();
const float* X2data = X2.data<float>();
auto& X2 = Input(1);
auto& Y = Input(2);
auto& dOutput = Input(3);
- auto* dX1 = Output(0);
- auto* dX2 = Output(1);
- dX1->ResizeLike(X1);
- dX2->ResizeLike(X2);
+ auto* dX1 = Output(0, X1.sizes(), at::dtype<float>());
+ auto* dX2 = Output(1, X2.sizes(), at::dtype<float>());
const float* X1data = X1.data<float>();
const float* X2data = X2.data<float>();
template <>
bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformGeneral() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE_EQ(X.ndim(), 2);
int64_t N = X.dim32(0);
int64_t M = X.dim32(1);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
int64_t num_func_per_group;
int64_t num_group;
template <>
bool PiecewiseLinearTransformOp<float, CUDAContext>::TransformBinary() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE(X.ndim() == 1 || X.ndim() == 2);
int64_t N = X.dim32(0);
int64_t M = X.ndim() == 2 ? X.dim32(1) : 1;
CAFFE_ENFORCE(
M == 1 || M == 2,
"If binary is set to true, the input must be Nx2 or Nx1 tensor");
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
int64_t num_func_per_group;
int64_t num_group;
auto& X = Input(0);
auto& dY = Input(2);
CAFFE_ENFORCE_EQ(dY.dim32(1), X.dim32(1));
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
vector<int> dims(X.sizes().begin() + 2, X.sizes().end());
ConvPoolOpBase<CUDAContext>::ComputePads(dims);
switch (kernel_.size()) {
auto& dY = Input(2);
CAFFE_ENFORCE_EQ(X.ndim(), dY.ndim());
CAFFE_ENFORCE_EQ(X.dim32(X.ndim() - 1), dY.dim32(dY.ndim() - 1));
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
vector<int> dims(X.sizes().begin() + 1, X.sizes().end() - 1);
ConvPoolOpBase<CUDAContext>::ComputePads(dims);
switch (kernel_.size()) {
auto& Y = Input(1);
auto& dY = Input(2);
CAFFE_ENFORCE_EQ(dY.ndim(), X.ndim());
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
vector<int> dims(X.sizes().begin() + 2, X.sizes().end());
ConvPoolOpBase<CUDAContext>::ComputePads(dims);
switch (kernel_.size()) {
auto& Y = Input(1);
auto& dY = Input(2);
CAFFE_ENFORCE_EQ(dY.ndim(), X.ndim());
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
vector<int> dims(X.sizes().begin() + 1, X.sizes().end() - 1);
ConvPoolOpBase<CUDAContext>::ComputePads(dims);
switch (kernel_.size()) {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
- auto* dX = Output(0);
// cuDNN pooling support only 2 and 3 spatial dimensions.
CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5);
- dX->ResizeLike(X);
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
int N = 0, C = 0, H = 0, W = 0, D = 0;
int H_out = 0, W_out = 0, D_out = 0;
switch (order_) {
bool PReluOp<float, CUDAContext>::RunOnDevice() {
const auto& X = Input(0);
const auto& W = Input(1);
- auto* Y = Output(0);
- Y->ResizeLike(X);
+
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
const auto* Xdata = X.data<float>();
const auto* Wdata = W.data<float>();
auto* Ydata = Y->template mutable_data<float>();
auto& W = Input(3);
CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
- auto* dX = Output(0);
- auto* dW = Output(1);
DCHECK_EQ(dY.size(), Y.size());
- dX->ResizeLike(Y);
- dW->ResizeLike(W);
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
+ auto* dW = Output(1, W.sizes(), at::dtype<float>());
const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
const auto C_shared = (W.size() == 1);
auto& X = Input(0);
auto& dY = Input(1);
DCHECK_EQ(dY.size(), 1);
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<float>());
SumElementsGradientKernel<float>
<<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
auto& Y = Input(1);
auto& dY = Input(2);
- auto* dX = Output(0);
- dX->ResizeLike(X);
+ auto* dX = Output(0, X.sizes(), at::dtype<T>());
CAFFE_ENFORCE_EQ(X.ndim(), 3);
auto& R = Input(1); // RoIs
auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op
// (aka "gradOutput")
- auto* dX = Output(0); // Gradient of net w.r.t. input to
- // "forward" op (aka "gradInput")
- dX->ResizeLike(X);
+ auto* dX = Output(
+ 0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
+ // "forward" op (aka "gradInput")
// Must zero-out dX before accumulating gradients
// (TODO): Kaiming - is this safe?
auto& R = Input(1); // RoIs
auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op
// (aka "gradOutput")
- auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
- // (aka "gradInput")
- dX->ResizeLike(X);
+ auto* dX = Output(
+ 0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
+ // "forward" op (aka "gradInput")
// Must zero-out dX before accumulating gradients
math::Set<float, CUDAContext>(
auto& A = Input(2); // argmaxes
auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
// (aka "gradOutput")
- auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
- // (aka "gradInput")
- dX->ResizeLike(X);
+ auto* dX = Output(
+ 0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to
+ // "forward" op (aka "gradInput")
// Must zero-out dX before accumulating gradients
math::Set<float, CUDAContext>(
dX->size(), 0.f, dX->template mutable_data<float>(), &context_);
const auto& Y = Input(1);
const auto& dY = Input(2);
const auto& I = Input(3);
- auto* dX = Output(0);
- dX->ResizeLike(X);
+
+ auto* dX = Output(0, X.sizes(), at::dtype<T>());
const int M = X.dim32(0);
const int N = X.size_from_dim(1);
auto& dataInput = Input(3);
auto& indicesInput = Input(4);
- auto* weightGradsOutput = Output(1);
CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector");
CAFFE_ENFORCE_EQ(1, weightsInput.ndim(), "WEIGHTS must be a vector");
int output_0dim = indicesInput.dim(0);
shape[0] = output_0dim;
auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
- weightGradsOutput->ResizeLike(indicesInput);
+ auto* weightGradsOutput = Output(1, indicesInput.sizes(), at::dtype<T>());
T* out_data_grads = dataGradsOutput->template mutable_data<T>();
T* out_weight_grads = weightGradsOutput->template mutable_data<T>();
template <>
bool SeluOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE_GT(X.size(), 0);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
SeluKernel<float>
<<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
bool SeluGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
- auto* dX = Output(0);
+
CAFFE_ENFORCE_GT(Y.size(), 0);
CAFFE_ENFORCE_EQ(dY.size(), Y.size());
- dX->ResizeLike(Y);
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
SeluGradientKernel<float>
<<<CAFFE_GET_BLOCKS(Y.size()),
CAFFE_CUDA_NUM_THREADS,
bool SoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
- auto* P = Output(0); // Probabilities from softmax
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
const auto canonical_axis = X.canonical_axis_index(axis_);
int N, D;
N = X.size_to_dim(canonical_axis); // batch size
D = X.size_from_dim(canonical_axis);
- P->ResizeLike(X);
+
+ auto* P =
+ Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
+ total_weight_ptr_.Resize(1);
if (label_prob_mode_) {
CAFFE_ENFORCE_GE(T.ndim(), 2);
bool SpatialSoftmaxWithLossOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0); // Logits
auto& T = Input(1); // Labels / targets
- auto* P = Output(0); // Probabilities from softmax
const float* weights = (InputSize() > 2 ? Input(2).data<float>() : NULL);
int N, D;
N = X.dim32(0);
D = X.dim32(1);
- P->ResizeLike(X);
+
+ auto* P =
+ Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
ReinitializeTensor(&total_weight_ptr_, {1}, at::dtype<float>().device(CUDA));
+
CAFFE_ENFORCE_EQ(X.ndim(), 4);
CAFFE_ENFORCE_EQ(T.ndim(), 3);
CAFFE_ENFORCE_EQ(T.dim32(0), N);
template <>
bool SoftmaxOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* P = Output(0);
+
const auto canonical_axis = X.canonical_axis_index(axis_);
const int N = X.size_to_dim(canonical_axis);
const int D = X.size_from_dim(canonical_axis);
- P->ResizeLike(X);
+ auto* P = Output(0, X.sizes(), at::dtype<float>());
auto* P_data = P->mutable_data<float>();
if (N == 0) {
return true;
bool SoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
- auto* dX = Output(0);
+
const auto canonical_axis = Y.canonical_axis_index(axis_);
const int N = Y.size_to_dim(canonical_axis);
const int D = Y.size_from_dim(canonical_axis);
- dX->ResizeLike(Y);
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
auto* dX_data = dX->mutable_data<float>();
if (N == 0) {
return true;
template <>
bool SoftplusOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
+
DCHECK_GT(X.size(), 0);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
SoftplusKernel<float>
<<<CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
bool SoftplusGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
- auto* dX = Output(0);
+
DCHECK_GT(Y.size(), 0);
DCHECK_EQ(dY.size(), Y.size());
- dX->ResizeLike(Y);
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
SoftplusGradientKernel<float>
<<<CAFFE_GET_BLOCKS(Y.size()),
CAFFE_CUDA_NUM_THREADS,
bool StumpFuncOp<float, float, CUDAContext>::RunOnDevice() {
auto& in = Input(0);
const float* in_data = in.data<float>();
- auto* out = Output(0);
- out->ResizeLike(in);
+
+ auto* out = Output(0, in.sizes(), at::dtype<float>());
float* out_data = out->template mutable_data<float>();
StumpFuncKernel<<<CAFFE_GET_BLOCKS(in.size()), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
template <>
bool ThresholdedReluOp<float, CUDAContext>::RunOnDevice() {
auto& X = Input(0);
- auto* Y = Output(0);
+
CAFFE_ENFORCE_GT(X.size(), 0);
- Y->ResizeLike(X);
+ auto* Y = Output(0, X.sizes(), at::dtype<float>());
ThresholdedReluKernel<<<
CAFFE_GET_BLOCKS(X.size()),
CAFFE_CUDA_NUM_THREADS,
bool ThresholdedReluGradientOp<float, CUDAContext>::RunOnDevice() {
auto& Y = Input(0);
auto& dY = Input(1);
- auto* dX = Output(0);
+
CAFFE_ENFORCE_GT(Y.size(), 0);
CAFFE_ENFORCE_EQ(dY.size(), Y.size());
- dX->ResizeLike(Y);
+ auto* dX = Output(0, Y.sizes(), at::dtype<float>());
ThresholdedReluGradientKernel<<<
CAFFE_GET_BLOCKS(Y.size()),
CAFFE_CUDA_NUM_THREADS,
for (int i = 0; i < OutputSize(); i++) {
auto& input = Input(i + kInputStartOffset);
- auto* grad_input = Output(i);
- grad_input->ResizeLike(input);
+
+ auto* grad_input = Output(i, input.sizes(), at::dtype<float>());
MaxMinGradKernel<<<
CAFFE_GET_BLOCKS(input.size()),
CAFFE_CUDA_NUM_THREADS,