weight = view4d(weight);
}
- at::MemoryFormat cudnn_memory_format = at::MemoryFormat::Contiguous;
- if (cudnn_conv_use_channels_last(input, weight)) {
- cudnn_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
+ at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous;
+
+ if (detail::getCUDAHooks().compiledWithCuDNN() && cudnn_conv_use_channels_last(input, weight)) {
+ backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
+ }
+
+ if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
+ backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast;
}
Tensor output;
auto dilation = params.dilation;
if (params.use_cudnn_depthwise(input, weight)) {
output = at::cudnn_convolution(
- input.contiguous(cudnn_memory_format), weight,
+ input.contiguous(backend_memory_format), weight,
padding, stride, dilation, params.groups, params.benchmark, params.deterministic, params.allow_tf32);
if (bias.defined()) {
output.add_(reshape_bias(input.dim(), bias));
} else if (params.use_miopen(input, weight, bias.defined())){
output = at::miopen_depthwise_convolution(
- input.contiguous(), weight, bias,
+ input.contiguous(backend_memory_format), weight, bias,
padding, stride, dilation, params.groups, params.benchmark, params.deterministic);
} else {
if (input.ndimension() == 4) {
if (params.transposed) {
output = at::cudnn_convolution_transpose(
- input.contiguous(cudnn_memory_format), weight,
+ input.contiguous(backend_memory_format), weight,
params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, params.allow_tf32);
if (bias.defined()) {
output.add_(reshape_bias(input.dim(), bias));
}
} else {
output = at::cudnn_convolution(
- input.contiguous(cudnn_memory_format), weight,
+ input.contiguous(backend_memory_format), weight,
params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic, params.allow_tf32);
if (bias.defined()) {
output.add_(reshape_bias(input.dim(), bias));
if (params.transposed) {
output = at::miopen_convolution_transpose(
- input.contiguous(), weight, bias,
+ input.contiguous(backend_memory_format), weight, bias,
params.padding, params.output_padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
} else {
output = at::miopen_convolution(
- input.contiguous(), weight, bias,
+ input.contiguous(backend_memory_format), weight, bias,
params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic);
}
} else if (params.use_mkldnn(input, weight)) {
checkSize(c, bias, { output->size(output_channels_dim) });
TensorDescriptor bdesc, odesc;
- bdesc.set(bias->expand({1, bias->size(0)}), output->dim());
+
+ auto memory_format = output->suggest_memory_format();
+
+ std::vector<int64_t> shape( output->dim(), 1);
+ shape[output_channels_dim] = -1;
+ at::Tensor bias_contig = bias->reshape(shape).contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ bias_contig.resize_(bias_contig.sizes(), memory_format );
+
+ // TODO: Workaround since MIOpen does not support NHWC bias
+ // See #64426
+ output->add_( bias_contig );
+
+ /* MIOpen does not support NHWC bias; activate once support is added.
+ bdesc.set( bias_contig );
odesc.set(*output);
auto handle = getMiopenHandle();
MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->data_ptr(),
&zero, odesc.desc(), output->data_ptr()));
+ */
}
// see NOTE [ Convolution design ] in src/Aten/native/cudnn/Conv.cpp
args.handle = getMiopenHandle();
setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic);
args.idesc.set(input);
- args.wdesc.set(weight);
+ args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
checkAllSameType(c, {input, weight});
checkAllSameGPU(c, {input, weight});
- auto output_t = at::empty(
+ auto memory_format = at::MemoryFormat::Contiguous;
+ if (miopen_conv_use_channels_last(*input, *weight)) {
+ memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ }
+
+ auto output_t = at::native::empty_cuda(
conv_output_size(input->sizes(), weight->sizes(),
padding, stride, dilation),
- input->options());
+ /*dtype=*/input->scalar_type(),
+ /*layout=*/c10::nullopt,
+ /*device=*/kCUDA,
+ /*pin_memory=*/c10::nullopt,
+ /*memory_format=*/memory_format);
if (output_t.numel() == 0) {
return output_t;
convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups);
// See #4500
- Tensor weight_contig = weight->contiguous();
+ Tensor weight_contig = weight->contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ weight_contig.resize_(weight_contig.sizes(), memory_format);
+ Tensor input_contig = input->contiguous(memory_format);
+ input_contig.resize_(input_contig.sizes(), memory_format);
+
+
raw_miopen_convolution_forward_out(
- *output, *input, weight_contig,
+ *output, input_contig, weight_contig,
padding, stride, dilation, groups, benchmark, deterministic);
return *output;
args.handle = getMiopenHandle();
setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic);
args.idesc.set(input);
- args.wdesc.set(weight);
+ args.wdesc.set(weight, input.suggest_memory_format(), 0);
args.odesc.set(output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
checkAllSameType(c, {input, weight});
checkAllSameGPU(c, {input, weight});
- auto output_t = at::empty(
+ auto memory_format = at::MemoryFormat::Contiguous;
+ if (miopen_conv_use_channels_last(*input, *weight)) {
+ memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ }
+
+ auto output_t = at::native::empty_cuda(
conv_output_size(input->sizes(), weight->sizes(),
padding, stride, dilation),
- input->options());
+ /*dtype=*/input->scalar_type(),
+ /*layout=*/c10::nullopt,
+ /*device=*/kCUDA,
+ /*pin_memory=*/c10::nullopt,
+ /*memory_format=*/memory_format);
TensorArg output{ output_t, "result", 0 };
convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups);
- Tensor weight_contig = weight->contiguous();
+ // See #4500
+ Tensor weight_contig = weight->contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ weight_contig.resize_(weight_contig.sizes(), memory_format);
+ Tensor input_contig = input->contiguous(memory_format);
+ input_contig.resize_(input_contig.sizes(), memory_format);
raw_miopen_depthwise_convolution_forward_out(
- *output, *input, weight_contig,
+ *output, input_contig, weight_contig,
padding, stride, dilation, groups, benchmark, deterministic);
return *output;
args.handle = getMiopenHandle();
setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic);
args.idesc.set(grad_input);
- args.wdesc.set(weight);
+ args.wdesc.set(weight, grad_output.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
checkAllSameType(c, {grad_output, weight});
checkAllSameGPU(c, {grad_output, weight});
- auto grad_input_t = at::empty(input_size, grad_output->options());
+ auto memory_format = at::MemoryFormat::Contiguous;
+ if (miopen_conv_use_channels_last(*grad_output, *weight)) {
+ memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ }
+
+ auto grad_input_t = at::native::empty_cuda(
+ input_size,
+ /*dtype=*/grad_output->scalar_type(),
+ /*layout=*/c10::nullopt,
+ /*device=*/kCUDA,
+ /*pin_memory=*/c10::nullopt,
+ /*memory_format=*/memory_format);
// Avoid "grad_input" when this is being used as transposed convolution
TensorArg grad_input{ grad_input_t, "result", 0 };
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
// See #4500
- Tensor weight_contig = weight->contiguous();
+ Tensor weight_contig = weight->contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ weight_contig.resize_(weight_contig.sizes(), memory_format);
+
+ Tensor grad_output_contig = grad_output->contiguous(memory_format);
+ grad_output_contig.resize_(grad_output_contig.sizes(), memory_format);
raw_miopen_convolution_backward_input_out(
- *grad_input, *grad_output, weight_contig,
+ *grad_input, grad_output_contig, weight_contig,
padding, stride, dilation, groups, benchmark, deterministic);
return *grad_input;
args.handle = getMiopenHandle();
setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic);
args.idesc.set(grad_input);
- args.wdesc.set(weight);
+ args.wdesc.set(weight, grad_output.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
checkAllSameType(c, {grad_output, weight});
checkAllSameGPU(c, {grad_output, weight});
- auto grad_input_t = at::empty(input_size, grad_output->options());
+ auto memory_format = at::MemoryFormat::Contiguous;
+ if (miopen_conv_use_channels_last(*grad_output, *weight)) {
+ memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ }
+
+ auto grad_input_t = at::native::empty_cuda(
+ input_size,
+ /*dtype=*/grad_output->scalar_type(),
+ /*layout=*/c10::nullopt,
+ /*device=*/kCUDA,
+ /*pin_memory=*/c10::nullopt,
+ /*memory_format=*/memory_format);
TensorArg grad_input{ grad_input_t, "result", 0 };
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
- Tensor weight_contig = weight->contiguous();
+ // See #4500
+ Tensor weight_contig = weight->contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ weight_contig.resize_(weight_contig.sizes(), memory_format);
+
+ Tensor grad_output_contig = grad_output->contiguous(memory_format);
+ grad_output_contig.resize_(grad_output_contig.sizes(), memory_format);
raw_miopen_depthwise_convolution_backward_input_out(
- *grad_input, *grad_output, weight_contig,
+ *grad_input, grad_output_contig, weight_contig,
padding, stride, dilation, groups, benchmark, deterministic);
return *grad_input;
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
bool benchmark, bool deterministic, std::array<bool,3> output_mask) {
- Tensor grad_output = grad_output_t.contiguous();
+ Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
Tensor grad_input, grad_weight, grad_bias;
if (output_mask[0]) {
args.handle = getMiopenHandle();
setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic);
args.idesc.set(input);
- args.wdesc.set(grad_weight);
+ args.wdesc.set(grad_weight, input.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
checkAllSameType(c, {grad_output, input});
checkAllSameGPU(c, {grad_output, input});
- auto grad_weight_t = at::empty(weight_size, grad_output->options());
+ auto memory_format = at::MemoryFormat::Contiguous;
+ if (miopen_conv_use_channels_last(*input, *grad_output)) {
+ memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ }
+
+ Tensor grad_output_contig_t = grad_output->contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format);
+ TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 };
+
+ Tensor input_contig_t = input->contiguous(memory_format);
+ input_contig_t.resize_(input_contig_t.sizes(), memory_format);
+ TensorArg input_contig{ input_contig_t, "input", 2};
+
+ auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format);
// For uniformity with everything else, although it seems grad_weight
// would be unambiguous too.
TensorArg grad_weight{ grad_weight_t, "result", 0 };
- convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
+ convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups);
raw_miopen_convolution_backward_weight_out(
- *grad_weight, *grad_output, *input,
+ *grad_weight, *grad_output_contig, *input_contig,
padding, stride, dilation, groups, benchmark, deterministic);
return grad_weight_t;
args.handle = getMiopenHandle();
setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic);
args.idesc.set(input);
- args.wdesc.set(grad_weight);
+ args.wdesc.set(grad_weight, input.suggest_memory_format(), 0);
args.odesc.set(grad_output);
args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
checkAllSameType(c, {grad_output, input});
checkAllSameGPU(c, {grad_output, input});
- auto grad_weight_t = at::empty(weight_size, grad_output->options());
+ auto memory_format = at::MemoryFormat::Contiguous;
+ if (miopen_conv_use_channels_last(*input, *grad_output)) {
+ memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ }
+
+ Tensor grad_output_contig_t = grad_output->contiguous(memory_format);
+ // Make sure that NC11 strides follow formula
+ grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format);
+ TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 };
+
+ Tensor input_contig_t = input->contiguous(memory_format);
+ input_contig_t.resize_(input_contig_t.sizes(), memory_format);
+ TensorArg input_contig{ input_contig_t, "input", 2};
+
+ auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format);
// For uniformity with everything else, although it seems grad_weight
// would be unambiguous too.
TensorArg grad_weight{ grad_weight_t, "result", 0 };
- convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
+ convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups);
raw_miopen_depthwise_convolution_backward_weight_out(
- *grad_weight, *grad_output, *input,
+ *grad_weight, *grad_output_contig, *input_contig,
padding, stride, dilation, groups, benchmark, deterministic);
return grad_weight_t;
{
TensorArg grad_output{ grad_output_t, "grad_output", 1 };
+ // TODO: Workaround since MIOpen does not support NHWC bias
+ // See #64426
+ std::vector<int64_t> discard_dims;
+ for( int i = 0; i < grad_output_t.dim(); i++ ) {
+ if(i != output_channels_dim ) {
+ discard_dims.push_back(i);
+ }
+ }
+
+ Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) );
+ if( outputBias.dim() == 0 ) {
+ // always return a tensor of shape [_]
+ return outputBias.unsqueeze(0);
+ }
+ else {
+ return outputBias;
+ }
+
+/* MIOpen does not support NHWC bias. Activate once support is added.
auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options());
TensorArg grad_bias{ grad_bias_t, "result", 0 };
MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(),
&zero, bdesc.desc(), grad_bias->data_ptr()));
return *grad_bias;
+*/
}
from torch.nn.parallel._functions import Broadcast
from torch.testing._internal.common_dtype import integral_types, get_all_fp_dtypes, get_all_math_dtypes
from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
- TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \
+ skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \
get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \
ALL_TENSORTYPES2, suppress_warnings, TemporaryFileName, TEST_WITH_UBSAN, IS_PPC
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
ctcloss_reference, new_module_tests, single_batch_reference_fn
from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
dtypesIfCUDA, precisionOverride, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
- skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, onlyOnCPUAndCUDA, \
- deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta
+ skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, skipCUDAIfRocmVersionLessThan, skipCUDAIfNotMiopenSuggestNHWC, \
+ onlyOnCPUAndCUDA, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta
from torch.nn import MultiheadAttention
from hypothesis import given
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
- @skipIfRocm
+ @skipIfRocmVersionLessThan((4, 3))
+ @skipIfNotMiopenSuggestNHWC
def test_grouped_conv_cudnn_nhwc_support(self):
# in order to catch the holes in grouped convolution nhwc support for earlier cudnn versions
input = torch.randn((16, 16, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
weight = torch.randn((8, 4, 3, 3), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
- out = torch.cudnn_convolution(input, weight, None, (1, 1), (1, 1), (1, 1), 4, False, False)
+ out = torch.convolution(input, weight, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 4)
input = torch.randn((16, 8, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
- out = torch.cudnn_convolution_transpose(input, weight, None, (1, 1), (0, 0), (1, 1), (1, 1), 4, False, False)
+ out_transpose = torch.convolution(input, weight, None, (1, 1), (1, 1), (1, 1), True, (0, 0), 4)
@unittest.expectedFailure
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
self._test_bfloat16_ops(torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=0.05, scale_factor=1000.0)
@onlyCUDA
- @skipCUDAIfRocm
+ @skipCUDAIfRocmVersionLessThan((4, 3))
+ @skipCUDAIfNotMiopenSuggestNHWC
@skipCUDAIfCudnnVersionLessThan(7603)
@dtypes(torch.half, torch.float)
def test_conv_cudnn_nhwc(self, device, dtype):
ref_out, input_format, w_f, g_f, output_format)
@onlyCUDA
- @skipCUDAIfRocm
+ @skipCUDAIfRocmVersionLessThan((4, 3))
+ @skipCUDAIfNotMiopenSuggestNHWC
@skipCUDAIfCudnnVersionLessThan(7603)
@tf32_on_and_off(0.05)
def test_conv_cudnn_mismatch_memory_format(self, device):