self.assertEqual(p.grad, torch.zeros_like(p.grad))
self.assertEqual(inp.grad, torch.zeros_like(inp))
+ def _test_module_empty_inputs(self, module, inputs):
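+ # Helper for the empty-batch tests below: run forward/backward on the given inputs
+ # and check that every parameter and input gradient is an all-zeros tensor of the
+ # expected shape (an empty batch contributes nothing, so accumulated grads are zero).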
+ for _inp in inputs:
+ _inp.requires_grad_(True)
+ out = module(*inputs)
+ gO = torch.rand_like(out)
+ out.backward(gO)
+
+ for p in module.parameters():
+ if p.requires_grad:
+ self.assertEqual(p.grad, torch.zeros_like(p.grad))
+
+ for _inp in inputs:
+ self.assertEqual(_inp.grad, torch.zeros_like(_inp))
+
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
@tf32_on_and_off()
self.assertEqual(inp2.grad, torch.zeros_like(inp2))
@onlyOnCPUAndCUDA
+ def test_TransformerEncoderLayer_empty(self, device):
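+ # Input is (batch, seq, feature) when batch_first=True and (seq, batch, feature)
+ # otherwise; either way the batch dimension is 0 here.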
+ for batch_first, input_shape in [(True, (0, 10, 512)),
+ (False, (10, 0, 512))]:
+ input = torch.rand(*input_shape, device=device)
+ encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device)
+ self._test_module_empty_input(encoder_layer, input, check_size=False)
+
+ @onlyOnCPUAndCUDA
+ def test_TransformerEncoder_empty(self, device):
+ for batch_first, input_shape in [(True, (0, 10, 512)),
+ (False, (10, 0, 512))]:
+ input = torch.rand(*input_shape, device=device)
+ encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device)
+ transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6).to(device)
+ self._test_module_empty_input(transformer_encoder, input, check_size=False)
+
+ @onlyOnCPUAndCUDA
+ def test_TransformerDecoderLayer_empty(self, device):
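+ # memory and tgt follow the same layout: batch-first shapes put the empty batch
+ # dimension first, seq-first shapes put it second.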
+ for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)),
+ (False, (10, 0, 512), (20, 0, 512))]:
+ memory = torch.rand(*memory_shape, device=device)
+ tgt = torch.rand(*tgt_shape, requires_grad=True, device=device)
+ decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device)
+ self._test_module_empty_inputs(decoder_layer, [tgt, memory])
+
+ @onlyOnCPUAndCUDA
+ def test_TransformerDecoder_empty(self, device):
+ for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)),
+ (False, (10, 0, 512), (20, 0, 512))]:
+ memory = torch.rand(*memory_shape, device=device)
+ tgt = torch.rand(*tgt_shape, requires_grad=True, device=device)
+ decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device)
+ transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6).to(device)
+ self._test_module_empty_inputs(transformer_decoder, [tgt, memory])
+
+ @onlyOnCPUAndCUDA
+ def test_Transformer_empty(self, device):
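+ # nn.Transformer defaults to batch_first=False, so src/tgt are (seq, batch, feature)
+ # with an empty batch dimension.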
+ for batch_first, src_shape, tgt_shape in [(True, (10, 0, 512), (20, 0, 512))]:
+ transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12).to(device)
+ src = torch.rand(*src_shape, requires_grad=True, device=device)
+ tgt = torch.rand(*tgt_shape, requires_grad=True, device=device)
+ self._test_module_empty_inputs(transformer_model, [src, tgt])
+
+ @onlyOnCPUAndCUDA
@dtypes(torch.float32, torch.complex64)
def test_ReflectionPad_empty(self, device, dtype):
for mod, inp in [
#
q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
if static_k is None:
- k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
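+ # Use the explicit source length instead of -1 (here and for v below): view()
+ # cannot infer a -1 dimension when the tensor has zero elements, which is the
+ # case for an empty batch.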
+ k = k.contiguous().view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
else:
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
assert static_k.size(0) == bsz * num_heads, \
f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
assert static_k.size(2) == head_dim, \
f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
k = static_k
if static_v is None:
- v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+ v = v.contiguous().view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
else:
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
assert static_v.size(0) == bsz * num_heads, \