From: Sasha Rush
Date: Thu, 17 Jan 2019 18:04:51 +0000 (-0800)
Subject: Unify the shape notation for all of the pytorch modules (#15741)
X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~1792
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da;p=platform%2Fupstream%2Fpytorch.git

Unify the shape notation for all of the pytorch modules (#15741)

Summary:
PR to update the shape notation for all of the torch.nn modules to take a
unified form. The goal is to make these definitions machine-readable and
checkable by unifying the style across all of the different modules.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15741

Differential Revision: D13709601

Pulled By: ezyang

fbshipit-source-id: fb89a03903fdf0cd0dcf76f3e469b8582b2f3634
---

diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index 91bc013..d4d151e 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -439,9 +439,9 @@ class GLU(Module):
         dim (int): the dimension on which to split the input. Default: -1

     Shape:
-        - Input: :math:`(*, N, *)` where `*` means, any number of additional
+        - Input: :math:`(\ast_1, N, \ast_2)` where :math:`\ast_1` and :math:`\ast_2` mean any number of additional
           dimensions
-        - Output: :math:`(*, N / 2, *)`
+        - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`

     Examples::

@@ -794,8 +794,9 @@ class Softmin(Module):
         \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)}

     Shape:
-        - Input: any shape
-        - Output: same as input
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input

     Arguments:
         dim (int): A dimension along which Softmin will be computed (so every slice
@@ -834,8 +835,9 @@ class Softmax(Module):
         \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}

     Shape:
-        - Input: any shape
-        - Output: same as input
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input

     Returns:
         a Tensor of the same dimension and shape as the input with
@@ -910,8 +912,9 @@ class LogSoftmax(Module):
         \text{LogSoftmax}(x_{i}) = \log\left(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} \right)

     Shape:
-        - Input: any shape
-        - Output: same as input
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input

     Arguments:
         dim (int): A dimension along which Softmax will be computed (so every slice
diff --git a/torch/nn/modules/adaptive.py b/torch/nn/modules/adaptive.py
index 55e1630..d3cfb7b 100644
--- a/torch/nn/modules/adaptive.py
+++ b/torch/nn/modules/adaptive.py
@@ -89,8 +89,8 @@ class AdaptiveLogSoftmaxWithLoss(Module):
     Shape:
         - input: :math:`(N, in\_features)`
         - target: :math:`(N)` where each value satisfies :math:`0 <= target[i] <= n\_classes`
-        - output: :math:`(N)`
-        - loss: ``Scalar``
+        - output1: :math:`(N)`
+        - output2: ``Scalar``

     .. _Efficient softmax approximation for GPUs:
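As a quick illustration of the unified notation above, a minimal sketch (an editorial example, not part of the patch; the tensor sizes are arbitrary) of how the GLU and Softmax shape entries read at runtime::

    import torch
    import torch.nn as nn

    glu = nn.GLU(dim=-1)
    x = torch.randn(4, 3, 10)    # (ast_1, N, ast_2) with N = 10 on the split dimension
    print(glu(x).shape)          # torch.Size([4, 3, 5]) -> (ast_1, M, ast_2) with M = N / 2

    softmax = nn.Softmax(dim=1)
    y = torch.randn(2, 5, 7)     # (*): any shape
    print(softmax(y).shape)      # torch.Size([2, 5, 7]) -> (*), same shape as the input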
diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py
index 7f8181e..e75ed99 100644
--- a/torch/nn/modules/container.py
+++ b/torch/nn/modules/container.py
@@ -41,6 +41,11 @@ class Sequential(Module):
                   ('conv2', nn.Conv2d(20,64,5)),
                   ('relu2', nn.ReLU())
                 ]))
+
+    Shape:
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Output: :math:`(*)`, same shape as the input
     """

     def __init__(self, *args):
diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py
index e9ec872..6dab40f 100644
--- a/torch/nn/modules/dropout.py
+++ b/torch/nn/modules/dropout.py
@@ -40,8 +40,8 @@ class Dropout(_DropoutNd):
         inplace: If set to ``True``, will do this operation in-place. Default: ``False``

     Shape:
-        - Input: `Any`. Input can be of any shape
-        - Output: `Same`. Output is of the same shape as input
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input

     Examples::

@@ -173,8 +173,8 @@ class AlphaDropout(_DropoutNd):
             in-place

     Shape:
-        - Input: `Any`. Input can be of any shape
-        - Output: `Same`. Output is of the same shape as input
+        - Input: :math:`(*)`. Input can be of any shape
+        - Output: :math:`(*)`. Output is of the same shape as input

     Examples::

diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py
index 657f382..dd8a673 100644
--- a/torch/nn/modules/linear.py
+++ b/torch/nn/modules/linear.py
@@ -19,10 +19,10 @@ class Linear(Module):
             Default: ``True``

     Shape:
-        - Input: :math:`(N, *, \text{in\_features})` where :math:`*` means any number of
-          additional dimensions
-        - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension
-          are the same shape as the input.
+        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
+          additional dimensions and :math:`H_{in} = \text{in\_features}`
+        - Output: :math:`(N, *, H_{out})` where all but the last dimension
+          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

     Attributes:
         weight: the learnable weights of the module of shape
@@ -85,11 +85,12 @@ class Bilinear(Module):
             Default: ``True``

     Shape:
-        - Input: :math:`(N, *, \text{in1\_features})`, :math:`(N, *, \text{in2\_features})`
-          where :math:`*` means any number of additional dimensions. All but the last
-          dimension of the inputs should be the same.
-        - Output: :math:`(N, *, \text{out\_features})` where all but the last dimension
-          are the same shape as the input.
+        - Input1: :math:`(N, *, H_{in1})` where :math:`H_{in1}=\text{in1\_features}` and
+          :math:`*` means any number of additional dimensions. All but the last dimension
+          of the inputs should be the same.
+        - Input2: :math:`(N, *, H_{in2})` where :math:`H_{in2}=\text{in2\_features}`.
+        - Output: :math:`(N, *, H_{out})` where :math:`H_{out}=\text{out\_features}`
+          and all but the last dimension are the same shape as the input.

     Attributes:
         weight: the learnable weights of the module of shape
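The Linear and Bilinear entries above are the template the rest of the notation follows, with named sizes such as :math:`H_{in}` bound to the constructor arguments. A minimal sketch (an editorial example, not part of the patch; the sizes are arbitrary)::

    import torch
    import torch.nn as nn

    linear = nn.Linear(in_features=8, out_features=3)
    x = torch.randn(32, 5, 8)          # (N, *, H_in) with H_in = in_features
    print(linear(x).shape)             # torch.Size([32, 5, 3]) -> (N, *, H_out)

    bilinear = nn.Bilinear(in1_features=8, in2_features=6, out_features=3)
    x1 = torch.randn(32, 5, 8)         # Input1: (N, *, H_in1)
    x2 = torch.randn(32, 5, 6)         # Input2: (N, *, H_in2), same leading dims as x1
    print(bilinear(x1, x2).shape)      # torch.Size([32, 5, 3]) -> (N, *, H_out)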
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 900756b..e0c0c99 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -270,6 +270,13 @@ class PoissonNLLLoss(_Loss):
         >>> target = torch.randn(5, 2)
         >>> output = loss(log_input, target)
         >>> output.backward()
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means any number of additional
+          dimensions
+        - Target: :math:`(N, *)`, same shape as the input
+        - Output: scalar by default. If `reduce` is ``False``, then :math:`(N, *)`,
+          the same shape as the input
     """
     __constants__ = ['log_input', 'full', 'eps', 'reduction']

@@ -350,10 +357,10 @@ class KLDivLoss(_Loss):

     Shape:
-        - input: :math:`(N, *)` where `*` means, any number of additional
+        - Input: :math:`(N, *)` where `*` means any number of additional
           dimensions
-        - target: :math:`(N, *)`, same shape as the input
-        - output: scalar by default. If `reduce` is ``False``, then :math:`(N, *)`,
+        - Target: :math:`(N, *)`, same shape as the input
+        - Output: scalar by default. If `reduce` is ``False``, then :math:`(N, *)`,
           the same shape as the input

     """
@@ -571,6 +578,8 @@ class BCEWithLogitsLoss(_Loss):
         - Input: :math:`(N, *)` where `*` means, any number of additional
           dimensions
         - Target: :math:`(N, *)`, same shape as the input
+        - Output: scalar. If `reduce` is ``False``, then :math:`(N, *)`, same
+          shape as input.

     Examples::

@@ -640,8 +649,9 @@ class HingeEmbeddingLoss(_Loss):
             specifying either of those two args will override :attr:`reduction`. Default: 'mean'

     Shape:
-        - Input: Tensor of arbitrary shape. The sum operation operates over all the elements.
-        - Target: Same shape as input.
+        - Input: :math:`(*)` where `*` means any number of dimensions. The sum operation
+          operates over all the elements.
+        - Target: :math:`(*)`, same shape as the input
         - Output: scalar. If reduce is ``False``, then same shape as the input
     """
     __constants__ = ['margin', 'reduction']
@@ -797,8 +807,9 @@ class SoftMarginLoss(_Loss):
             specifying either of those two args will override :attr:`reduction`. Default: 'mean'

     Shape:
-        - Input: Tensor of arbitrary shape.
-        - Target: Same shape as input.
+        - Input: :math:`(*)` where `*` means any number of additional
+          dimensions
+        - Target: :math:`(*)`, same shape as the input
         - Output: scalar. If reduce is ``False``, then same shape as the input
     """
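The loss entries follow the same pattern: Output is a scalar under the default reduction and has the input shape when reduction is disabled. A minimal sketch (an editorial example, not part of the patch; it uses the newer ``reduction='none'`` spelling, which corresponds to the ``reduce=False`` wording in the docstrings)::

    import torch
    import torch.nn as nn

    inp = torch.randn(4, 7)               # Input: (N, *)
    tgt = torch.empty(4, 7).random_(2)    # Target: (N, *), same shape as the input

    print(nn.BCEWithLogitsLoss()(inp, tgt).shape)                  # torch.Size([]) -> scalar
    print(nn.BCEWithLogitsLoss(reduction='none')(inp, tgt).shape)  # torch.Size([4, 7]) -> (N, *)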
diff --git a/torch/nn/modules/normalization.py b/torch/nn/modules/normalization.py
index 8ef72ca..a21b858 100644
--- a/torch/nn/modules/normalization.py
+++ b/torch/nn/modules/normalization.py
@@ -25,8 +25,8 @@ class LocalResponseNorm(Module):
         k: additive factor. Default: 1

     Shape:
-        - Input: :math:`(N, C, ...)`
-        - Output: :math:`(N, C, ...)` (same shape as input)
+        - Input: :math:`(N, C, *)`
+        - Output: :math:`(N, C, *)` (same shape as input)

     Examples::

@@ -188,8 +188,8 @@ class GroupNorm(Module):
             and zeros (for biases). Default: ``True``.

     Shape:
-        - Input: :math:`(N, num\_channels, *)`
-        - Output: :math:`(N, num\_channels, *)` (same shape as input)
+        - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}`
+        - Output: :math:`(N, C, *)` (same shape as input)

     Examples::

diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py
index 971a226..79471a9 100644
--- a/torch/nn/modules/pixelshuffle.py
+++ b/torch/nn/modules/pixelshuffle.py
@@ -19,8 +19,9 @@ class PixelShuffle(Module):
         upscale_factor (int): factor to increase spatial resolution by

     Shape:
-        - Input: :math:`(N, C \times \text{upscale_factor}^2, H, W)`
-        - Output: :math:`(N, C, H \times \text{upscale_factor}, W \times \text{upscale_factor})`
+        - Input: :math:`(N, L, H_{in}, W_{in})` where :math:`L=C \times \text{upscale\_factor}^2`
+        - Output: :math:`(N, C, H_{out}, W_{out})` where :math:`H_{out} = H_{in} \times \text{upscale\_factor}`
+          and :math:`W_{out} = W_{in} \times \text{upscale\_factor}`

     Examples::

diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py
index 6578f8f..cc77dde 100644
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -308,6 +308,18 @@ class RNN(RNNBase):
           Like *output*, the layers can be separated using
          ``h_n.view(num_layers, num_directions, batch, hidden_size)``.

+    Shape:
+        - Input1: :math:`(L, N, H_{in})` tensor containing input features where
+          :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length.
+        - Input2: :math:`(S, N, H_{out})` tensor containing the initial hidden state
+          for each element in the batch, where :math:`H_{out}=\text{hidden\_size}` and
+          :math:`S=\text{num\_layers} * \text{num\_directions}`. If the RNN is bidirectional,
+          num_directions should be 2, else it should be 1. Defaults to zero if not provided.
+        - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}`
+        - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state
+          for each element in the batch
+
     Attributes:
         weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
             of shape `(hidden_size * input_size)` for `k = 0`. Otherwise, the shape is
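The recurrent modules use numbered inputs and outputs; the GRU hunk below repeats the same convention. A minimal sketch for RNN (an editorial example, not part of the patch; the sizes are arbitrary)::

    import torch
    import torch.nn as nn

    L, N = 5, 3                                   # sequence length, batch size
    rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2, bidirectional=True)
    S = rnn.num_layers * 2                        # S = num_layers * num_directions

    x = torch.randn(L, N, 10)                     # Input1: (L, N, H_in)
    h0 = torch.zeros(S, N, 20)                    # Input2: (S, N, H_out)
    output, h_n = rnn(x, h0)
    print(output.shape)                           # torch.Size([5, 3, 40]) -> (L, N, H_all)
    print(h_n.shape)                              # torch.Size([4, 3, 20]) -> (S, N, H_out)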
@@ -525,6 +537,18 @@ class GRU(RNNBase):
           Like *output*, the layers can be separated using
           ``h_n.view(num_layers, num_directions, batch, hidden_size)``.

+    Shape:
+        - Input1: :math:`(L, N, H_{in})` tensor containing input features where
+          :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length.
+        - Input2: :math:`(S, N, H_{out})` tensor containing the initial hidden state
+          for each element in the batch, where :math:`H_{out}=\text{hidden\_size}` and
+          :math:`S=\text{num\_layers} * \text{num\_directions}`. If the RNN is bidirectional,
+          num_directions should be 2, else it should be 1. Defaults to zero if not provided.
+        - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}`
+        - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state
+          for each element in the batch
+
     Attributes:
         weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
             (W_ir|W_iz|W_in), of shape `(3*hidden_size x input_size)`
@@ -632,6 +656,15 @@ class RNNCell(RNNCellBase):
     - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
       for each element in the batch

+    Shape:
+        - Input1: :math:`(N, H_{in})` tensor containing input features where
+          :math:`H_{in}` = `input_size`
+        - Input2: :math:`(N, H_{out})` tensor containing the initial hidden
+          state for each element in the batch where :math:`H_{out}` = `hidden_size`.
+          Defaults to zero if not provided.
+        - Output: :math:`(N, H_{out})` tensor containing the next hidden state
+          for each element in the batch
+
     Attributes:
         weight_ih: the learnable input-hidden weights, of shape
            `(hidden_size x input_size)`
@@ -802,6 +835,15 @@ class GRUCell(RNNCellBase):
     - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
       for each element in the batch

+    Shape:
+        - Input1: :math:`(N, H_{in})` tensor containing input features where
+          :math:`H_{in}` = `input_size`
+        - Input2: :math:`(N, H_{out})` tensor containing the initial hidden
+          state for each element in the batch where :math:`H_{out}` = `hidden_size`.
+          Defaults to zero if not provided.
+        - Output: :math:`(N, H_{out})` tensor containing the next hidden state
+          for each element in the batch
+
     Attributes:
         weight_ih: the learnable input-hidden weights, of shape
            `(3*hidden_size x input_size)`
diff --git a/torch/nn/modules/sparse.py b/torch/nn/modules/sparse.py
index 38670c6..054b0d8 100644
--- a/torch/nn/modules/sparse.py
+++ b/torch/nn/modules/sparse.py
@@ -33,9 +33,8 @@ class Embedding(Module):
                                     initialized from :math:`\mathcal{N}(0, 1)`

     Shape:
-
-        - Input: LongTensor of arbitrary shape containing the indices to extract
-        - Output: `(*, embedding_dim)`, where `*` is the input shape
+        - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
+        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`

     .. note::
         Keep in mind that only a limited number of optimizers support