Correct recurrent/linear/dropout/sparse layers docstrings

author ZhuBaohe <ehoabuhz@gmail.com>

Tue, 19 Feb 2019 13:17:23 +0000 (05:17 -0800)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Tue, 19 Feb 2019 13:23:04 +0000 (05:23 -0800)
author ZhuBaohe <ehoabuhz@gmail.com>
Tue, 19 Feb 2019 13:17:23 +0000 (05:17 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Tue, 19 Feb 2019 13:23:04 +0000 (05:23 -0800)
diff --git a/torch/nn/functional.py b/torch/nn/functional.py

index 3b4a201..131b820 100644 (file)
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -845,8 +845,8 @@ def dropout2d(input, p=0.5, training=True, inplace=False):
      Randomly zero out entire channels (a channel is a 2D feature map,
      e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
      batched input is a 2D tensor :math:`\text{input}[i, j]`) of the input tensor.
-    Each channel will be zeroed out independently on every forward call.
-    with probability :attr:`p` using samples from a Bernoulli distribution.
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
  
      See :class:`~torch.nn.Dropout2d` for details.
  
@@ -870,8 +870,8 @@ def dropout3d(input, p=0.5, training=True, inplace=False):
      Randomly zero out entire channels (a channel is a 3D feature map,
      e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
      batched input is a 3D tensor :math:`\text{input}[i, j]`) of the input tensor.
-    Each channel will be zeroed out independently on every forward call.
-    with probability :attr:`p` using samples from a Bernoulli distribution.
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
  
      See :class:`~torch.nn.Dropout3d` for details.
  
@@ -1504,10 +1504,11 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.,
  def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2,
                    scale_grad_by_freq=False, mode='mean', sparse=False):
      # type: (Tensor, Tensor, Optional[Tensor], Optional[float], float, bool, str, bool) -> Tensor
-    r"""Computes sums, means or maxes of 'bags' of embeddings, without instantiating the
+    r"""Computes sums, means or maxes of `bags` of embeddings, without instantiating the
      intermediate embeddings.
  
      See :class:`torch.nn.EmbeddingBag` for more details.
+
      .. include:: cuda_deterministic_backward.rst
  
      Args:
@@ -1534,25 +1535,25 @@ def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2,
  
          - :attr:`input` (LongTensor) and :attr:`offsets` (LongTensor, optional)
  
-          - If :attr:`input` is 2D of shape ``B x N``,
+          - If :attr:`input` is 2D of shape `(B, N)`,
  
              it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and
              this will return ``B`` values aggregated in a way depending on the :attr:`mode`.
              :attr:`offsets` is ignored and required to be ``None`` in this case.
  
-          - If :attr:`input` is 1D of shape ``N``,
+          - If :attr:`input` is 1D of shape `(N)`,
  
              it will be treated as a concatenation of multiple bags (sequences).
              :attr:`offsets` is required to be a 1D tensor containing the
              starting index positions of each bag in :attr:`input`. Therefore,
-            for :attr:`offsets` of shape ``B``, :attr:`input` will be viewed as
+            for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as
              having ``B`` bags. Empty bags (i.e., having 0-length) will have
              returned vectors filled by zeros.
  
          - :attr:`weight` (Tensor): the learnable weights of the module of
-          shape ``(num_embeddings x embedding_dim)``
+          shape `(num_embeddings, embedding_dim)`
  
-        - :attr:`output`: aggregated embedding values of shape ``B x embedding_dim``
+        - :attr:`output`: aggregated embedding values of shape `(B, embedding_dim)`
  
      Examples::
  
diff --git a/torch/nn/modules/distance.py b/torch/nn/modules/distance.py

index 45e4ae7..43948b0 100644 (file)
--- a/torch/nn/modules/distance.py
+++ b/torch/nn/modules/distance.py
@@ -10,7 +10,7 @@ class PairwiseDistance(Module):
      Computes the batchwise pairwise distance between vectors :math:`v_1`, :math:`v_2` using the p-norm:
  
      .. math ::
-        \Vert x \Vert _p := \left( \sum_{i=1}^n  \vert x_i \vert ^ p \right) ^ {1/p}
+        \Vert x \Vert _p = \left( \sum_{i=1}^n  \vert x_i \vert ^ p \right) ^ {1/p}
  
      Args:
          p (real): the norm degree. Default: 2
diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py

index 6dab40f..7ceca17 100644 (file)
--- a/torch/nn/modules/dropout.py
+++ b/torch/nn/modules/dropout.py
@@ -63,8 +63,8 @@ class Dropout2d(_DropoutNd):
      r"""Randomly zero out entire channels (a channel is a 2D feature map,
      e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
      batched input is a 2D tensor :math:`\text{input}[i, j]`).
-    Each channel will be zeroed out independently on every forward call.
-    with probability :attr:`p` using samples from a Bernoulli distribution.
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
  
      Usually the input comes from :class:`nn.Conv2d` modules.
  
@@ -107,8 +107,8 @@ class Dropout3d(_DropoutNd):
      r"""Randomly zero out entire channels (a channel is a 3D feature map,
      e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
      batched input is a 3D tensor :math:`\text{input}[i, j]`).
-    Each channel will be zeroed out independently on every forward call.
-    with probability :attr:`p` using samples from a Bernoulli distribution.
+    Each channel will be zeroed out independently on every forward call with
+    probability :attr:`p` using samples from a Bernoulli distribution.
  
      Usually the input comes from :class:`nn.Conv3d` modules.
  
diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py

index dd8a673..442034e 100644 (file)
--- a/torch/nn/modules/linear.py
+++ b/torch/nn/modules/linear.py
@@ -15,7 +15,7 @@ class Linear(Module):
      Args:
          in_features: size of each input sample
          out_features: size of each output sample
-        bias: If set to False, the layer will not learn an additive bias.
+        bias: If set to ``False``, the layer will not learn an additive bias.
              Default: ``True``
  
      Shape:
@@ -94,10 +94,10 @@ class Bilinear(Module):
  
      Attributes:
          weight: the learnable weights of the module of shape
-            :math:`(\text{out\_features} x \text{in1\_features} x \text{in2\_features})`.
+            :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`.
              The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
              :math:`k = \frac{1}{\text{in1\_features}}`
-        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`
+        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                  If :attr:`bias` is ``True``, the values are initialized from
                  :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
                  :math:`k = \frac{1}{\text{in1\_features}}`
diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py

index 8e90363..ec7155d 100644 (file)
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -259,12 +259,12 @@ class RNN(RNNBase):
      function:
  
      .. math::
-        h_t = \text{tanh}(w_{ih} x_t + b_{ih} + w_{hh} h_{(t-1)} + b_{hh})
+        h_t = \text{tanh}(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})
  
      where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
      the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
      previous layer at time `t-1` or the initial hidden state at time `0`.
-    If :attr:`nonlinearity` is `'relu'`, then `ReLU` is used instead of `tanh`.
+    If :attr:`nonlinearity` is ``'relu'``, then `ReLU` is used instead of `tanh`.
  
      Args:
          input_size: The number of expected features in the input `x`
@@ -273,7 +273,7 @@ class RNN(RNNBase):
              would mean stacking two RNNs together to form a `stacked RNN`,
              with the second RNN taking in outputs of the first RNN and
              computing the final results. Default: 1
-        nonlinearity: The non-linearity to use. Can be either 'tanh' or 'relu'. Default: 'tanh'
+        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
          bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
              Default: ``True``
          batch_first: If ``True``, then the input and output tensors are provided
@@ -296,16 +296,16 @@ class RNN(RNNBase):
  
      Outputs: output, h_n
          - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
-          containing the output features (`h_k`) from the last layer of the RNN,
-          for each `k`.  If a :class:`torch.nn.utils.rnn.PackedSequence` has
+          containing the output features (`h_t`) from the last layer of the RNN,
+          for each `t`.  If a :class:`torch.nn.utils.rnn.PackedSequence` has
            been given as the input, the output will also be a packed sequence.
  
            For the unpacked case, the directions can be separated
            using ``output.view(seq_len, batch, num_directions, hidden_size)``,
            with forward and backward being direction `0` and `1` respectively.
            Similarly, the directions can be separated in the packed case.
-        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
-          containing the hidden state for `k = seq_len`.
+        - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
+          containing the hidden state for `t = seq_len`.
  
            Like *output*, the layers can be separated using
            ``h_n.view(num_layers, num_directions, batch, hidden_size)``.
@@ -324,10 +324,10 @@ class RNN(RNNBase):
  
      Attributes:
          weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
-            of shape `(hidden_size * input_size)` for `k = 0`. Otherwise, the shape is
-            `(hidden_size * hidden_size)`
+            of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is
+            `(hidden_size, num_directions * hidden_size)`
          weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
-            of shape `(hidden_size * hidden_size)`
+            of shape `(hidden_size, hidden_size)`
          bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
              of shape `(hidden_size)`
          bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
@@ -386,11 +386,11 @@ class LSTM(RNNBase):
      is the hidden state of the layer at time `t-1` or the initial hidden
      state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
      :math:`o_t` are the input, forget, cell, and output gates, respectively.
-    :math:`\sigma` is the sigmoid function.
+    :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
  
-    In a multilayer LSTM, the input :math:`i^{(l)}_t` of the :math:`l` -th layer
+    In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
      (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
-    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)_t}` is a Bernoulli random
+    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
      variable which is :math:`0` with probability :attr:`dropout`.
  
      Args:
@@ -417,7 +417,7 @@ class LSTM(RNNBase):
            :func:`torch.nn.utils.rnn.pack_sequence` for details.
          - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
            containing the initial hidden state for each element in the batch.
-          If the RNN is bidirectional, num_directions should be 2, else it should be 1.
+          If the LSTM is bidirectional, num_directions should be 2, else it should be 1.
          - **c_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
            containing the initial cell state for each element in the batch.
  
@@ -427,7 +427,7 @@ class LSTM(RNNBase):
      Outputs: output, (h_n, c_n)
          - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
            containing the output features `(h_t)` from the last layer of the LSTM,
-          for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
+          for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
            given as the input, the output will also be a packed sequence.
  
            For the unpacked case, the directions can be separated
@@ -439,14 +439,15 @@ class LSTM(RNNBase):
  
            Like *output*, the layers can be separated using
            ``h_n.view(num_layers, num_directions, batch, hidden_size)`` and similarly for *c_n*.
-        - **c_n** (num_layers * num_directions, batch, hidden_size): tensor
-          containing the cell state for `t = seq_len`
+        - **c_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
+          containing the cell state for `t = seq_len`.
  
      Attributes:
          weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
-            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size x input_size)`
+            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
+            Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`
          weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
-            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size x hidden_size)`
+            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`
          bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
              `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
          bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
@@ -482,7 +483,7 @@ class GRU(RNNBase):
          \begin{array}{ll}
              r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
              z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
-            n_t = \tanh(W_{in} x_t + b_{in} + r_t (W_{hn} h_{(t-1)}+ b_{hn})) \\
+            n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
              h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}
          \end{array}
  
@@ -490,11 +491,11 @@ class GRU(RNNBase):
      at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
      at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
      :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
-    :math:`\sigma` is the sigmoid function.
+    :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
  
-    In a multilayer GRU, the input :math:`i^{(l)}_t` of the :math:`l` -th layer
+    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
      (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
-    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)_t}` is a Bernoulli random
+    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
      variable which is :math:`0` with probability :attr:`dropout`.
  
      Args:
@@ -526,7 +527,7 @@ class GRU(RNNBase):
      Outputs: output, h_n
          - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
            containing the output features h_t from the last layer of the GRU,
-          for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
+          for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
            given as the input, the output will also be a packed sequence.
            For the unpacked case, the directions can be separated
            using ``output.view(seq_len, batch, num_directions, hidden_size)``,
@@ -553,9 +554,10 @@ class GRU(RNNBase):
  
      Attributes:
          weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
-            (W_ir|W_iz|W_in), of shape `(3*hidden_size x input_size)`
+            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
+            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
          weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
-            (W_hr|W_hz|W_hn), of shape `(3*hidden_size x hidden_size)`
+            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
          bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
              (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
          bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
@@ -637,7 +639,7 @@ class RNNCell(RNNCellBase):
  
      .. math::
  
-        h' = \tanh(w_{ih} x + b_{ih}  +  w_{hh} h + b_{hh})
+        h' = \tanh(W_{ih} x + b_{ih}  +  W_{hh} h + b_{hh})
  
      If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh.
  
@@ -646,7 +648,7 @@ class RNNCell(RNNCellBase):
          hidden_size: The number of features in the hidden state `h`
          bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
              Default: ``True``
-        nonlinearity: The non-linearity to use. Can be either 'tanh' or 'relu'. Default: 'tanh'
+        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
  
      Inputs: input, hidden
          - **input** of shape `(batch, input_size)`: tensor containing input features
@@ -669,9 +671,9 @@ class RNNCell(RNNCellBase):
  
      Attributes:
          weight_ih: the learnable input-hidden weights, of shape
-            `(hidden_size x input_size)`
+            `(hidden_size, input_size)`
          weight_hh: the learnable hidden-hidden weights, of shape
-            `(hidden_size x hidden_size)`
+            `(hidden_size, hidden_size)`
          bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
          bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
  
@@ -736,12 +738,12 @@ class LSTMCell(RNNCellBase):
          h' = o * \tanh(c') \\
          \end{array}
  
-    where :math:`\sigma` is the sigmoid function.
+    where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
  
      Args:
          input_size: The number of expected features in the input `x`
          hidden_size: The number of features in the hidden state `h`
-        bias: If `False`, then the layer does not use bias weights `b_ih` and
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and
              `b_hh`. Default: ``True``
  
      Inputs: input, (h_0, c_0)
@@ -753,7 +755,7 @@ class LSTMCell(RNNCellBase):
  
            If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.
  
-    Outputs: h_1, c_1
+    Outputs: (h_1, c_1)
          - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state
            for each element in the batch
          - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state
@@ -761,9 +763,9 @@ class LSTMCell(RNNCellBase):
  
      Attributes:
          weight_ih: the learnable input-hidden weights, of shape
-            `(4*hidden_size x input_size)`
+            `(4*hidden_size, input_size)`
          weight_hh: the learnable hidden-hidden weights, of shape
-            `(4*hidden_size x hidden_size)`
+            `(4*hidden_size, hidden_size)`
          bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
          bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`
  
@@ -815,13 +817,13 @@ class GRUCell(RNNCellBase):
          h' = (1 - z) * n + z * h
          \end{array}
  
-    where :math:`\sigma` is the sigmoid function.
+    where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
  
      Args:
          input_size: The number of expected features in the input `x`
          hidden_size: The number of features in the hidden state `h`
-        bias: If `False`, then the layer does not use bias weights `b_ih` and
-            `b_hh`. Default: `True`
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and
+            `b_hh`. Default: ``True``
  
      Inputs: input, hidden
          - **input** of shape `(batch, input_size)`: tensor containing input features
@@ -844,9 +846,9 @@ class GRUCell(RNNCellBase):
  
      Attributes:
          weight_ih: the learnable input-hidden weights, of shape
-            `(3*hidden_size x input_size)`
+            `(3*hidden_size, input_size)`
          weight_hh: the learnable hidden-hidden weights, of shape
-            `(3*hidden_size x hidden_size)`
+            `(3*hidden_size, hidden_size)`
          bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)`
          bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)`
  
diff --git a/torch/nn/modules/sparse.py b/torch/nn/modules/sparse.py

index 054b0d8..0ee3c69 100644 (file)
--- a/torch/nn/modules/sparse.py
+++ b/torch/nn/modules/sparse.py
@@ -138,7 +138,7 @@ class Embedding(Module):
  
          Args:
              embeddings (Tensor): FloatTensor containing weights for the Embedding.
-                First dimension is being passed to Embedding as 'num_embeddings', second as 'embedding_dim'.
+                First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``.
              freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
                  Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True``
              padding_idx (int, optional): See module initialization documentation.
@@ -180,9 +180,9 @@ class EmbeddingBag(Module):
  
      For bags of constant length, this class
  
-        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``,
-        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``,
-        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``.
+        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``,
+        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``,
+        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``.
  
      However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
      operations.
@@ -203,27 +203,27 @@ class EmbeddingBag(Module):
                                   supported when ``mode="max"``.
  
      Attributes:
-        weight (Tensor): the learnable weights of the module of shape ``(num_embeddings x embedding_dim)``
+        weight (Tensor): the learnable weights of the module of shape `(num_embeddings, embedding_dim)`
                           initialized from :math:`\mathcal{N}(0, 1)`.
  
      Inputs: :attr:`input` (LongTensor) and :attr:`offsets` (LongTensor, optional)
  
-        - If :attr:`input` is 2D of shape ``B x N``,
+        - If :attr:`input` is 2D of shape `(B, N)`,
  
            it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and
            this will return ``B`` values aggregated in a way depending on the :attr:`mode`.
            :attr:`offsets` is ignored and required to be ``None`` in this case.
  
-        - If :attr:`input` is 1D of shape ``N``,
+        - If :attr:`input` is 1D of shape `(N)`,
  
            it will be treated as a concatenation of multiple bags (sequences).
            :attr:`offsets` is required to be a 1D tensor containing the
            starting index positions of each bag in :attr:`input`. Therefore,
-          for :attr:`offsets` of shape ``B``, :attr:`input` will be viewed as
+          for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as
            having ``B`` bags. Empty bags (i.e., having 0-length) will have
            returned vectors filled by zeros.
  
-    Output shape: ``B x embedding_dim``
+    Output shape: `(B, embedding_dim)`
  
      Examples::
  
diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py

index c394667..efc34ef 100644 (file)
--- a/torch/nn/utils/rnn.py
+++ b/torch/nn/utils/rnn.py
@@ -25,7 +25,7 @@ class PackedSequence(PackedSequence_):
  
          Batch sizes represent the number of elements at each sequence step in
          the batch, not the varying sequence lengths passed to
-        :func:`pack_padded_sequence`.  For instance, given data  ``abc`` and `x`
+        :func:`pack_padded_sequence`.  For instance, given data ``abc`` and ``x``
          the :class:`PackedSequence` would contain data ``axbc`` with
          ``batch_sizes=[2,1,1]``.
  
@@ -257,7 +257,7 @@ def pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_le
  
  
  def pad_sequence(sequences, batch_first=False, padding_value=0):
-    r"""Pad a list of variable length Tensors with zero
+    r"""Pad a list of variable length Tensors with ``padding_value``
  
      ``pad_sequence`` stacks a list of Tensors along a new dimension,
      and pads them to equal length. For example, if the input is a list of
author	ZhuBaohe <ehoabuhz@gmail.com>
	Tue, 19 Feb 2019 13:17:23 +0000 (05:17 -0800)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Tue, 19 Feb 2019 13:23:04 +0000 (05:23 -0800)
torch/nn/functional.py		patch \| blob \| history
torch/nn/modules/distance.py		patch \| blob \| history
torch/nn/modules/dropout.py		patch \| blob \| history
torch/nn/modules/linear.py		patch \| blob \| history
torch/nn/modules/rnn.py		patch \| blob \| history
torch/nn/modules/sparse.py		patch \| blob \| history
torch/nn/utils/rnn.py		patch \| blob \| history