From 8c3285bf114c6a1f0cdb6c2ffad4d7c45ad85d5e Mon Sep 17 00:00:00 2001
From: ZhuBaohe <ehoabuhz@gmail.com>
Date: Wed, 27 Mar 2019 10:15:20 -0700
Subject: [PATCH] Fix loss functions doc (#18420)

Summary:
Correct docstring display error on web page caused by my previous PR
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18420

Differential Revision: D14642467

Pulled By: soumith

fbshipit-source-id: 16fdd3301a4c5bad27fbcd8686f7fbfcc1e908ee
---
 torch/nn/modules/conv.py       | 27 ++++++++++++++-------------
 torch/nn/modules/loss.py       | 28 ++++++++++++++--------------
 torch/nn/modules/upsampling.py |  9 ++++-----
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py
index cc5ca78..12ba9ff 100644
--- a/torch/nn/modules/conv.py
+++ b/torch/nn/modules/conv.py
@@ -289,8 +289,8 @@ class Conv2d(_ConvNd):
 
     Attributes:
         weight (Tensor): the learnable weights of the module of shape
-                         :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},
-                               \text{kernel\_size[0]}, \text{kernel\_size[1]})`.
+                         :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
                          The values of these weights are sampled from
                          :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                          :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
@@ -393,9 +393,9 @@ class Conv3d(_ConvNd):
         where `K` is a positive integer, this operation is also termed in
         literature as depthwise convolution.
 
-         In other words, for an input of size :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`,
-         a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments
-         :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.
+        In other words, for an input of size :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`,
+        a depthwise convolution with a depthwise multiplier `K`, can be constructed by arguments
+        :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.
 
     .. include:: cudnn_deterministic.rst
 
@@ -428,8 +428,8 @@ class Conv3d(_ConvNd):
 
     Attributes:
         weight (Tensor): the learnable weights of the module of shape
-                         :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},
-                               \text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`.
+                         :math:`(\text{out\_channels}, \frac{\text{in\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`.
                          The values of these weights are sampled from
                          :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                          :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}`
@@ -613,8 +613,9 @@ class ConvTranspose1d(_ConvTransposeMixin, _ConvNd):
 
     Attributes:
         weight (Tensor): the learnable weights of the module of shape
-                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},
-                               \text{kernel\_size})`. The values of these weights are sampled from
+                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size})`.
+                         The values of these weights are sampled from
                          :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                          :math:`k = \frac{1}{C_\text{in} * \text{kernel\_size}}`
         bias (Tensor):   the learnable bias of the module of shape (out_channels).
@@ -735,8 +736,8 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd):
 
     Attributes:
         weight (Tensor): the learnable weights of the module of shape
-                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},
-                               \text{kernel\_size[0]}, \text{kernel\_size[1]})`.
+                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]})`.
                          The values of these weights are sampled from
                          :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                          :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{1}\text{kernel\_size}[i]}`
@@ -890,8 +891,8 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd):
 
     Attributes:
         weight (Tensor): the learnable weights of the module of shape
-                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},
-                               \text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`.
+                         :math:`(\text{in\_channels}, \frac{\text{out\_channels}}{\text{groups}},`
+                         :math:`\text{kernel\_size[0]}, \text{kernel\_size[1]}, \text{kernel\_size[2]})`.
                          The values of these weights are sampled from
                          :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                          :math:`k = \frac{1}{C_\text{in} * \prod_{i=0}^{2}\text{kernel\_size}[i]}`
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 3e5f087..2cb4069 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -356,8 +356,8 @@ class KLDivLoss(_Loss):
         and in the meantime, specifying either of those two args will override :attr:`reduction`.
 
     .. note::
-        :attr:``reduction`` = ``'mean'`` doesn't return the true kl divergence value, please use
-        :attr:``reduction`` = ``'batchmean'`` which aligns with KL math definition.
+        :attr:`reduction` = ``'mean'`` doesn't return the true kl divergence value, please use
+        :attr:`reduction` = ``'batchmean'`` which aligns with KL math definition.
         In the next major release, ``'mean'`` will be changed to be the same as ``'batchmean'``.
 
     Shape:
@@ -655,7 +655,7 @@ class HingeEmbeddingLoss(_Loss):
         - Input: :math:`(*)` where :math:`*` means, any number of dimensions. The sum operation
           operates over all the elements.
         - Target: :math:`(*)`, same shape as the input
-        - Output: scalar. If :attr:``reduction`` is ``'none'``, then same shape as the input
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
     """
     __constants__ = ['margin', 'reduction']
 
@@ -711,7 +711,7 @@ class MultiLabelMarginLoss(_Loss):
         - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C`
           is the number of classes.
         - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
-        - Output: scalar. If :attr:``reduction`` is ``'none'``, then :math:`(N)`.
+        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
 
     Examples::
 
@@ -739,7 +739,7 @@ class SmoothL1Loss(_Loss):
     r"""Creates a criterion that uses a squared term if the absolute
     element-wise error falls below 1 and an L1 term otherwise.
     It is less sensitive to outliers than the `MSELoss` and in some cases
-    prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
+    prevents exploding gradients (e.g. see `Fast R-CNN` paper by Ross Girshick).
     Also known as the Huber loss:
 
     .. math::
@@ -853,7 +853,7 @@ class CrossEntropyLoss(_WeightedLoss):
     with :math:`K \geq 1` for the `K`-dimensional case (described later).
 
     This criterion expects a class index in the range :math:`[0, C-1]` as the
-    `target`for each value of a 1D tensor of size `minibatch`; if `ignore_index`
+    `target` for each value of a 1D tensor of size `minibatch`; if `ignore_index`
     is specified, this criterion also accepts this class index (this index may not
     necessarily be in the class range).
 
@@ -1000,8 +1000,8 @@ class CosineEmbeddingLoss(_Loss):
 
     Args:
         margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
-        :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
-        default value is :math:`0`.
+            :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
+            default value is :math:`0`.
         size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
             the losses are averaged over each loss element in the batch. Note that for
             some losses, there are multiple elements per sample. If the field :attr:`size_average`
@@ -1032,7 +1032,7 @@ class CosineEmbeddingLoss(_Loss):
 @weak_module
 class MarginRankingLoss(_Loss):
     r"""Creates a criterion that measures the loss given
-    inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensor`s,
+    inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensors`,
     and a label 1D mini-batch tensor :math:`y` (containing 1 or -1).
 
     If :math:`y = 1` then it assumed the first input should be ranked higher
@@ -1146,8 +1146,8 @@ class TripletMarginLoss(_Loss):
     r"""Creates a criterion that measures the triplet loss given an input
     tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
     This is used for measuring a relative similarity between samples. A triplet
-    is composed by `a`, `p` and `n`: `anchor`, `positive examples` and `negative
-    examples` respectively. The shapes of all input tensors should be
+    is composed by `a`, `p` and `n` (i.e., `anchor`, `positive examples` and `negative
+    examples` respectively). The shapes of all input tensors should be
     :math:`(N, D)`.
 
     The distance swap is described in detail in the paper `Learning shallow
@@ -1224,9 +1224,9 @@ class CTCLoss(_Loss):
     Args:
         blank (int, optional): blank label. Default :math:`0`.
         reduction (string, optional): Specifies the reduction to apply to the output:
-            'none' | 'mean' | 'sum'. 'none': no reduction will be applied,
-            'mean': the output losses will be divided by the target lengths and
-            then the mean over the batch is taken. Default: 'mean'
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: ``'mean'``
         zero_infinity (bool, optional):
             Whether to zero infinite losses and the associated gradients.
             Default: ``False``
diff --git a/torch/nn/modules/upsampling.py b/torch/nn/modules/upsampling.py
index 261edcb..b41ad73 100644
--- a/torch/nn/modules/upsampling.py
+++ b/torch/nn/modules/upsampling.py
@@ -22,11 +22,10 @@ class Upsample(Module):
     calculate the output size. (You cannot give both, as it is ambiguous)
 
     Args:
-        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int],
-            optional): output spatial sizes
-        scale_factor (float or Tuple[float] or Tuple[float, float] or
-            Tuple[float, float, float], optional): multiplier for spatial size.
-            Has to match input size if it is a tuple.
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
+            output spatial sizes
+        scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
+            multiplier for spatial size. Has to match input size if it is a tuple.
         mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
             ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
             Default: ``'nearest'``
-- 
2.7.4