class Adadelta(Optimizer):
- """Implements Adadelta algorithm.
-
- It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__.
+ r"""Implements Adadelta algorithm.
+
+ .. math::
+ \begin{aligned}
+ &\rule{110mm}{0.4pt} \\
+ &\textbf{input} : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)},
+ \: f(\theta) \text{ (objective)}, \: \rho \text{ (decay)},
+ \: \lambda \text{ (weight decay)} \\
+ &\textbf{initialize} : v_0 \leftarrow 0 \: \text{ (square avg)},
+ \: u_0 \leftarrow 0 \: \text{ (accumulate variables)} \\[-1.ex]
+ &\rule{110mm}{0.4pt} \\
+ &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\
+ &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\
+ &\hspace{5mm}\textbf{if} \: \lambda \neq 0 \\
+ &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\
+ &\hspace{5mm} v_t \leftarrow v_{t-1} \rho + g^2_t (1 - \rho) \\
+ &\hspace{5mm}\Delta x_t \leftarrow \frac{\sqrt{u_{t-1} +
+ \epsilon }}{ \sqrt{v_t + \epsilon} }g_t \hspace{21mm} \\
+ &\hspace{5mm} u_t \leftarrow u_{t-1} \rho +
+ \Delta x^2_t (1 - \rho) \\
+ &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \Delta x_t \\
+ &\rule{110mm}{0.4pt} \\[-1.ex]
+ &\textbf{return} \: \theta_t \\[-1.ex]
+ &\rule{110mm}{0.4pt} \\[-1.ex]
+ \end{aligned}
+
+ For further details regarding the algorithm we refer to `ADADELTA: An Adaptive Learning Rate Method`_.
+
Args:
params (iterable): iterable of parameters to optimize or dicts defining
to the parameters (default: 1.0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
- __ https://arxiv.org/abs/1212.5701
+ .. _ADADELTA\: An Adaptive Learning Rate Method:
+ https://arxiv.org/abs/1212.5701
"""
def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):