class Adadelta(Optimizer):
- """Implements Adadelta algorithm.
-
- It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__.
+ r"""Implements Adadelta algorithm.
+
+ .. math::
+ \begin{aligned}
+ &\rule{110mm}{0.4pt} \\
+ &\textbf{input} : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)},
+ \: f(\theta) \text{ (objective)}, \: \rho \text{ (decay)},
+ \: \lambda \text{ (weight decay)} \\
+ &\textbf{initialize} : v_0 \leftarrow 0 \: \text{ (square avg)},
+ \: u_0 \leftarrow 0 \: \text{ (accumulate variables)} \\[-1.ex]
+ &\rule{110mm}{0.4pt} \\
+ &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\
+ &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\
+ &\hspace{5mm}\textbf{if} \: \lambda \neq 0 \\
+ &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\
+ &\hspace{5mm} v_t \leftarrow v_{t-1} \rho + g^2_t (1 - \rho) \\
+ &\hspace{5mm}\Delta x_t \leftarrow \frac{\sqrt{u_{t-1} +
+ \epsilon }}{ \sqrt{v_t + \epsilon} }g_t \hspace{21mm} \\
+ &\hspace{5mm} u_t \leftarrow u_{t-1} \rho +
+ \Delta x^2_t (1 - \rho) \\
+ &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \Delta x_t \\
+ &\rule{110mm}{0.4pt} \\[-1.ex]
+ &\textbf{return} \: \theta_t \\[-1.ex]
+ &\rule{110mm}{0.4pt} \\[-1.ex]
+ \end{aligned}
+
+ For further details regarding the algorithm we refer to `ADADELTA: An Adaptive Learning Rate Method`_.
+
Args:
params (iterable): iterable of parameters to optimize or dicts defining
to the parameters (default: 1.0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
- __ https://arxiv.org/abs/1212.5701
+ .. _ADADELTA\: An Adaptive Learning Rate Method:
+ https://arxiv.org/abs/1212.5701
"""
def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):