From d4b09dbab3cc9842b917ed5e74d8b901406502c3 Mon Sep 17 00:00:00 2001
From: Ilqar Ramazanli
Date: Thu, 9 Sep 2021 15:37:44 -0700
Subject: [PATCH] [doc][hackathon] To add Adagrad Optimizer to the documentation (#63254)

Summary:
It has been discussed before that adding descriptions of optimization
algorithms to the PyTorch core documentation may result in a nice
optimization research tutorial. The tracking issue
https://github.com/pytorch/pytorch/issues/63236 lists all the necessary
algorithms and links to the originally published papers.

In this PR we add a description of Adagrad to the documentation. For more
details, we refer to the paper http://jmlr.org/papers/v12/duchi11a.html

AdaGradAlgo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/63254

Reviewed By: albanD

Differential Revision: D30852139

Pulled By: iramazanli

fbshipit-source-id: 9e496560a97e92be8386585b01d9bd3bba4b0c66
---
 torch/optim/adagrad.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index 0ef4019..ccdd41e 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -4,9 +4,30 @@ from .optimizer import Optimizer
 
 
 class Adagrad(Optimizer):
-    """Implements Adagrad algorithm.
+    r"""Implements Adagrad algorithm.
 
-    It has been proposed in `Adaptive Subgradient Methods for Online Learning
+    .. math::
+       \begin{aligned}
+            &\rule{110mm}{0.4pt} \\
+            &\textbf{input} : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
+                \text{ (objective)}, \: \lambda \text{ (weight decay)}, \\
+            &\hspace{12mm} \tau \text{ (initial accumulator value)}, \: \eta \text{ (lr decay)} \\
+            &\textbf{initialize} : state\_sum_0 \leftarrow 0 \\[-1.ex]
+            &\rule{110mm}{0.4pt} \\
+            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\
+            &\hspace{5mm} g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\
+            &\hspace{5mm} \tilde{\gamma} \leftarrow \gamma / (1 + (t-1) \eta) \\
+            &\hspace{5mm} \textbf{if} \: \lambda \neq 0 \\
+            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\
+            &\hspace{5mm} state\_sum_t \leftarrow state\_sum_{t-1} + g^2_t \\
+            &\hspace{5mm} \theta_t \leftarrow
+                \theta_{t-1} - \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t} + \epsilon} \\
+            &\rule{110mm}{0.4pt} \\[-1.ex]
+            &\bf{return} \: \theta_t \\[-1.ex]
+            &\rule{110mm}{0.4pt} \\[-1.ex]
+       \end{aligned}
+
+    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
     and Stochastic Optimization`_.
 
     Args:
-- 
2.7.4
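
As a companion to the pseudocode added in the docstring above, here is a
minimal Python sketch of the same update rule, kept after the patch so the
diff itself stays intact. The function adagrad_step and its argument names
are illustrative only and are not part of the torch.optim API (the real
implementation is torch.optim.Adagrad); each line mirrors one step of the
math block.

import torch

def adagrad_step(param, grad, state_sum, step, lr=1e-2, lr_decay=0.0,
                 weight_decay=0.0, eps=1e-10):
    """One Adagrad update, applied to param in place; returns the new state_sum."""
    if weight_decay != 0:
        grad = grad + weight_decay * param           # g_t <- g_t + lambda * theta_{t-1}
    clr = lr / (1 + (step - 1) * lr_decay)           # gamma~ <- gamma / (1 + (t-1) * eta)
    state_sum = state_sum + grad * grad              # state_sum_t <- state_sum_{t-1} + g_t^2
    param -= clr * grad / (state_sum.sqrt() + eps)   # theta_t <- theta_{t-1} - gamma~ * g_t / (sqrt(state_sum_t) + eps)
    return state_sum

# Usage: drive a single parameter of f(theta) = theta^2 toward zero.
theta = torch.tensor([1.0], requires_grad=True)
state_sum = torch.zeros_like(theta)                  # state_sum_0 <- 0
for t in range(1, 101):
    loss = (theta ** 2).sum()
    loss.backward()
    with torch.no_grad():
        state_sum = adagrad_step(theta, theta.grad, state_sum, t, lr=0.5)
    theta.grad.zero_()
print(theta)  # decays toward 0

The real torch.optim.Adagrad additionally handles sparse gradients and
per-parameter state bookkeeping, which this sketch deliberately omits.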