From c48e1679f9fe758abe8e3dd126ae87d4daef3af0 Mon Sep 17 00:00:00 2001
From: Jiyan Yang
Date: Wed, 17 Apr 2019 21:07:42 -0700
Subject: [PATCH] Add validator for optimizers when parameters are shared

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18497

Reviewed By: kennyhorror

Differential Revision: D14614738

fbshipit-source-id: beddd8349827dcc8ccae36f21e5d29627056afcd
---
 caffe2/python/layer_model_helper.py           | 64 ++++++++++++++++++++-
 caffe2/python/layer_parameter_sharing_test.py | 82 +++++++++++++++++++++++++++
 caffe2/python/optimizer.py                    |  8 +++
 3 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py
index e4da1f3..3807877 100644
--- a/caffe2/python/layer_model_helper.py
+++ b/caffe2/python/layer_model_helper.py
@@ -14,7 +14,7 @@ from caffe2.python.modeling.parameter_sharing import (
 )
 from caffe2.python.modeling.net_modifier import NetModifier
 
-from caffe2.python.optimizer import get_param_device
+from caffe2.python.optimizer import get_param_device, Optimizer
 from caffe2.python.regularizer import Regularizer, RegularizationBy
 from caffe2.python.layers import layers
 from caffe2.proto import caffe2_pb2
@@ -228,6 +228,66 @@ class LayerModelHelper(model_helper.ModelHelper):
                     scope.CurrentNameScope(), param_name, ref_shape, shape)
             )
 
+    def _validate_param_optim(self, param_name, optim):
+        # there are three possible values for optim:
+        # 1) None (which will use self._default_optimizer after this layer is instantiated)
+        # 2) self.NoOptim
+        # 3) an instance of the Optimizer class, such as AdagradOptimizer
+
+        # the parameter is not shared with any other parameter so far, so there is nothing to validate
+        if param_name not in self.param_to_optim:
+            return
+
+        logger.info("{} is shared with at least one other parameter. "
+                    "Validating that the same optimizer has been specified for all of them.".format(
+                        param_name,
+                    ))
+
+        ref_optim = self.param_to_optim[param_name]
+
+        if optim is None:
+            assert ref_optim == self._default_optimizer, (
+                "Optim for {} is None, which will fall back to the default_optimizer. "
+                "However, the optimizer that has been specified for this shared parameter "
+                "is {}, which is different from the default_optimizer {}. "
+                "Please check the optimizers specified for the parameters shared "
+                "with {} and the default_optimizer to ensure consistency.".format(
+                    param_name, ref_optim, self._default_optimizer, param_name
+                )
+            )
+        elif optim == self.NoOptim:
+            assert ref_optim == self.NoOptim, (
+                "Optim for {} is NoOptim. However, the optimizer for the parameters "
+                "shared with {} is {}, which is different from NoOptim. "
+                "Please check the optimizer specified for the other parameters in the "
+                "shared group to ensure consistency.".format(
+                    param_name, param_name, ref_optim
+                )
+            )
+        elif isinstance(optim, Optimizer):
+            assert isinstance(ref_optim, Optimizer), (
+                "Optim for {} is an instance of Optimizer. However, the optimizer "
+                "for the parameters shared with {} is {}, which is not an instance "
+                "of Optimizer. Please check the optimizer specified for the other "
+                "parameters in the shared group to ensure consistency.".format(
+                    param_name, param_name, ref_optim
+                )
+            )
+
+            assert type(optim) is type(ref_optim) and optim.attributes == ref_optim.attributes, (
+                "Optim for {} is an instance of Optimizer. However, the optimizer "
+                "for the parameters shared with {} is {}. "
" + "This optimizer either doesn't have the same type as the current optimizer: " + "{} vs {}, or its attributes such as learning rate are different from " + "that of current optimizer which is {} vs {}. " + "Please check the optimizer specified for other parameters in the " + "shared group to ensure consistency.".format( + param_name, param_name, ref_optim, type(optim), type(ref_optim), optim.attributes, ref_optim.attributes + ) + ) + else: + raise ValueError("optim should be either None, NoOptim, or an instance of Optimizer, Got {} ".format(optim)) + def create_param(self, param_name, shape, initializer, optimizer=None, ps_param=None, regularizer=None): if isinstance(param_name, core.BlobReference): @@ -270,6 +330,8 @@ class LayerModelHelper(model_helper.ModelHelper): self._validate_param_shape(param_name, shape) + self._validate_param_optim(param_name, optimizer) + self._param_to_shape[param_name] = shape return param diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py index 65c583c..5d87dbd 100644 --- a/caffe2/python/layer_parameter_sharing_test.py +++ b/caffe2/python/layer_parameter_sharing_test.py @@ -7,6 +7,7 @@ from caffe2.python import core, scope from caffe2.python.modeling.parameter_sharing import ( ParameterSharing, ) +from caffe2.python.optimizer import AdagradOptimizer, AdamOptimizer from caffe2.python.layer_test_util import LayersTestCase import six @@ -149,3 +150,84 @@ class ParameterSharingTest(LayersTestCase): sorted(op_outputs), ['global_scope/shared_fc/b', 'global_scope/shared_fc/w'] ) + + def test_layer_shared_parameter_optim_validator(self): + """ + This test is to cover the _validate_param_optim function in + layer_model_helper class. + """ + + output_dims = 2 + + adagrad_optim = AdagradOptimizer( + alpha=0.004, + epsilon=0.02, + ) + + self.model.default_optimizer = adagrad_optim + + # the following covers the branch -- optim is None + with scope.NameScope('global_scope_0'): + with ParameterSharing({'scope_1': 'scope_0'}): + with scope.NameScope('scope_0'): + fc1_output = self.model.FC( + self.model.input_feature_schema.float_features, + output_dims, + weight_optim=self.model.NoOptim, + ) + + with scope.NameScope('scope_1'), self.assertRaises(Exception): + fc2_output = self.model.FC( + self.model.input_feature_schema.float_features, + output_dims + ) + + # the following covers the branch -- optim is NoOptim + with scope.NameScope('global_scope_1'): + with ParameterSharing({'scope_1': 'scope_0'}): + with scope.NameScope('scope_0'): + fc1_output = self.model.FC( + self.model.input_feature_schema.float_features, + output_dims, + weight_optim=None, + ) + + with scope.NameScope('scope_1'), self.assertRaises(Exception): + fc2_output = self.model.FC( + self.model.input_feature_schema.float_features, + output_dims, + weight_optim=self.model.NoOptim, + ) + + # the following covers the branch -- optim is an instance of Optimizer + adagrad_optim_2 = AdagradOptimizer( + alpha=0.005, + epsilon=0.02, + ) + + adam_optim = AdamOptimizer() + + self.model.default_optimizer = adagrad_optim_2 + + with scope.NameScope('global_scope_2'): + with ParameterSharing({'scope_1': 'scope_0', 'scope_2': 'scope_0'}): + with scope.NameScope('scope_0'): + fc1_output = self.model.FC( + self.model.input_feature_schema.float_features, + output_dims, + weight_optim=None, # it will use adagrad_optim_2 + ) + + with scope.NameScope('scope_1'), self.assertRaises(Exception): + fc2_output = self.model.FC( + self.model.input_feature_schema.float_features, + 
+                        output_dims,
+                        weight_optim=adagrad_optim,
+                    )
+
+                with scope.NameScope('scope_2'), self.assertRaises(Exception):
+                    fc2_output = self.model.FC(
+                        self.model.input_feature_schema.float_features,
+                        output_dims,
+                        weight_optim=adam_optim,
+                    )
diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index 0aa0201..8a7540f 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -9,6 +9,7 @@ from collections import namedtuple, defaultdict
 from past.builtins import basestring
 
 import logging
+import copy
 
 import numpy as np
 
@@ -72,6 +73,13 @@ class Optimizer(object):
             classname, self._instance_num, base_str, node_name, gpu_id,
         )
 
+    @property
+    def attributes(self):
+        # return a dict that contains attributes related to init args only
+        attr = copy.deepcopy(self.__dict__)
+        del attr['_instance_num']
+        return attr
+
     def make_unique_blob_name(self, base_str):
         """
         Returns a blob name that will be unique to the current device
-- 
2.7.4
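
Illustration (not part of the patch): a minimal sketch of the comparison that _validate_param_optim performs through the new Optimizer.attributes property. It assumes caffe2 is importable and uses only the optimizers exercised in the test above; the variable names are made up for the example.

from caffe2.python.optimizer import AdagradOptimizer, AdamOptimizer

opt_a = AdagradOptimizer(alpha=0.004, epsilon=0.02)
opt_b = AdagradOptimizer(alpha=0.004, epsilon=0.02)  # same init args, new instance
opt_c = AdagradOptimizer(alpha=0.005, epsilon=0.02)  # different learning rate

# _instance_num differs between opt_a and opt_b, but it is excluded from
# `attributes`, so optimizers built with identical init args compare equal.
assert type(opt_a) is type(opt_b) and opt_a.attributes == opt_b.attributes

# A different learning rate (or a different optimizer class) fails the check,
# which is what the validator asserts for a shared parameter.
assert opt_a.attributes != opt_c.attributes
assert type(opt_a) is not type(AdamOptimizer())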