From ce4ae5bed9b47f49b085d9d8287cee2fcc5d42ac Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 23 Feb 2018 16:59:01 -0800 Subject: [PATCH] Checkpointable: compatibility mode with name-based saving Allows loading a name-based checkpoint using the object-based API. When graph building it's quite seamless. There's no restore-on-create for eager, so it would require program changes to do much useful there (i.e. is not seamless). Adds several tests for checkpoint compatibility (name->object in eager/graph, and eager->graph/graph->eager for object-based saving) PiperOrigin-RevId: 186844431 --- .../contrib/eager/python/checkpointable_utils.py | 98 +++++++++++++++--- .../eager/python/checkpointable_utils_test.py | 110 +++++++++++++++++++++ 2 files changed, 192 insertions(+), 16 deletions(-) diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py index e26ecc7..e57093b 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils.py +++ b/tensorflow/contrib/eager/python/checkpointable_utils.py @@ -27,6 +27,7 @@ from tensorflow.python.client import session as session_lib from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import control_flow_ops @@ -38,6 +39,7 @@ from tensorflow.python.training import checkpointable as core_checkpointable from tensorflow.python.training import checkpointable_utils as core_checkpointable_utils from tensorflow.python.training import optimizer as optimizer_lib from tensorflow.python.training import saver as saver_lib +from tensorflow.python.util import deprecation _ESCAPE_CHAR = "." # For avoiding conflicts with user-specified names. @@ -464,6 +466,41 @@ class InitializationOnlyStatus(_LoadStatus): session.run(gather_initializers(self._root_checkpointable)) +_DEPRECATED_RESTORE_INSTRUCTIONS = ( + "Restoring a name-based tf.train.Saver checkpoint using the object-based " + "restore API. This mode uses global names to match variables, and so is " + "somewhat fragile. It also adds new restore ops to the graph each time it " + "is called. Prefer re-encoding training checkpoints in the object-based " + "format: run save() on the object-based saver (the same one this message " + "is coming from) and use that checkpoint in the future.") + + +class NameBasedSaverStatus(_LoadStatus): + """Status for loading a name-based training checkpoint.""" + + def __init__(self, object_saver, save_path): + self._object_saver = object_saver + self._save_path = save_path + + def assert_consumed(self): + """Assertion for consistency with `CheckpointLoadStatus`. Always fails.""" + raise AssertionError( + "Restoring a name-based checkpoint. No load status is available.") + + @deprecation.deprecated( + date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS) + def run_restore_ops(self, session=None): + """Load the name-based training checkpoint using a new `tf.train.Saver`.""" + if session is None and context.in_graph_mode(): + session = ops.get_default_session() + saver_lib.Saver(self._object_saver._global_variable_names()).restore( # pylint: disable=protected-access + sess=session, save_path=self._save_path) + + def initialize_or_restore(self, session=None): + """Alias for `run_restore_ops`.""" + self.run_restore_ops(session=session) + + class _SessionWithFeedDictAdditions(session_lib.SessionInterface): """Pretends to be a session, inserts extra feeds on run().""" @@ -544,7 +581,7 @@ class Saver(object): Args: file_prefix: A prefix to use for the checkpoint filenames (/path/to/directory/and_a_prefix). Names are generated based on this - prefix and the global step, if provided. + prefix and `checkpoint_number`, if provided. checkpoint_number: An integer variable or Tensor, used to number checkpoints. Typically this value is saved along with other variables in training checkpoints, which will happen automatically if it was created @@ -598,6 +635,17 @@ class Saver(object): global_step=checkpoint_number) return save_path + def _global_variable_names(self): + """Generate a `tf.train.Saver`-style `var_list` using `variable.name`s.""" + named_saveables, graph_proto = _serialize_object_graph( + self._root_checkpointable) + saver_names = {} + for object_proto in graph_proto.nodes: + for attribute_proto in object_proto.attributes: + saver_names[attribute_proto.full_name] = named_saveables[ + attribute_proto.checkpoint_key] + return saver_names + def restore(self, save_path, session=None): """Restore a training checkpoint. @@ -633,11 +681,20 @@ class Saver(object): If the checkpoint has not been consumed completely, then the list of restore ops will grow as more objects are added to the dependency graph. + Name-based `tf.train.Saver` checkpoints can be loaded using this + method. There is no deferred loading, and names are used to match + variables. No restore ops are created/run until `run_restore_ops()` or + `initialize_or_restore()` are called on the returned status object, even + when executing eagerly. Re-encode name-based checkpoints using this + object-based `Saver.save` as soon as possible. + Args: save_path: The path to the checkpoint, as returned by `save` or `tf.train.latest_checkpoint`. If None (as when there is no latest checkpoint for `tf.train.latest_checkpoint` to return), returns an - object which may run initializers for objects in the dependency graph. + object which may run initializers for objects in the dependency + graph. If the checkpoint was written by the name-based `tf.train.Saver`, + names are used to match variables. session: The session to retrieve metadata with. Ignored when executing eagerly. If not provided when graph building, the default session is used. @@ -647,6 +704,9 @@ class Saver(object): status of checkpoint restoration and run initialization/restore ops (of type `CheckpointLoadStatus`, or `InitializationOnlyStatus` if `save_path` is `None`). + + If `save_path` points to a name-based checkpoint, a `NameBasedSaverStatus` + object is returned which runs restore ops from a name-based saver. """ if save_path is None: return InitializationOnlyStatus(self._root_checkpointable) @@ -660,21 +720,27 @@ class Saver(object): session = None file_prefix_tensor = constant_op.constant(save_path) file_prefix_feed_dict = None - if not in_graph_mode or self._object_graph_restore_tensor is None: - object_graph_string, = io_ops.restore_v2( - prefix=file_prefix_tensor, - tensor_names=[_OBJECT_GRAPH_PROTO_KEY], - shape_and_slices=[""], - dtypes=[dtypes.string], - name="object_graph_proto_read") + try: + if not in_graph_mode or self._object_graph_restore_tensor is None: + object_graph_string, = io_ops.restore_v2( + prefix=file_prefix_tensor, + tensor_names=[_OBJECT_GRAPH_PROTO_KEY], + shape_and_slices=[""], + dtypes=[dtypes.string], + name="object_graph_proto_read") + if in_graph_mode: + self._object_graph_restore_tensor = object_graph_string if in_graph_mode: - self._object_graph_restore_tensor = object_graph_string - if in_graph_mode: - object_graph_string = session.run( - self._object_graph_restore_tensor, - feed_dict=file_prefix_feed_dict) - else: - object_graph_string = object_graph_string.numpy() + object_graph_string = session.run( + self._object_graph_restore_tensor, + feed_dict=file_prefix_feed_dict) + else: + object_graph_string = object_graph_string.numpy() + except errors_impl.NotFoundError: + # The object graph proto does not exist in this checkpoint. Try again with + # name-based saving. + return NameBasedSaverStatus(self, save_path) + object_graph_proto = ( checkpointable_object_graph_pb2.CheckpointableObjectGraph()) object_graph_proto.ParseFromString(object_graph_string) diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py index 6b86d41..3d6a200 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py +++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py @@ -899,5 +899,115 @@ class CheckpointingTests(test.TestCase): saver.restore(save_path) self.assertEqual(before_ops, graph.get_operations()) + +class CheckpointCompatibilityTests(test.TestCase): + + def _initialized_model(self): + input_value = constant_op.constant([[3.]]) + network = MyNetwork() + optimizer = CheckpointableAdam(0.001) + optimizer_step = training_util.get_or_create_global_step() + root_checkpointable = Checkpoint( + optimizer=optimizer, network=network, optimizer_step=optimizer_step) + train_op = optimizer.minimize( + functools.partial(network, input_value), + global_step=optimizer_step) + self.evaluate(checkpointable_utils.gather_initializers( + root_checkpointable)) + self.evaluate(train_op) + # A regular variable, a slot variable, and a non-slot Optimizer variable + # with known values to check when loading. + self.evaluate(network._named_dense.bias.assign([1.])) + self.evaluate(optimizer.get_slot( + var=network._named_dense.bias, name="m").assign([2.])) + beta1_power, _ = optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(3.)) + return root_checkpointable + + def _set_sentinels(self, root_checkpointable): + self.evaluate(root_checkpointable.network._named_dense.bias.assign([101.])) + self.evaluate( + root_checkpointable.optimizer.get_slot( + var=root_checkpointable.network._named_dense.bias, name="m") + .assign([102.])) + beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators() + self.evaluate(beta1_power.assign(103.)) + + def _check_sentinels(self, root_checkpointable): + self.assertAllEqual( + [1.], self.evaluate(root_checkpointable.network._named_dense.bias)) + self.assertAllEqual([2.], self.evaluate( + root_checkpointable.optimizer.get_slot( + var=root_checkpointable.network._named_dense.bias, name="m"))) + beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators() + self.assertAllEqual(3., self.evaluate(beta1_power)) + + def _write_name_based_checkpoint(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.test_session( + graph=save_graph) as session: + root = self._initialized_model() + name_saver = core_saver.Saver() + return name_saver.save( + sess=session, save_path=checkpoint_prefix, + global_step=root.optimizer_step) + + @test_util.run_in_graph_and_eager_modes() + def testLoadFromNameBasedSaver(self): + """Save a name-based checkpoint, load it using the object-based API.""" + save_path = self._write_name_based_checkpoint() + root = self._initialized_model() + self._set_sentinels(root) + with self.assertRaises(AssertionError): + self._check_sentinels(root) + object_saver = checkpointable_utils.Saver(root) + status = object_saver.restore(save_path) + with self.assertRaises(AssertionError): + status.assert_consumed() + status.run_restore_ops() + self._check_sentinels(root) + self._set_sentinels(root) + status.initialize_or_restore() + self._check_sentinels(root) + + # TODO(allenl): Test for the core name-based saver loading object-based + # checkpoints once object-based checkpointing is in core. + + def testSaveGraphLoadEager(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.test_session( + graph=save_graph) as session: + root = self._initialized_model() + object_saver = checkpointable_utils.Saver(root) + save_path = object_saver.save( + session=session, file_prefix=checkpoint_prefix) + with context.eager_mode(): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed() + self._check_sentinels(root) + + def testSaveEagerLoadGraph(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + with context.eager_mode(): + root = self._initialized_model() + object_saver = checkpointable_utils.Saver(root) + save_path = object_saver.save(file_prefix=checkpoint_prefix) + with context.graph_mode(): + save_graph = ops.Graph() + with save_graph.as_default(), self.test_session( + graph=save_graph): + root = self._initialized_model() + self._set_sentinels(root) + root.restore(save_path).assert_consumed().run_restore_ops() + self._check_sentinels(root) + if __name__ == "__main__": test.main() -- 2.7.4