From 7f9ab7f8c5e161562656604d9b22939b1f97c791 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Mon, 19 Mar 2018 10:37:00 -0700 Subject: [PATCH] Documentation tweaks and tests for GradientTape with graph execution. PiperOrigin-RevId: 189604536 --- tensorflow/python/eager/backprop.py | 76 +++++++++++--------------- tensorflow/python/eager/backprop_test.py | 25 ++++++--- tensorflow/python/framework/ops.py | 35 ++++++++---- tensorflow/python/ops/resource_variable_ops.py | 32 +++++------ tensorflow/python/ops/variables.py | 4 +- 5 files changed, 88 insertions(+), 84 deletions(-) diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index 88de1a9..9b997fe 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -638,63 +638,53 @@ _default_vspace = imperative_grad.VSpace( class GradientTape(object): - """Records operations to use to compute gradients. + """Record operations for automatic differentiation. - Operations are recorded if: - - they happen in code marked by this context manager - - at least one of their inputs is being watched + Operations are recorded if they are executed within this context manager and + at least one of their inputs is being "watched". - Outputs of recorded operations are watched. Variables are automatically - watched and tensors can be manually watched by calling the watch method on the - context manager. + Variables (created by @{tf.contrib.eager.Variable} or @{tf.get_variable}) + are automatically watched. Tensors can be manually watched by invoking the + `watch` + method on this context manager. - Example usage: + For example, consider the function `y = x * x`. The gradient at `x = 3.0` can + be computed as: ```python + x = tf.constant(3.) with tfe.GradientTape() as g: - x = tf.constant(3.0) g.watch(x) y = x * x - grad = g.gradient(y, [x])[0] - assert grad.numpy() == 6.0 + grad = g.gradient(y, [x])[0] # Will compute to 6.0 ``` - It is possible to use GradientTapes to compute higher-order derivatives as - follows: + GradientTapes can be nested to compute higher-order derivatives. For example, ```python + x = tf.constant(3.0) with tfe.GradientTape() as g: - x = tf.constant(3.0) - g.watch(x) - y = x * x with tfe.GradientTape() as gg: - gg.watch(y) - z = 2 * y - inner_grad = gg.gradient(z, [y])[0] - assert inner_grad.numpy() == 2 - y = y + inner_grad - grad = g.gradient(y, [x])[0] - assert grad.numpy() == 6.0 + gg.watch(x) + y = x * x + dy_dx = gg.gradient(y, [x])[0] # Will compute to 6.0 + d2y_dx2 = g.gradient(dy_dx, [x])[0] # Will compute to 2.0 ``` By default, the resources held by a GradientTape are released as soon as - GradientTape.gradient() method is called. However, if one need to compute - multiple gradients over the same computation, she can create a persistent - GradientTape. Persistent tapes allow multiple calls to the gradient() method - and release resources when the tape object is destructed. - - Example usage: + GradientTape.gradient() method is called. To compute multiple gradients over + the same computation, create a persistent gradient tape. This allows multiple + calls to the gradient() method as resources are released when the tape object + is garbage collected. 
For example:

   ```python
+  x = tf.constant(3.0)
   with tfe.GradientTape(persistent=True) as g:
-    x = tf.constant(3.0)
     g.watch(x)
     y = x * x
     z = y * y
-  dz_dx = g.gradient(z, [x])[0]
-  assert dz_dx.numpy() == 108.0   # 4*x^3 at x = 3
-  dy_dx = g.gradient(y, [x])[0]
-  assert dy_dx.numpy() == 6.0
+  dz_dx = g.gradient(z, [x])[0]  # 108.0 (4*x^3 at x = 3)
+  dy_dx = g.gradient(y, [x])[0]  # 6.0
   del g  # Drop the reference to the tape
   ```
   """
@@ -703,8 +693,8 @@ class GradientTape(object):

     Args:
       persistent: Boolean controlling whether a persistent gradient tape
-        is created. Must be True or False.
-
+        is created. False by default, which means at most one call can
+        be made to the gradient() method on this object.
     """
     self._tape = None
     self._persistent = persistent
@@ -720,7 +710,7 @@ class GradientTape(object):
     """Ensures that `tensor` is being traced by this tape.

     Args:
-      tensor: a Tensor or Variable a list of Tensors or Variables.
+      tensor: a Tensor or list of Tensors.
     """
     for t in nest.flatten(tensor):
       if isinstance(t, resource_variable_ops.ResourceVariable):
@@ -735,14 +725,14 @@ class GradientTape(object):
           key=lambda v: v.handle._id))  # pylint: disable=protected-access

   def gradient(self, target, sources, output_gradients=None):
-    """Computes the gradient using information traced by the tape.
+    """Computes the gradient using operations recorded in the context of this tape.

     Args:
-      target: the tensor to be differentiated.
-      sources: a list of Tensors or Variables, the target will be
-        differentiated with respect to the sources.
+      target: Tensor to be differentiated.
+      sources: a list of Tensors or Variables. `target` will be differentiated
+        against elements in `sources`.
       output_gradients: a list of gradients, one for each element of
-        target. Defaults to None.
+        target. Defaults to None.

     Returns:
       a list of Tensors (or IndexedSlices, or None), one for each element in
       `sources`.

     Raises:
       RuntimeError: if called inside the context of the tape, or if called more
-        than once.
+        than once on a non-persistent tape.
     """
     if self._tape is None:
       raise RuntimeError("GradientTape.gradient can only be called once "
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 5934293..bca2928 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -195,8 +195,10 @@ class BackpropTest(test.TestCase):
     g, = backprop.gradients_function(loss, [0])(logits, labels)
     self.assertAllEqual(g.numpy(), [[-0.5, 0.5]])

+  @test_util.run_in_graph_and_eager_modes()
   def testGradientWithinTapeBlock(self):
     v1 = resource_variable_ops.ResourceVariable(1.)
+ self.evaluate(v1.initializer) with backprop.GradientTape() as t: loss = 2 * v1 with self.assertRaises(RuntimeError): @@ -204,7 +206,7 @@ class BackpropTest(test.TestCase): with backprop.GradientTape(persistent=True) as t: loss = 2 * v1 grad = t.gradient(loss, [v1]) - self.assertAllEqual(grad[0], 2.0) + self.assertAllEqual(self.evaluate(grad[0]), 2.0) @test_util.assert_no_new_tensors def testSecondGrad(self): @@ -367,6 +369,7 @@ class BackpropTest(test.TestCase): self.assertEqual(backprop.implicit_grad(f)()[0][0], None) @test_util.assert_no_new_tensors + @test_util.run_in_graph_and_eager_modes() def testGradientTape(self): with backprop.GradientTape() as g: x = constant_op.constant(3.0) @@ -376,10 +379,10 @@ class BackpropTest(test.TestCase): gg.watch(y) z = 2 * y inner_grad = gg.gradient(z, [y])[0] - self.assertEqual(inner_grad.numpy(), 2.0) + self.assertEqual(self.evaluate(inner_grad), 2.0) y += inner_grad grad = g.gradient(y, [x])[0] - self.assertEqual(grad.numpy(), 6.0) + self.assertEqual(self.evaluate(grad), 6.0) @test_util.assert_no_new_tensors def testGradientTapeGradientCalledMultipleTimes(self): @@ -394,6 +397,7 @@ class BackpropTest(test.TestCase): g.gradient(y, [x]) @test_util.assert_no_new_tensors + @test_util.run_in_graph_and_eager_modes() def testPersistentTape(self): with backprop.GradientTape(persistent=True) as g: x = constant_op.constant(3.0) @@ -401,12 +405,13 @@ class BackpropTest(test.TestCase): y = x * x z = y * y dz_dx = g.gradient(z, [x])[0] - self.assertEqual(dz_dx.numpy(), 4*3*3*3) + self.assertEqual(self.evaluate(dz_dx), 4 * 3 * 3 * 3) dy_dx = g.gradient(y, [x])[0] - self.assertEqual(dy_dx.numpy(), 2*3) + self.assertEqual(self.evaluate(dy_dx), 2 * 3) del g @test_util.assert_no_new_tensors + @test_util.run_in_graph_and_eager_modes() def testPersistentNestedTape(self): with backprop.GradientTape(persistent=True) as g: x = constant_op.constant(3.0) @@ -417,22 +422,24 @@ class BackpropTest(test.TestCase): z = 2 * y for _ in range(2): inner_grad = gg.gradient(z, [y])[0] - self.assertEqual(inner_grad.numpy(), 2.0) + self.assertEqual(self.evaluate(inner_grad), 2.0) y += inner_grad del gg grad = g.gradient(y, [x])[0] - self.assertEqual(grad.numpy(), 6.0) + self.assertEqual(self.evaluate(grad), 6.0) grad = g.gradient(z, [x])[0] - self.assertEqual(grad.numpy(), 12.0) + self.assertEqual(self.evaluate(grad), 12.0) del g @test_util.assert_no_new_tensors + @test_util.run_in_graph_and_eager_modes() def testGradientTapeVariable(self): v = resource_variable_ops.ResourceVariable(1.0, name='v') + self.evaluate(v.initializer) with backprop.GradientTape() as g: y = v * v grad = g.gradient(y, [v])[0] - self.assertAllEqual(grad, 2.0) + self.assertAllEqual(self.evaluate(grad), 2.0) @test_util.assert_no_new_tensors def testEmptyParamsForValueAndGradFunction(self): diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 01a0e03..f1cd341 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -838,41 +838,51 @@ class _EagerTensorBase(Tensor): def set_shape(self, shape): if not self.shape.is_compatible_with(shape): raise ValueError( - "EagerTensor's shape %s is not compatible with supplied shape %s" % + "Tensor's shape %s is not compatible with supplied shape %s" % (self.shape, shape)) # Methods not supported / implemented for Eager Tensors. 
@property def op(self): - raise AttributeError("op not supported for Eager Tensors.") + raise AttributeError( + "Tensor.op is meaningless when eager execution is enabled.") @property def graph(self): - raise AttributeError("graph not supported for Eager Tensors.") + raise AttributeError( + "Tensor.graph is meaningless when eager execution is enabled.") @property def name(self): - raise AttributeError("name not supported for Eager Tensors.") + raise AttributeError( + "Tensor.name is meaningless when eager execution is enabled.") @property def value_index(self): - raise AttributeError("value_index not supported for Eager Tensors.") + raise AttributeError( + "Tensor.value_index is meaningless when eager execution is enabled.") def consumers(self): - raise NotImplementedError("consumers not supported for Eager Tensors.") + raise NotImplementedError( + "Tensor.consumers is meaningless when eager execution is enabled.") def _add_consumer(self, consumer): - raise NotImplementedError("_add_consumer not supported for Eager Tensors.") + raise NotImplementedError( + "_add_consumer not supported when eager execution is enabled.") def _as_node_def_input(self): raise NotImplementedError( - "_as_node_def_input not supported for Eager Tensors.") + "_as_node_def_input not supported when eager execution is enabled.") def _as_tf_output(self): - raise NotImplementedError("_as_tf_output not supported for Eager Tensors.") + raise NotImplementedError( + "_as_tf_output not supported when eager execution is enabled.") def eval(self, feed_dict=None, session=None): - raise NotImplementedError("eval not supported for Eager Tensors.") + raise NotImplementedError( + "eval is not supported when eager execution is enabled, " + "is .numpy() what you're looking for?" + ) # This call creates an EagerTensor class, as a subclass of _EagerTensorBase, and @@ -5937,8 +5947,9 @@ def get_from_proto_function(collection_name): def _assert_collection_is_ok(collection_name): if context.executing_eagerly(): if collection_name in GraphKeys._VARIABLE_COLLECTIONS: # pylint: disable=protected-access - raise ValueError("When Eager Execution is enabled, variable " - "collections are not supported.") + raise ValueError( + "variable collections are not supported when eager execution is enabled." + ) def _operation_conversion_error(op, dtype=None, name=None, as_ref=False): diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index affa7ae..df873da 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -149,7 +149,7 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None): class ResourceVariable(variables.Variable): """Variable based on resource handles. - See the @{$python/state_ops$`Variables`} documentation for more details. + See the @{$variables$Variables How To} for a high level overview. A `ResourceVariable` allows you to maintain state across subsequent calls to session.run. @@ -179,24 +179,20 @@ class ResourceVariable(variables.Variable): by edges in the graph. 
Consider the following example, in which two writes can cause tf.Variable and tf.ResourceVariable to behave differently: - ```python - a = tf.ResourceVariable(1.0) - a.initializer.run() - - assign = a.assign(2.0) - with tf.control_dependencies([assign]): - b = a.read_value() - with tf.control_dependencies([b]): - other_assign = a.assign(3.0) - with tf.control_dependencies([other_assign]): - # Will print 2.0 because the value was read before other_assign ran. If - # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed. - tf.Print(b, [b]).eval() + ```python + a = tf.ResourceVariable(1.0) + a.initializer.run() + + assign = a.assign(2.0) + with tf.control_dependencies([assign]): + b = a.read_value() + with tf.control_dependencies([b]): + other_assign = a.assign(3.0) + with tf.control_dependencies([other_assign]): + # Will print 2.0 because the value was read before other_assign ran. If + # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed. + tf.Print(b, [b]).eval() ``` - - To enforce these consistency properties tf.ResourceVariable might make more - copies than an equivalent tf.Variable under the hood, so tf.Variable is still - not deprecated. """ def __init__(self, diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 5b9947f..c37cdd9 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -125,8 +125,8 @@ class Variable(checkpointable.CheckpointableBase): @compatibility(eager) `tf.Variable` is not compatible with eager execution. Use - `tfe.Variable` instead which is compatible with both eager execution - and graph construction. See [the TensorFlow Eager Execution + `tf.contrib.eager.Variable` instead which is compatible with both eager + execution and graph construction. See [the TensorFlow Eager Execution guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers) for details on how variables work in eager execution. @end_compatibility -- 2.7.4
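As a self-contained illustration of what the new graph-mode tests exercise (a GradientTape recording ops during graph construction), here is a minimal sketch. The `tfe` import alias and the explicit `Session` are conventional TF 1.x usage assumed for the example; they are not part of this patch.

```python
import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Graph mode: eager execution is not enabled, so x, y and the gradient below
# are symbolic Tensors until they are evaluated in a Session.
x = tf.constant(3.0)
with tfe.GradientTape() as g:
  g.watch(x)  # constants are not watched automatically
  y = x * x
grad = g.gradient(y, [x])[0]  # a symbolic Tensor, not a concrete value

with tf.Session() as sess:
  print(sess.run(grad))  # 6.0
```

Under eager execution the same tape code yields a concrete value directly, which is why the tests use `self.evaluate(...)`: it runs the tensor in a Session in graph mode and simply extracts the value in eager mode.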