From 6d014ecbd63fec208742b327b94c39afd4953fb8 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 2 Mar 2018 15:11:13 -0800
Subject: [PATCH] ReadVariableOp in C for eager (only for the fastpath)

PiperOrigin-RevId: 187676012
---
 tensorflow/python/eager/benchmarks_test.py     |  21 ++
 tensorflow/python/eager/pywrap_tfe.h           |   7 +
 tensorflow/python/eager/pywrap_tfe_src.cc      | 460 +++++++++++++++++--------
 tensorflow/python/eager/pywrap_tfe_test.py     |  31 ++
 tensorflow/python/ops/resource_variable_ops.py |   4 +
 tensorflow/python/pywrap_tfe.i                 |   1 +
 6 files changed, 377 insertions(+), 147 deletions(-)

diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 527a919..551d564 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -275,6 +275,16 @@ class MicroBenchmarks(test.Benchmark):
   def _benchmark_read_variable(self, m, num_iters):
     self._run(m.value, num_iters)
 
+  def _benchmark_matmul_read_variable(self, m, num_iters):
+    self._benchmark_gen_math_ops_matmul(
+        m, transpose_b=False, num_iters=num_iters)
+
+  def _benchmark_matmul_read_variable_with_tape(self, m, num_iters):
+    with backprop.GradientTape() as tape:
+      tape.watch(m)
+      self._benchmark_gen_math_ops_matmul(
+          m, transpose_b=False, num_iters=num_iters)
+
   def _benchmark_read_variable_with_tape(self, m, num_iters):
     with backprop.GradientTape() as tape:
       tape.watch(m)
@@ -416,6 +426,17 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_matmul_read_variable_op_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_matmul_read_variable(m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_matmul_read_variable_op_with_tape_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_matmul_read_variable_with_tape(
+          m, num_iters=self._num_iters_2_by_2)
+
   def benchmark_read_variable_op_2_by_2_CPU(self):
     with context.device(CPU):
       m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index b1b4a6b..32d731d 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -51,6 +51,13 @@ void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
 // This function is not thread-safe.
 PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
 
+// Registers e as the type of the ResourceVariable class.
+// Returns Py_None if registration succeeds, else throws a TypeError and returns
+// NULL.
+//
+// This function is not thread-safe.
+PyObject* TFE_Py_RegisterResourceVariableType(PyObject* e);
+
 // Registers e as the Exception to be raised when the conditions of
 // TFE_Py_FastPathExecute_C have not been met. When this exception is set, it
 // is a signal to the calling code that it should fall back to the safer (and
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 42d97df..27c9d05 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -38,6 +38,23 @@ using tensorflow::strings::Printf;
 
 namespace {
 
+struct FastPathOpExecInfo {
+  TFE_Context* ctx;
+  const char* device_name;
+  // The op def of the main op being executed.
+  const tensorflow::OpDef* op_def;
+
+  bool run_callbacks;
+  bool run_post_exec_callbacks;
+  bool run_gradient_callback;
+
+  // The op name of the main op being executed.
+  PyObject* name;
+  // The op type name of the main op being executed.
+  PyObject* op_name;
+  PyObject* callbacks;
+};
+
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)                       \
   bool fn_name(const string& key, PyObject* py_value, TF_Status* status,     \
                type* value) {                                                \
@@ -120,6 +137,11 @@ bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
 
   PyObject* py_type_enum = PyObject_GetAttrString(py_value, "_type_enum");
   if (py_type_enum == nullptr) {
+    TF_SetStatus(
+        status, TF_INVALID_ARGUMENT,
+        tensorflow::strings::StrCat("Expecting a DType.dtype for attr ", key,
+                                    ", got ", py_value->ob_type->tp_name)
+            .c_str());
     return false;
   }
 
@@ -580,6 +602,8 @@ PyObject* fallback_exception_class = nullptr;
 // Python function that returns a backward_function.
 PyObject* backward_function_getter = nullptr;
 
+PyTypeObject* resource_variable_type = nullptr;
+
 tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
 tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
 
@@ -628,11 +652,28 @@ PyObject* TFE_Py_RegisterExceptionClass(PyObject* e) {
                     "TFE_Py_RegisterExceptionClass: "
                     "Registered class should be subclass of Exception.");
     return nullptr;
-  } else {
-    Py_INCREF(e);
-    exception_class = e;
-    Py_RETURN_NONE;
   }
+
+  Py_INCREF(e);
+  exception_class = e;
+  Py_RETURN_NONE;
+}
+
+PyObject* TFE_Py_RegisterResourceVariableType(PyObject* e) {
+  if (!PyType_Check(e)) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "TFE_Py_RegisterResourceVariableType: Need to register a type.");
+    return nullptr;
+  }
+
+  // Take the new reference before dropping the old one so that re-registering
+  // the currently registered type can never release its last reference.
+  Py_INCREF(e);
+  Py_XDECREF(resource_variable_type);
+
+  resource_variable_type = reinterpret_cast<PyTypeObject*>(e);
+  Py_RETURN_NONE;
 }
 
 PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e) {
@@ -1375,8 +1416,12 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
-bool CheckEagerTensors(PyObject* seq, int start_index,
-                       const tensorflow::OpDef& op_def) {
+bool CheckResourceVariable(PyObject* item) {
+  return PyObject_TypeCheck(item, resource_variable_type);
+}
+
+bool CheckInputsOk(PyObject* seq, int start_index,
+                   const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
     PyObject* item = PyTuple_GET_ITEM(seq, i + start_index);
     if (!op_def.input_arg(i).number_attr().empty() ||
@@ -1384,9 +1429,13 @@ bool CheckEagerTensors(PyObject* seq, int start_index,
       // This item should be a list input.
       if (!PyList_Check(item)) return false;
       for (Py_ssize_t j = 0; j < PyList_Size(item); j++) {
-        if (!EagerTensor_CheckExact(PyList_GET_ITEM(item, j))) return false;
+        PyObject* inner_item = PyList_GET_ITEM(item, j);
+        if (!EagerTensor_CheckExact(inner_item) &&
+            !CheckResourceVariable(inner_item)) {
+          return false;
+        }
       }
-    } else if (!EagerTensor_CheckExact(item)) {
+    } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
       return false;
     }
   }
@@ -1394,71 +1443,6 @@ bool CheckEagerTensors(PyObject* seq, int start_index,
   return true;
 }
 
-// Adds input and type attr to the op, and to the list of flattened
-// inputs/attrs.
-bool AddInputToOp(PyObject* input, const tensorflow::OpDef::ArgDef* input_arg,
-                  std::vector<PyObject*>* flattened_attrs,
-                  std::vector<PyObject*>* flattened_inputs, TFE_Op* op,
-                  TF_Status* status) {
-  TFE_TensorHandle* input_handle = EagerTensor_Handle(input);
-  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
-    auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
-    if (flattened_attrs != nullptr) {
-      flattened_attrs->push_back(
-          GetPythonObjectFromString(input_arg->type_attr().data()));
-      flattened_attrs->push_back(PyLong_FromLong(dtype));
-    }
-  }
-
-  if (flattened_inputs != nullptr) {
-    flattened_inputs->push_back(input);
-  }
-  TFE_OpAddInput(op, input_handle, status);
-  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
-    return false;
-  }
-  return true;
-}
-
-const tensorflow::OpDef* GetOpDef(PyObject* py_op_name) {
-  const char* op_name = TFE_GetPythonString(py_op_name);
-  if (op_name == nullptr) {
-    PyErr_SetString(PyExc_TypeError,
-                    Printf("expected a string for op_name, got %s instead",
-                           py_op_name->ob_type->tp_name)
-                        .c_str());
-    return nullptr;
-  }
-
-  const tensorflow::OpRegistrationData* op_reg_data = nullptr;
-  const tensorflow::Status lookup_status =
-      tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data);
-  if (MaybeRaiseExceptionFromStatus(lookup_status, nullptr)) {
-    return nullptr;
-  }
-  return &op_reg_data->op_def;
-}
-
-const char* GetDeviceName(PyObject* py_device_name) {
-  if (py_device_name != Py_None) {
-    return TFE_GetPythonString(py_device_name);
-  }
-  return nullptr;
-}
-
-bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
-  if (!PyList_Check(list)) {
-    PyErr_SetString(PyExc_TypeError,
-                    Printf("expected a list for attr %s, got %s instead",
-                           attr_name.data(), list->ob_type->tp_name)
-                        .data());
-
-    return false;
-  }
-  return true;
-}
-
 bool OpDoesntRequireOutput(const string& op_name) {
   static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
       new tensorflow::gtl::FlatSet<string>({
@@ -1583,7 +1567,6 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
       break;
     }
   }
-
   if (!should_record) Py_RETURN_NONE;
 
   string c_op_name = TFE_GetPythonString(op_name);
@@ -1617,50 +1600,236 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
   Py_RETURN_NONE;
 }
 
-bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
-                  const tensorflow::OpDef* op_def, PyObject* args,
-                  const std::vector<PyObject*>& flattened_inputs,
-                  const std::vector<PyObject*>& flattened_attrs,
-                  PyObject* flattened_result, PyObject* op_name, PyObject* name,
-                  PyObject* callbacks) {
-  tensorflow::Safe_PyObjectPtr inputs =
-      tensorflow::make_safe(PyTuple_New(flattened_inputs.size()));
+void MaybeWatchVariable(PyObject* input) {
+  DCHECK(CheckResourceVariable(input));
+  DCHECK(PyObject_HasAttrString(input, "_trainable"));
+
+  tensorflow::Safe_PyObjectPtr trainable(
+      PyObject_GetAttrString(input, "_trainable"));
+  if (trainable.get() == Py_False) return;
+  TFE_Py_TapeSetWatchVariable(input);
+}
+
+// Reads the value of the ResourceVariable `input` by executing a
+// ReadVariableOp on the parent op's context and device, and stores the
+// resulting EagerTensor in `output`. If the parent op records gradients, the
+// read is recorded as well.
+//
+// Returns false with a Python exception set on failure.
+bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
+                    PyObject* input, tensorflow::Safe_PyObjectPtr* output,
+                    TF_Status* status) {
+  MaybeWatchVariable(input);
+
+  TFE_Op* op = TFE_NewOp(parent_op_exec_info.ctx, "ReadVariableOp", status);
+  auto cleaner = tensorflow::gtl::MakeCleanup([op] { TFE_DeleteOp(op); });
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  // Set dtype
+  DCHECK(PyObject_HasAttrString(input, "_dtype"));
+  tensorflow::Safe_PyObjectPtr dtype(PyObject_GetAttrString(input, "_dtype"));
+  // A failed attribute lookup has already set a Python error.
+  if (dtype.get() == nullptr) return false;
+  int value;
+  if (!ParseTypeValue("_dtype", dtype.get(), status, &value)) {
+    // Failure is reported through `status`; surface it as a Python exception,
+    // replacing any error left pending by an attribute lookup.
+    PyErr_Clear();
+    MaybeRaiseExceptionFromTFStatus(status, nullptr);
+    return false;
+  }
+  TFE_OpSetAttrType(op, "dtype", static_cast<TF_DataType>(value));
+
+  TFE_OpSetDevice(op, parent_op_exec_info.device_name, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  // Get handle
+  tensorflow::Safe_PyObjectPtr handle(PyObject_GetAttrString(input, "_handle"));
+  // A failed attribute lookup has already set a Python error.
+  if (handle.get() == nullptr) return false;
+  if (!EagerTensor_CheckExact(handle.get())) {
+    PyErr_SetString(PyExc_TypeError,
+                    "Expected the _handle attribute of a ResourceVariable to "
+                    "be an EagerTensor.");
+    return false;
+  }
+  TFE_OpAddInput(op, EagerTensor_Handle(handle.get()), status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  int num_retvals = 1;
+  TFE_TensorHandle* output_handle;
+  TFE_Execute(op, &output_handle, &num_retvals, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) return false;
+
+  // Always create the py object (and correctly DECREF it) from the returned
+  // value, else the data will leak.
+  output->reset(EagerTensorFromHandle(output_handle));
+  // Bail out rather than dereference a null object below or in the caller.
+  if (output->get() == nullptr) return false;
+
+  // TODO(nareshmodi): Should we run post exec callbacks here?
+  if (parent_op_exec_info.run_gradient_callback) {
+    tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(1));
+    PyTuple_SET_ITEM(inputs.get(), 0, handle.release());
+
+    tensorflow::Safe_PyObjectPtr outputs(PyTuple_New(1));
+    Py_INCREF(output->get());  // stay alive after since tuple steals.
+    PyTuple_SET_ITEM(outputs.get(), 0, output->get());
+
+    // GetPythonObjectFromString and RecordGradient both hand back new
+    // references; own them so neither is leaked.
+    tensorflow::Safe_PyObjectPtr op_string(
+        GetPythonObjectFromString("ReadVariableOp"));
+    tensorflow::Safe_PyObjectPtr record_result(RecordGradient(
+        op_string.get(), inputs.get(), Py_None, outputs.get(), Py_None));
+    if (record_result.get() == nullptr) return false;
+  }
+
+  return true;
+}
+
+// Supports only 2 cases at the moment:
+//  i) input is an EagerTensor - it is returned as-is with a new reference.
+//  ii) input is a ResourceVariable - in this case, the variable is read via
+//  ReadVariableOp and the resulting EagerTensor is returned.
+bool ConvertToTensor(const FastPathOpExecInfo& op_exec_info, PyObject* input,
+                     tensorflow::Safe_PyObjectPtr* output_handle,
+                     TF_Status* status) {
+  if (CheckResourceVariable(input)) {
+    return ReadVariableOp(op_exec_info, input, output_handle, status);
+  }
+
+  Py_INCREF(input);
+  output_handle->reset(input);
+
+  return true;
+}
+
+// Adds input and type attr to the op, and to the list of flattened
+// inputs/attrs.
+bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
+                  const tensorflow::OpDef::ArgDef* input_arg,
+                  std::vector<tensorflow::Safe_PyObjectPtr>* flattened_attrs,
+                  std::vector<tensorflow::Safe_PyObjectPtr>* flattened_inputs,
+                  TFE_Op* op, TF_Status* status) {
+  // py_eager_tensor's ownership is transferred to flattened_inputs if it is
+  // required, else the object is destroyed and DECREF'd when the object goes
+  // out of scope in this function.
+  tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
+
+  if (!ConvertToTensor(op_exec_info, input, &py_eager_tensor, status)) {
+    return false;
+  }
+
+  TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
+
+  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+    auto dtype = TFE_TensorHandleDataType(input_handle);
+    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    if (flattened_attrs != nullptr) {
+      flattened_attrs->emplace_back(
+          GetPythonObjectFromString(input_arg->type_attr().data()));
+      flattened_attrs->emplace_back(PyLong_FromLong(dtype));
+    }
+  }
+
+  if (flattened_inputs != nullptr) {
+    flattened_inputs->emplace_back(std::move(py_eager_tensor));
+  }
+
+  TFE_OpAddInput(op, input_handle, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+    return false;
+  }
+
+  return true;
+}
+
+const tensorflow::OpDef* GetOpDef(PyObject* py_op_name) {
+  const char* op_name = TFE_GetPythonString(py_op_name);
+  if (op_name == nullptr) {
+    PyErr_SetString(PyExc_TypeError,
+                    Printf("expected a string for op_name, got %s instead",
+                           py_op_name->ob_type->tp_name)
+                        .c_str());
+    return nullptr;
+  }
+
+  const tensorflow::OpRegistrationData* op_reg_data = nullptr;
+  const tensorflow::Status lookup_status =
+      tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data);
+  if (MaybeRaiseExceptionFromStatus(lookup_status, nullptr)) {
+    return nullptr;
+  }
+  return &op_reg_data->op_def;
+}
+
+const char* GetDeviceName(PyObject* py_device_name) {
+  if (py_device_name != Py_None) {
+    return TFE_GetPythonString(py_device_name);
+  }
+  return nullptr;
+}
+
+bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
+  if (!PyList_Check(list)) {
+    PyErr_SetString(PyExc_TypeError,
+                    Printf("expected a list for attr %s, got %s instead",
+                           attr_name.data(), list->ob_type->tp_name)
+                        .data());
+
+    return false;
+  }
+  return true;
+}
+
+bool RunCallbacks(
+    const FastPathOpExecInfo& op_exec_info, PyObject* args,
+    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_inputs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_attrs,
+    PyObject* flattened_result) {
+  if (!op_exec_info.run_callbacks) return true;
+
+  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs.size()));
   for (int i = 0; i < flattened_inputs.size(); i++) {
-    PyObject* input = flattened_inputs[i];
+    PyObject* input = flattened_inputs[i].get();
     Py_INCREF(input);
     PyTuple_SET_ITEM(inputs.get(), i, input);
   }
 
   int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
-                               op_def->input_arg_size() -
+                               op_exec_info.op_def->input_arg_size() -
                                kFastPathExecuteInputStartIndex;
   int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
-  tensorflow::Safe_PyObjectPtr attrs =
-      tensorflow::make_safe(PyTuple_New(num_attrs));
+  tensorflow::Safe_PyObjectPtr attrs(PyTuple_New(num_attrs));
 
   for (int i = 0; i < num_non_inferred_attrs; i++) {
-    auto* attr = PyTuple_GET_ITEM(
-        args, kFastPathExecuteInputStartIndex + op_def->input_arg_size() + i);
+    auto* attr =
+        PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex +
+                                   op_exec_info.op_def->input_arg_size() + i);
     Py_INCREF(attr);
     PyTuple_SET_ITEM(attrs.get(), i, attr);
   }
   for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
-    // Not INCREFing anything in flattened_attrs as each of those is a new
-    // reference, so allow the attrs tuple to steal the reference.
-    PyTuple_SET_ITEM(attrs.get(), i,
-                     flattened_attrs.at(i - num_non_inferred_attrs));
+    PyObject* attr_or_name =
+        flattened_attrs.at(i - num_non_inferred_attrs).get();
+    Py_INCREF(attr_or_name);
+    PyTuple_SET_ITEM(attrs.get(), i, attr_or_name);
   }
 
-  if (run_gradient_callback) {
-    RecordGradient(op_name, inputs.get(), attrs.get(), flattened_result, name);
+  if (op_exec_info.run_gradient_callback) {
+    if (!RecordGradient(op_exec_info.op_name, inputs.get(), attrs.get(),
+                        flattened_result, op_exec_info.name)) {
+      return false;
+    }
   }
 
-  if (run_post_exec_callbacks) {
-    tensorflow::Safe_PyObjectPtr callback_args = tensorflow::make_safe(
-        Py_BuildValue("OOOOO", op_name, inputs.get(), attrs.get(),
-                      flattened_result, name));
-    for (Py_ssize_t i = 0; i < PyList_Size(callbacks); i++) {
-      PyObject* callback_fn = PyList_GET_ITEM(callbacks, i);
+  if (op_exec_info.run_post_exec_callbacks) {
+    tensorflow::Safe_PyObjectPtr callback_args(
+        Py_BuildValue("OOOOO", op_exec_info.op_name, inputs.get(), attrs.get(),
+                      flattened_result, op_exec_info.name));
+    for (Py_ssize_t i = 0; i < PyList_Size(op_exec_info.callbacks); i++) {
+      PyObject* callback_fn = PyList_GET_ITEM(op_exec_info.callbacks, i);
       if (!PyCallable_Check(callback_fn)) {
         PyErr_SetString(
             PyExc_TypeError,
@@ -1695,14 +1840,30 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  TFE_Context* ctx = reinterpret_cast<TFE_Context*>(
+  FastPathOpExecInfo op_exec_info;
+
+  op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
-  const char* device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
-  PyObject* op_name = PyTuple_GET_ITEM(args, 2);
-  const tensorflow::OpDef* op_def = GetOpDef(op_name);
-  if (op_def == nullptr) return nullptr;
-  PyObject* name = PyTuple_GET_ITEM(args, 3);
-  PyObject* callbacks = PyTuple_GET_ITEM(args, 4);
+  op_exec_info.device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
+  op_exec_info.op_name = PyTuple_GET_ITEM(args, 2);
+  op_exec_info.op_def = GetOpDef(op_exec_info.op_name);
+  if (op_exec_info.op_def == nullptr) return nullptr;
+  op_exec_info.name = PyTuple_GET_ITEM(args, 3);
+  op_exec_info.callbacks = PyTuple_GET_ITEM(args, 4);
+
+  const tensorflow::OpDef* op_def = op_exec_info.op_def;
+
+  // TODO(nareshmodi): Add a benchmark for the fast-path with gradient callbacks
+  // (similar to benchmark_tf_gradient_function_*). Also consider using an
+  // InlinedVector for flattened_attrs and flattened_inputs if the benchmarks
+  // point out problems with heap allocs.
+  op_exec_info.run_gradient_callback =
+      !*ThreadTapeIsStopped() && !GetTapeSet()->empty();
+  op_exec_info.run_post_exec_callbacks =
+      op_exec_info.callbacks != Py_None &&
+      PyList_Size(op_exec_info.callbacks) > 0;
+  op_exec_info.run_callbacks = op_exec_info.run_gradient_callback ||
+                               op_exec_info.run_post_exec_callbacks;
 
   if (args_size < kFastPathExecuteInputStartIndex + op_def->input_arg_size()) {
     PyErr_SetString(
@@ -1715,7 +1876,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  if (!CheckEagerTensors(args, kFastPathExecuteInputStartIndex, *op_def)) {
+  if (!CheckInputsOk(args, kFastPathExecuteInputStartIndex, *op_def)) {
     RaiseFallbackException(
         "This function does not handle the case of the path where "
         "all inputs are not already EagerTensors.");
@@ -1723,7 +1884,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
   }
 
   TF_Status* status = TF_NewStatus();
-  TFE_Op* op = TFE_NewOp(ctx, op_def->name().c_str(), status);
+  TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
     TF_DeleteStatus(status);
     TFE_DeleteOp(op);
@@ -1750,8 +1911,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     // OpRegistrationData.
     for (const auto& attr : op_def->attr()) {
       if (attr_name == attr.name()) {
-        SetOpAttrWithDefaults(ctx, op, attr, attr_name.data(), py_attr_value,
-                              &attr_list_sizes, status);
+        SetOpAttrWithDefaults(op_exec_info.ctx, op, attr, attr_name.data(),
+                              py_attr_value, &attr_list_sizes, status);
 
         if (TF_GetCode(status) != TF_OK) {
           RaiseFallbackException(TF_Message(status));
@@ -1763,33 +1924,28 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     }
   }
 
-  TFE_OpSetDevice(op, device_name, status);
+  TFE_OpSetDevice(op, op_exec_info.device_name, status);
   if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
     return nullptr;
   }
 
-  // TODO(nareshmodi): Add a benchmark for the fast-path with gradient callbacks
-  // (similar to benchmark_tf_gradient_function_*). Also consider using an
-  // InlinedVector for flattened_attrs and flattened_inputs if the benchmarks
-  // point out problems with heap allocs.
-  bool run_gradient_callback =
-      !*ThreadTapeIsStopped() && !GetTapeSet()->empty();
-  bool run_post_exec_callbacks =
-      callbacks != Py_None && PyList_Size(callbacks) > 0;
-  bool run_callbacks = run_gradient_callback || run_post_exec_callbacks;
   // Flat attrs and inputs as required by the record_gradient call. The attrs
   // here only contain inferred attrs (non-inferred attrs are added directly
   // from the input args).
-  // All items in flattened_attrs contain new references.
-  // All items in flattened_inputs contain borrowed references.
+  // All items in flattened_attrs and flattened_inputs contain
+  // Safe_PyObjectPtr - any time something steals a reference to this, it must
+  // INCREF.
   // TODO(nareshmodi): figure out why PyList_New/PyList_Append don't work
   // directly.
-  std::unique_ptr<std::vector<PyObject*>> flattened_attrs = nullptr;
-  std::unique_ptr<std::vector<PyObject*>> flattened_inputs = nullptr;
+  std::unique_ptr<std::vector<tensorflow::Safe_PyObjectPtr>> flattened_attrs =
+      nullptr;
+  std::unique_ptr<std::vector<tensorflow::Safe_PyObjectPtr>> flattened_inputs =
+      nullptr;
 
-  if (run_callbacks) {
-    flattened_attrs.reset(new std::vector<PyObject*>);
-    flattened_inputs.reset(new std::vector<PyObject*>);
+  // TODO(nareshmodi): Encapsulate callbacks information into a struct.
+  if (op_exec_info.run_callbacks) {
+    flattened_attrs.reset(new std::vector<tensorflow::Safe_PyObjectPtr>);
+    flattened_inputs.reset(new std::vector<tensorflow::Safe_PyObjectPtr>);
   }
 
   // Add inferred attrs and inputs.
@@ -1809,16 +1965,16 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       Py_ssize_t len = PyList_Size(input);
 
       TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
-      if (run_callbacks) {
-        flattened_attrs->push_back(
+      if (op_exec_info.run_callbacks) {
+        flattened_attrs->emplace_back(
             GetPythonObjectFromString(input_arg.number_attr().data()));
-        flattened_attrs->push_back(PyLong_FromLong(len));
+        flattened_attrs->emplace_back(PyLong_FromLong(len));
       }
       attr_list_sizes[input_arg.number_attr()] = len;
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(PyList_GET_ITEM(input, 0), &input_arg,
+        if (!AddInputToOp(op_exec_info, PyList_GET_ITEM(input, 0), &input_arg,
                           flattened_attrs.get(), flattened_inputs.get(), op,
                           status)) {
           return nullptr;
@@ -1826,7 +1982,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(PyList_GET_ITEM(input, j), nullptr /* input_arg */,
+          if (!AddInputToOp(op_exec_info, PyList_GET_ITEM(input, j),
+                            nullptr /* input_arg */,
                             nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
@@ -1840,12 +1997,20 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       Py_ssize_t len = PyList_Size(input);
       tensorflow::gtl::InlinedVector<TF_DataType, 4> attr_value(len);
       PyObject* py_attr_value = nullptr;
-      if (run_callbacks) {
+      if (op_exec_info.run_callbacks) {
         py_attr_value = PyTuple_New(len);
       }
       for (Py_ssize_t j = 0; j < len; j++) {
         PyObject* py_input = PyList_GET_ITEM(input, j);
-        TFE_TensorHandle* input_handle = EagerTensor_Handle(py_input);
+        tensorflow::Safe_PyObjectPtr py_eager_tensor;
+        if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
+                             status)) {
+          return nullptr;
+        }
+
+        TFE_TensorHandle* input_handle =
+            EagerTensor_Handle(py_eager_tensor.get());
+
         attr_value[j] = TFE_TensorHandleDataType(input_handle);
 
         TFE_OpAddInput(op, input_handle, status);
@@ -1853,22 +2018,23 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
           return nullptr;
         }
 
-        if (run_callbacks) {
-          flattened_inputs->push_back(py_input);
+        if (op_exec_info.run_callbacks) {
+          flattened_inputs->emplace_back(std::move(py_eager_tensor));
 
           PyTuple_SET_ITEM(py_attr_value, j, PyLong_FromLong(attr_value[j]));
         }
       }
-      if (run_callbacks) {
-        flattened_attrs->push_back(GetPythonObjectFromString(attr_name.data()));
-        flattened_attrs->push_back(py_attr_value);
+      if (op_exec_info.run_callbacks) {
+        flattened_attrs->emplace_back(
+            GetPythonObjectFromString(attr_name.data()));
+        flattened_attrs->emplace_back(py_attr_value);
       }
       TFE_OpSetAttrTypeList(op, attr_name.data(), attr_value.data(),
                             attr_value.size());
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
-      if (!AddInputToOp(input, &input_arg, flattened_attrs.get(),
+      if (!AddInputToOp(op_exec_info, input, &input_arg, flattened_attrs.get(),
                         flattened_inputs.get(), op, status)) {
         return nullptr;
       }
@@ -1892,12 +2058,14 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
   Py_BEGIN_ALLOW_THREADS;
   TFE_Execute(op, retvals.data(), &num_retvals, status);
   Py_END_ALLOW_THREADS;
+
   if (TF_GetCode(status) != TF_OK) {
     // Augment the status with the op_name for easier debugging similar to
     // TFE_Py_Execute.
     TF_SetStatus(status, TF_GetCode(status),
-                 tensorflow::strings::StrCat(TF_Message(status), " [Op:",
-                                             TFE_GetPythonString(op_name), "]")
+                 tensorflow::strings::StrCat(
+                     TF_Message(status),
+                     " [Op:", TFE_GetPythonString(op_exec_info.op_name), "]")
                      .c_str());
 
     MaybeRaiseExceptionFromTFStatus(status, nullptr);
@@ -1909,10 +2077,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     PyList_SET_ITEM(flat_result, i, EagerTensorFromHandle(retvals[i]));
   }
 
-  if (run_callbacks &&
-      !RunCallbacks(run_gradient_callback, run_post_exec_callbacks, op_def,
-                    args, *flattened_inputs, *flattened_attrs, flat_result,
-                    op_name, name, callbacks)) {
+  if (op_exec_info.run_callbacks && !RunCallbacks(op_exec_info, args,
+          *flattened_inputs, *flattened_attrs, flat_result)) {
     return nullptr;
   }
 
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 418ed75..46c5601 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 class Tests(test.TestCase):
@@ -55,6 +56,21 @@ class Tests(test.TestCase):
 
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
+  def testFastpathExecute_ResourceVariableMatMulCorrectResponse(self):
+    ctx = context.context()
+    a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
+    m = resource_variable_ops.ResourceVariable(a_2_by_2)
+    x = pywrap_tensorflow.TFE_Py_FastPathExecute(
+        ctx._handle, ctx.device_name, "MatMul", None, None, m, m, "transpose_a",
+        False, "transpose_b", False)
+    y = pywrap_tensorflow.TFE_Py_FastPathExecute(
+        ctx._handle, ctx.device_name, "MatMul", None, None, a_2_by_2, a_2_by_2,
+        "transpose_a", False, "transpose_b", False)
+
+    self.assertAllEqual(x, y)
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
   def testFastpathExecute_TapeWrite(self):
     ctx = context.context()
     with backprop.GradientTape(persistent=True) as tape:
@@ -67,6 +83,21 @@ class Tests(test.TestCase):
     self.assertAllEqual(dz_dy.numpy(),
                         constant_op.constant(4.0, shape=[2, 2]).numpy())
 
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_ResourceVariableTapeWrite(self):
+    ctx = context.context()
+    with backprop.GradientTape(persistent=True) as tape:
+      a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
+      m = resource_variable_ops.ResourceVariable(a_2_by_2)
+      tape.watch(m)
+      z = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "MatMul", None, None, m, m,
+          "transpose_a", False, "transpose_b", False)
+    dz_dy = tape.gradient(z, [m])[0]
+    self.assertAllEqual(dz_dy.numpy(),
+                        constant_op.constant(4.0, shape=[2, 2]).numpy())
+
   # Tests homogeneous list op
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index cbac3c6..6c5d692 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import variable_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import dtypes
@@ -932,6 +933,9 @@ class ResourceVariable(variables.Variable):
                        "Tensor object.")
 
 
+pywrap_tensorflow.TFE_Py_RegisterResourceVariableType(ResourceVariable)
+
+
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
 
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 7ab0db5..b481ddf 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -31,6 +31,7 @@ limitations under the License.
 %rename("%s") TFE_Py_RegisterExceptionClass;
 %rename("%s") TFE_Py_RegisterBackwardFunctionGetter;
 %rename("%s") TFE_Py_RegisterFallbackExceptionClass;
+%rename("%s") TFE_Py_RegisterResourceVariableType;
 %rename("%s") TFE_Py_Execute;
 %rename("%s") TFE_Py_FastPathExecute;
 %rename("%s") TFE_Py_RecordGradient;
-- 
2.7.4