Improve numerical gradient check (#3856)

author SWu <SWu@users.noreply.github.com>

Sat, 31 Aug 2019 06:25:16 +0000 (02:25 -0400)

committer Wuwei Lin <wuwei@apache.org>

Sat, 31 Aug 2019 06:25:16 +0000 (02:25 -0400)
author SWu <SWu@users.noreply.github.com>
Sat, 31 Aug 2019 06:25:16 +0000 (02:25 -0400)
committer Wuwei Lin <wuwei@apache.org>
Sat, 31 Aug 2019 06:25:16 +0000 (02:25 -0400)
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py

index 6a75921..481f35e 100644 (file)
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -56,72 +56,71 @@ def run_infer_type(expr):
      return run_opt_pass(expr, transform.InferType())
  
  
-def rand_from_type(t):
-    return relay.Constant(rand(t.dtype, *[int(d) for d in t.shape]))
+def _np_randn_from_type(t, scale=1):
+    return (scale * np.random.randn(*(int(d) for d in t.shape))).astype(t.dtype)
  
  
-CHECK_GRAD_COUNTER = 0
-def check_grad(func, mod=None):
-    """
-    Test that directional gradient calculated by reverse mode
-    is close to the one calculated by finite difference.
+def check_grad(func, inputs=None, eps=1e-6, atol=1e-5, rtol=1e-3):
+    """Perform numerical gradient checking given a relay function.
+
+    Compare analytical gradients to numerical gradients derived from two-sided approximation. Note
+    that this test may fail if your function input types are not of high enough precision.
+
+    Parameters
+    ----------
+    func : tvm.relay.Function
+        The relay function to test.
+
+    inputs: List[np.array]
+        Optional user-provided input parameters to use. If not given, will generate random normal
+        inputs scaled to be close to the chosen epsilon value to avoid numerical precision loss.
+
+    eps: float
+        The epsilon value to use for computing numerical gradient approximation.
+
+    atol: float
+        The absolute tolerance on difference between numerical and analytical gradients. Note that
+        this needs to be scaled appropriately relative to the chosen eps and inputs.
+
+    rtol: float
+        The relative tolerance on difference between numerical and analytical gradients. Note that
+        this needs to be scaled appropriately relative to the chosen eps.
+
      """
-    global CHECK_GRAD_COUNTER
-    if mod is None:
-        mod = relay.Module()
-    def make(name):
-        return GlobalVar(name + str(CHECK_GRAD_COUNTER))
-    func_name = make("func_")
-    back_func_name = make("back_func_")
-    finite_difference_func_name = make("finite_difference_")
-    reverse_mode_func_name = make("reverse_mode_")
-    check_func_name = make("check_func_")
-    CHECK_GRAD_COUNTER = CHECK_GRAD_COUNTER + 1
-    epsilon = relay.const(0.01)
-    mod[func_name] = func
-    mod[back_func_name] = gradient(mod[func_name], mod=mod)
-    params = mod[func_name].params
-    directions = [rand_from_type(x.checked_type) for x in params]
-    ft = TensorType(())
-    sb = ScopeBuilder()
-    def get_reverse_mode_result(e, d, t):
-        assert isinstance(t, TensorType)
-        return op.cast(e * d, 'float32')
-    bf = sb.let("bf", TupleGetItem(back_func_name(*params), 1))
-    reverse_mode_results = [get_reverse_mode_result(TupleGetItem(bf, i),
-                                                    directions[i],
-                                                    x.checked_type)
-                            for i, x in enumerate(params)]
-    reverse_mode_result = relay.const(0.0)
-    for x in reverse_mode_results:
-        reverse_mode_result = reverse_mode_result + op.reduce.sum(x)
-    sb.ret(reverse_mode_result)
-    reverse_mode_result = sb.get()
-    mod[reverse_mode_func_name] = Function(params,
-                                           reverse_mode_result,
-                                           ft,
-                                           mod[func_name].type_params,
-                                           mod[func_name].attrs)
-    finite_difference_result = op.reduce.sum((func_name(*[x + epsilon * y for x, y in
-                                                          zip(params, directions)]) -
-                                              func_name(*params)) /
-                                             epsilon)
-
-    mod[finite_difference_func_name] = Function(params,
-                                                finite_difference_result,
-                                                ft,
-                                                mod[func_name].type_params,
-                                                mod[func_name].attrs)
-    check_func_result = op.abs(reverse_mode_func_name(*params) -
-                               finite_difference_func_name(*params))
-    mod[check_func_name] = Function(params,
-                                    check_func_result,
-                                    ft,
-                                    mod[func_name].type_params,
-                                    mod[func_name].attrs)
-    ex = create_executor(mod=mod)
-    res = ex.evaluate(check_func_name(*[rand_from_type(x.checked_type) for x in params]))
-    assert res.data.asnumpy() < 0.001
+
+    fwd_func = run_infer_type(func)
+    bwd_func = run_infer_type(gradient(fwd_func))
+
+    if inputs is None:
+        params = fwd_func.params
+        # Generate random inputs on the same scale as epsilon to avoid numerical precision loss.
+        inputs = [_np_randn_from_type(x.checked_type, scale=(10 * eps)) for x in params]
+
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor(ctx=ctx, target=target)
+
+        # Get analytic gradients.
+        _, grads = intrp.evaluate(bwd_func)(*inputs)
+        grads = [grad.asnumpy().astype("float64") for grad in grads]
+
+        # Get numeric gradients for each dimension of each param, using two-sided approximation.
+        approx_grads = []
+        for x in inputs:
+            approx_grad = np.zeros(x.shape)
+            for i in np.ndindex(*x.shape):
+                x_i = x[i]
+                x[i] = x_i + eps
+                fwd_plus = intrp.evaluate(fwd_func)(*inputs).asnumpy().astype("float64")
+                x[i] = x_i - eps
+                fwd_minus = intrp.evaluate(fwd_func)(*inputs).asnumpy().astype("float64")
+                x[i] = x_i
+                approx_grad[i] = np.sum((fwd_plus - fwd_minus) / (2 * eps))
+            approx_grads.append(approx_grad)
+
+        # Compare gradients by checking that relative difference is below tolerance.
+        for grad, approx_grad in zip(grads, approx_grads):
+            np.testing.assert_allclose(grad, approx_grad, atol=atol, rtol=rtol)
+
  
  def rand(dtype, *shape):
      return tvm.nd.array(np.random.rand(*shape).astype(dtype))
author	SWu <SWu@users.noreply.github.com>
	Sat, 31 Aug 2019 06:25:16 +0000 (02:25 -0400)
committer	Wuwei Lin <wuwei@apache.org>
	Sat, 31 Aug 2019 06:25:16 +0000 (02:25 -0400)