[DDP] Add a debug check in cpp fp16 compress (#63379)
authorRohan Varma <rvarm1@fb.com>
Wed, 18 Aug 2021 18:38:11 +0000 (11:38 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Wed, 18 Aug 2021 18:51:19 +0000 (11:51 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63379

This codepath has been prone to bugs, as seen in the diff below. This assertion
will help guard against changes/refactors that touch this code, serving as a basic
sanity check. It is enabled only in debug builds so as not to affect performance.
ghstack-source-id: 136056093

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30358440

fbshipit-source-id: e1b3893a223722c2593ceed8696a09c7d07d47c1

torch/csrc/distributed/c10d/default_comm_hooks.cpp

index 91700baa2e4a584754e3eb15eeb51c774a8d30c6..30bc96b16f7db9df987cb3dd3bad47fcc8e9b0aa 100644 (file)
@@ -1,4 +1,6 @@
 #include <c10d/default_comm_hooks.hpp>
+#include <c10/core/ScalarType.h>
+#include <c10/util/Exception.h>
 
 #include <c10d/ProcessGroup.hpp>
 #include <c10d/comm.hpp>
@@ -31,6 +33,11 @@ c10::intrusive_ptr<c10::ivalue::Future> FP16CompressCommHook::runHook(
         "ProcessGroup::allreduce should return TensorList");
 
     auto reduce_tensor = result.toTensorVector()[0];
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      reduce_tensor.scalar_type() == at::ScalarType::Half,
+      "Expected reduced tensor to be fp16 in FP16CompressHook, but got type ",
+      reduce_tensor.scalar_type()
+    );
     decompressed_tensor.copy_(reduce_tensor);
     return c10::IValue(decompressed_tensor);
   };