From f0f5cffde9196dd5ef46a6d6ba17975ea995c3ca Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 18 Aug 2021 11:38:11 -0700 Subject: [PATCH] [DDP] Add a debug check in cpp fp16 compress (#63379) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63379 This codepath has been prone to bugs, as seen in the diff below. This check will help guard against changes/refactors that touch this codepath, as a basic sanity check. It is enabled only in debug builds so as not to affect performance. ghstack-source-id: 136056093 Test Plan: CI Reviewed By: SciPioneer Differential Revision: D30358440 fbshipit-source-id: e1b3893a223722c2593ceed8696a09c7d07d47c1 --- torch/csrc/distributed/c10d/default_comm_hooks.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp index 91700baa2e..30bc96b16f 100644 --- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp +++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -31,6 +33,11 @@ c10::intrusive_ptr FP16CompressCommHook::runHook( "ProcessGroup::allreduce should return TensorList"); auto reduce_tensor = result.toTensorVector()[0]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + reduce_tensor.scalar_type() == at::ScalarType::Half, + "Expected reduced tensor to be fp16 in FP16CompressHook, but got type ", + reduce_tensor.scalar_type() + ); decompressed_tensor.copy_(reduce_tensor); return c10::IValue(decompressed_tensor); }; -- 2.34.1