[DDP] Log num threads (#64072)

author Rohan Varma <rvarm1@fb.com>

Thu, 2 Sep 2021 01:12:02 +0000 (18:12 -0700)

committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>

Thu, 2 Sep 2021 01:36:15 +0000 (18:36 -0700)
author Rohan Varma <rvarm1@fb.com>
Thu, 2 Sep 2021 01:12:02 +0000 (18:12 -0700)
committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Thu, 2 Sep 2021 01:36:15 +0000 (18:36 -0700)
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp

index 036ce91..5c0c76a 100644 (file)
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
@@ -318,6 +318,12 @@ class TORCH_API ProcessGroupGloo : public ProcessGroup {
    // may indicate that there is some sort of collective desynchronization.
    uint64_t getSequenceNumberForGroup() override;
  
+  // Returns the `threads` value stored in this process group's options.
+  // Const-qualified: the accessor only reads `options_`.
+  int getNumThreads() const {
+    return options_->threads;
+  }
+
   protected:
    std::unique_ptr<::gloo::rendezvous::Store> store_;
    const c10::intrusive_ptr<Options> options_;
diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp

index 0bb960a..b1efd0b 100644 (file)
--- a/torch/csrc/distributed/c10d/logger.cpp
+++ b/torch/csrc/distributed/c10d/logger.cpp
@@ -4,6 +4,10 @@
  #include <fmt/format.h>
  #include <string>
  
+#ifdef USE_C10D_GLOO
+#include <c10d/ProcessGroupGloo.hpp>
+#endif
+
  namespace c10d {
  
  // When training runs at these iterations, log the runtime
@@ -68,6 +72,13 @@ void Logger::set_env_variables() {
          parse_env("GLOO_SOCKET_IFNAME");
      ddp_logging_data_->strs_map["gloo_device_transport"] =
          parse_env("GLOO_DEVICE_TRANSPORT");
+
+    #ifdef USE_C10D_GLOO
+    auto gloo_pg =
+        static_cast<c10d::ProcessGroupGloo*>(reducer_->process_group_.get());
+    auto n_threads = gloo_pg->getNumThreads();
+    ddp_logging_data_->ints_map["gloo_num_threads"] = n_threads;
+    #endif
    }
  }
  
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py

index f17842e..613e23e 100644 (file)
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -5074,6 +5074,12 @@ class DistributedTest:
                      ddp_logging_data.get("gloo_device_transport"),
                      parse_env("GLOO_DEVICE_TRANSPORT"),
                  )
+                default_gloo_threads = 2
+                self.assertEqual(
+                    ddp_logging_data.get("gloo_num_threads"),
+                    default_gloo_threads,
+                )
+
              self.assertEqual(ddp_logging_data.get("nccl_socket_ifname"), None)
              self.assertEqual(ddp_logging_data.get("nccl_blocking_wait"), None)
              self.assertEqual(ddp_logging_data.get("nccl_async_error_handling"), None)
author	Rohan Varma <rvarm1@fb.com>
	Thu, 2 Sep 2021 01:12:02 +0000 (18:12 -0700)
committer	Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
	Thu, 2 Sep 2021 01:36:15 +0000 (18:36 -0700)
torch/csrc/distributed/c10d/ProcessGroupGloo.hpp		patch \| blob \| history
torch/csrc/distributed/c10d/logger.cpp		patch \| blob \| history
torch/testing/_internal/distributed/distributed_test.py		patch \| blob \| history