add elastic zeus handler (#16746)
author Jane Wang <janewang@fb.com>
Wed, 27 Feb 2019 19:26:40 +0000 (11:26 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Wed, 27 Feb 2019 19:29:59 +0000 (11:29 -0800)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16746

As titled. We use a special URL scheme, elasticzeus, for elastic zeus so that we don't need to change the public interface of init_process_group.

Reviewed By: aazzolini, soumith

Differential Revision: D13948151

fbshipit-source-id: 88939dcfa0ad93467dabedad6905ec32e6ec60e6

torch/distributed/distributed_c10d.py

index 3006c61..ace6f82 100644 (file)
@@ -303,6 +303,8 @@ def init_process_group(backend,
         world_size (int, optional): Number of processes participating in
                                     the job.
         rank (int, optional): Rank of the current process.
+        store (Store, optional): Rendezvous key/value store as an alternative
+                                to other init methods.
         timeout (timedelta, optional): Timeout for operations executed against
             the process group. Default value equals 30 minutes.
             This is only applicable for the ``gloo`` backend.
@@ -329,6 +331,10 @@ def init_process_group(backend,
     world_size = kwargs.pop('world_size', -1)
     group_name = kwargs.pop('group_name', '')
     rank = kwargs.pop('rank', -1)
+    store = kwargs.pop('store', None)
+    if store is not None:
+        assert world_size > 0, 'world_size needs to be positive'
+        assert rank >= 0, 'rank needs to be non-negative'
     assert len(kwargs) == 0, \
         "got unexpected keyword arguments: %s" % ",".join(kwargs.keys())
 
@@ -351,7 +357,8 @@ def init_process_group(backend,
         elif world_size != -1:
             url += "?world_size={}".format(world_size)
 
-        store, rank, world_size = next(rendezvous(url))
+        if store is None:
+            store, rank, world_size = next(rendezvous(url))
         if backend == Backend.GLOO:
             _default_pg = ProcessGroupGloo(
                 store,