From 168c0797c45c1a26b0612c8496fe4ad112aeabfd Mon Sep 17 00:00:00 2001
From: Shen Li
Date: Tue, 9 Apr 2019 16:11:05 -0700
Subject: [PATCH] Remind users to set map_location properly when using DDP

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/19084

Differential Revision: D14861702

Pulled By: mrshenli

fbshipit-source-id: 10ca4a9b41e707050a6bce228ccca4177c9fa4a6
---
 torch/nn/parallel/distributed.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 35a9695..e4878f9 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -83,6 +83,12 @@ class DistributedDataParallel(Module):
         Also note that ``nccl`` backend is currently the fastest and
         highly recommended backend for fp16/fp32 mixed-precision training.
 
+    .. note:: If you use ``torch.save`` on one process to checkpoint the module,
+        and ``torch.load`` on some other processes to recover it, make sure that
+        ``map_location`` is configured properly for every process. Without
+        ``map_location``, ``torch.load`` would recover the module to the
+        devices it was saved from.
+
     .. warning::
         This module works only with the ``gloo`` and ``nccl`` backends.
 
-- 
2.7.4
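
A minimal sketch (not part of the patch) of the checkpoint/restore pattern the
added note describes. It assumes one GPU per process, an already initialized
process group, and a hypothetical checkpoint path; the model and path names are
placeholders for illustration only.

    import torch
    import torch.distributed as dist
    import torch.nn as nn
    from torch.nn.parallel import DistributedDataParallel as DDP

    def checkpoint_and_restore(rank, world_size):
        # Assumes dist.init_process_group() was already called and that
        # process `rank` owns GPU `rank`.
        model = nn.Linear(10, 10).to(rank)
        ddp_model = DDP(model, device_ids=[rank])

        ckpt_path = "/tmp/ddp_checkpoint.pt"  # hypothetical path
        if rank == 0:
            # Checkpoint from a single process; tensors keep their cuda:0 device tags.
            torch.save(ddp_model.state_dict(), ckpt_path)

        # Make sure rank 0 has finished writing before other ranks read the file.
        dist.barrier()

        # Remap tensors that were saved on cuda:0 to this process's own device.
        # Without map_location, every rank would load the tensors onto cuda:0.
        map_location = {"cuda:0": "cuda:%d" % rank}
        ddp_model.load_state_dict(torch.load(ckpt_path, map_location=map_location))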