From 1e42720a77109f68ca04d44d306e2d3040e10e45 Mon Sep 17 00:00:00 2001
From: Elliot Waite <1767836+elliotwaite@users.noreply.github.com>
Date: Wed, 13 Mar 2019 09:18:34 -0700
Subject: [PATCH] Fix some typos in distributed.py.

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17959

Differential Revision: D14437347

Pulled By: soumith

fbshipit-source-id: 4c33571f56e9da687666516a310f91924cddd4d9
---
 torch/nn/parallel/distributed.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index a33f321..d02f88d 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -44,7 +44,7 @@ class DistributedDataParallel(Module):
     this way, you can simply construct the model as the following:
 
         >>> torch.distributed.init_process_group(backend="nccl")
-        >>> model = DistributedDataParallel(model) # device_ids will include all GPU devices be default
+        >>> model = DistributedDataParallel(model) # device_ids will include all GPU devices by default
 
     (2) Multi-Process Single-GPU
 
@@ -102,9 +102,9 @@ class DistributedDataParallel(Module):
         This module assumes all parameters are registered in the model of
         each distributed processes are in the same order. The module itself
         will conduct gradient all-reduction following the reverse order of
-        the registered parameters of the model. In other wise, it is users'
+        the registered parameters of the model. In other words, it is users'
         responsibility to ensure that each distributed process has the exact
-        same model and thus the exact parameter registeration order.
+        same model and thus the exact same parameter registration order.
 
     .. warning::
         This module assumes all buffers and gradients are dense.
@@ -169,7 +169,7 @@ class DistributedDataParallel(Module):
                          You normally don't need this option enabled unless you
                          are observing weird behaviors such as different ranks
                          are getting different gradients, which should not
-                         happen if DistributedDataParallel is corrected used.
+                         happen if DistributedDataParallel is correctly used.
                          (default: ``False``)
 
     Attributes:
@@ -352,8 +352,8 @@ class DistributedDataParallel(Module):
             if not self.all_buckets_reduced:
                 raise RuntimeError("Not all gradients have been reduced from "
                                    "the backward of the previous iteration. "
-                                   "This is unexpected and fatal error. Please "
-                                   "check and ensure that the model's "
+                                   "This is an unexpected and fatal error. "
+                                   "Please check and ensure that the model's "
                                    "parameters are not changed after you wrap "
                                    "up the model with DistributedDataParallel.")
             self.all_buckets_reduced = False
-- 
2.7.4
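
For reference, below is a minimal sketch of the single-process multi-GPU usage that the corrected docstring describes, written against the PyTorch API of this patch's era. Only the init_process_group(backend="nccl") and DistributedDataParallel(model) calls come from the docstring itself; the toy model, the env:// rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE), and the single-node defaults are illustrative assumptions, not part of the patch.

    # Sketch only: assumes a single node with at least one CUDA GPU and the
    # NCCL backend available. The rendezvous defaults below are for illustration.
    import os

    import torch
    import torch.distributed as dist
    import torch.nn as nn
    from torch.nn.parallel import DistributedDataParallel


    def main():
        # Assume the launcher (or these defaults) provides the standard
        # environment-variable rendezvous used by init_process_group.
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")
        os.environ.setdefault("RANK", "0")
        os.environ.setdefault("WORLD_SIZE", "1")

        # As in the docstring example: initialize the process group with the
        # NCCL backend before wrapping the model.
        dist.init_process_group(backend="nccl")

        # A hypothetical model; any nn.Module works. Every process must build
        # it identically so the parameter registration order matches across
        # processes (the requirement clarified by this patch).
        model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 5)).cuda()

        # In the PyTorch version this patch targets, device_ids includes all
        # visible GPU devices by default when omitted.
        model = DistributedDataParallel(model)

        # Gradients are all-reduced across processes during backward().
        out = model(torch.randn(20, 10).cuda())
        out.sum().backward()

        dist.destroy_process_group()


    if __name__ == "__main__":
        main()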