(torch.distributed.elastic) properly format traceback on error (#65041)
authorKiuk Chung <kiuk@fb.com>
Wed, 15 Sep 2021 19:48:28 +0000 (12:48 -0700)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Wed, 15 Sep 2021 19:50:21 +0000 (12:50 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65041

Fixes a bug introduced in https://github.com/pytorch/pytorch/pull/64036 where the traceback of the error handler is printed out rather than the traceback of the actual exception.

Fixes https://github.com/pytorch/pytorch/issues/60910
Closes https://github.com/pytorch/pytorch/issues/60910

BEFORE (note that the `py_callstack` is NOT the traceback of the RuntimeError):
```
**************************************************************************************************************************************************************************************************************************************************
                                                                                                              run_script_path FAILED
==================================================================================================================================================================================================================================================
Root Cause:
[0]:
  time: 2021-09-14_22:01:06
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 1092727)
  error_file: /tmp/torchelastic_aeyvjbpe/none_8zuih7tj/attempt_0/0/error.json
  msg:
    {
      "message": "RuntimeError: rasing error since --throw was specified",
      "extraInfo": {
        "py_callstack": [
          "  File \"<string>\", line 1, in <module>\n",
          "  File \"/usr/local/fbcode/platform009/lib/python3.8/multiprocessing/spawn.py\", line 116, in spawn_main\n    exitcode = _main(fd, parent_sentinel)\n",
          "  File \"/usr/local/fbcode/platform009/lib/python3.8/multiprocessing/spawn.py\", line 129, in _main\n    return self._bootstrap(parent_sentinel)\n",
          "  File \"/usr/local/fbcode/platform009/lib/python3.8/multiprocessing/process.py\", line 315, in _bootstrap\n    self.run()\n",
          "  File \"/usr/local/fbcode/platform009/lib/python3.8/multiprocessing/process.py\", line 108, in run\n    self._target(*self._args, **self._kwargs)\n",
          "  File \"/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/multiprocessing/spawn.py\", line 59, in _wrap\n    fn(i, *args)\n",
          "  File \"/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/elastic/multiprocessing/api.py\", line 382, in _wrap\n    ret = record(fn)(*args_)\n",
          "  File \"/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 373, in wrapper\n    error_handler.record_exception(e)\n",
          "  File \"/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 86, in record_exception\n    _write_error(e, self._get_error_file_path())\n",
          "  File \"/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 26, in _write_error\n    \"py_callstack\": traceback.format_stack(),\n"
        ],
        "timestamp": "1631682066"
      }
    }

==================================================================================================================================================================================================================================================
Other Failures:
  <NO_OTHER_FAILURES>
**************************************************************************************************************************************************************************************************************************************************
```

AFTER (note the traceback is the traceback of the RuntimeError):
```
********************************************************************************
                             run_script_path FAILED
================================================================================
Root Cause:
[0]:
  time: 2021-09-14_21:49:25
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 1014681)
  error_file: /tmp/torchelastic_q0zods2c/none_qwmz5dgj/attempt_0/0/error.json
  msg: Traceback (most recent call last):
    File "/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 361, in wrapper
      return f(*args, **kwargs)
    File "/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/run.py", line 671, in run_script_path
      runpy.run_path(sys.argv[0], run_name="__main__")
    File "/usr/local/fbcode/platform009/lib/python3.8/runpy.py", line 265, in run_path
      return _run_module_code(code, init_globals, run_name,
    File "/usr/local/fbcode/platform009/lib/python3.8/runpy.py", line 97, in _run_module_code
      _run_code(code, mod_globals, init_globals,
    File "/usr/local/fbcode/platform009/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/kiuk/tmp/test.py", line 55, in <module>
      main()
    File "/data/users/kiuk/fbsource/fbcode/buck-out/dev/gen/caffe2/run#link-tree/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 361, in wrapper
      return f(*args, **kwargs)
    File "/home/kiuk/tmp/test.py", line 25, in main
      raise RuntimeError("rasing error since --throw was specified")
  RuntimeError: rasing error since --throw was specified

================================================================================
Other Failures:
  <NO_OTHER_FAILURES>
********************************************************************************
```

Test Plan:
(see summary for before and after)

`test.py` contents:
```
import argparse
import os
import sys

import torch
import torch.distributed as dist
import torch.nn.functional as F

from torch.distributed.elastic.multiprocessing.errors import record

def parse_args(argv):
    parser = argparse.ArgumentParser(description="test script")
    parser.add_argument("--init_method", type=str, default="env://")
    parser.add_argument("--backend", type=str, default="gloo")
    parser.add_argument("--throw", action="store_true", default=False)
    parser.add_argument("--exit", action="store_true", default=False)
    return parser.parse_args()

@record
def main():
    args = parse_args(sys.argv[1:])

    if args.throw:
        raise RuntimeError("rasing error since --throw was specified")

    if args.exit:
        sys.exit(1)

    init_method=args.init_method
    backend=args.backend

    world_size = int(os.environ["WORLD_SIZE"])
    rank = int(os.environ["RANK"])

    print(f"initializing `{backend}` process group with rank={rank}, world_size={world_size} at {init_method}")

    dist.init_process_group(
        backend=backend,
        init_method=init_method,
        world_size=world_size,
        rank=rank)

    print(f"successfully initialized process group with rank={dist.get_rank()}, world_size={dist.get_world_size()}")

    t = F.one_hot(torch.tensor(rank), num_classes=world_size)
    dist.all_reduce(t)
    derived_world_size = torch.sum(t).item()
    if derived_world_size != world_size:
        raise RuntimeError(f"derived world size: {derived_world_size} != actual world size: {world_size}")
    else:
        print(f"sucessfully derived world size: {derived_world_size} (expected: {world_size}). Exiting")

if __name__ == "__main__":
    main()
```

run it as:

```
$ python -m torch.distributed.run --nproc_per_node 2 test.py --throw
```

Reviewed By: cbalioglu

Differential Revision: D30953731

fbshipit-source-id: bbea04c59c2aec58969cf44d8e3723d5f8abe8a8

torch/distributed/elastic/multiprocessing/errors/__init__.py
torch/distributed/elastic/multiprocessing/errors/error_handler.py

index ab0e0f3..c1be1e0 100644 (file)
@@ -57,13 +57,14 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from functools import wraps
 from string import Template
-from typing import Callable, Dict, List, Optional, Tuple, TypeVar, Any
+from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
 
 from torch.distributed.elastic.utils.logging import get_logger
 
 from .error_handler import ErrorHandler  # noqa: F401
 from .handlers import get_error_handler  # noqa: F401
 
+
 log = get_logger()
 
 
@@ -245,7 +246,7 @@ class ChildFailedError(Exception):
                 other_failures_fmt.append(fmt)
 
         # upper boundary on width
-        width = min(width, 250)
+        width = min(width, 80)
 
         return Template(_MSG_FORMAT_TEMPLATE).substitute(
             boarder=boarder_delim * width,
@@ -258,18 +259,22 @@ class ChildFailedError(Exception):
     def _format_failure(
         self, idx: int, rank: int, failure: ProcessFailure
     ) -> Tuple[str, int]:
-        if isinstance(failure.message, str):
-            msg = '"' + failure.message + '"'
-        else:
-            try:
-                dmp = json.dumps(failure.message, indent=2)
-            except ValueError:
-                msg = failure.message
-            else:
-                msg = os.linesep
-                # Indent by 4 chars.
-                for l in dmp.splitlines():
-                    msg += f"    {l}{os.linesep}"
+
+        # failure.message is either a str (when the failure does not generate a traceback - e.g. signals)
+        # or a dict (json) of the form
+        # {"message": $ERROR_MSG, "extraInfo": {"py_callstack": $TRACEBACK, timestamp: $TS}}
+        # so the display logic is:
+        # 1. if failure.message is not a dict (it is a str) just show it as is
+        # 2. else try to get the traceback (py_callstack)
+        # 3.      if the traceback is not there, use the message
+        # 4.      if the message  is not there show <N/A>
+        msg = failure.message
+        if isinstance(failure.message, dict):
+            msg = (
+                failure.message.get("extraInfo", {})
+                .get("py_callstack", failure.message.get("message", "<N/A>"))
+                .replace("\n", "\n  ")  # to properly indent the traceback
+            )
 
         fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute(
             idx=idx,
index 2974355..74586e9 100644 (file)
@@ -23,7 +23,7 @@ def _write_error(e: BaseException, error_file: Optional[str]):
         "message": {
             "message": f"{type(e).__name__}: {e}",
             "extraInfo": {
-                "py_callstack": traceback.format_stack(),
+                "py_callstack": traceback.format_exc(),
                 "timestamp": str(int(time.time())),
             },
         }