From: Samuel Antao <sfantao@us.ibm.com>
Date: Fri, 15 Jul 2016 23:13:27 +0000 (+0000)
Subject: [CUDA][OpenMP] Create generic offload action
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d06239d359df71ee594c1b305fe71af59793abc2;p=platform%2Fupstream%2Fllvm.git

[CUDA][OpenMP] Create generic offload action

Summary:
This patch replaces the CUDA specific action by a generic offload action. The offload action may have multiple dependences classier in “host” and “device”. The way this generic offloading action is used is very similar to what is done today by the CUDA implementation: it is used to set a specific toolchain and architecture to its dependences during the generation of jobs.

This patch also proposes propagating the offloading information through the action graph so that that information can be easily retrieved at any time during the generation of commands. This allows e.g. the "clang tool” to evaluate whether CUDA should be supported for the device or host and ptas to easily retrieve the target architecture.

This is an example of how the action graphs would look like (compilation of a single CUDA file with two GPU architectures)
```
0: input, "cudatests.cu", cuda, (host-cuda)
1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
2: compiler, {1}, ir, (host-cuda)
3: input, "cudatests.cu", cuda, (device-cuda, sm_35)
4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_35)
5: compiler, {4}, ir, (device-cuda, sm_35)
6: backend, {5}, assembler, (device-cuda, sm_35)
7: assembler, {6}, object, (device-cuda, sm_35)
8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {7}, object
9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {6}, assembler
10: input, "cudatests.cu", cuda, (device-cuda, sm_37)
11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_37)
12: compiler, {11}, ir, (device-cuda, sm_37)
13: backend, {12}, assembler, (device-cuda, sm_37)
14: assembler, {13}, object, (device-cuda, sm_37)
15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_37)" {14}, object
16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_37)" {13}, assembler
17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
18: offload, "host-cuda (powerpc64le-unknown-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
19: backend, {18}, assembler
20: assembler, {19}, object
21: input, "cuda", object
22: input, "cudart", object
23: linker, {20, 21, 22}, image
```
The changes in this patch pass the existent regression tests (keeps the existent functionality) and resulting binaries execute correctly in a Power8+K40 machine.

Reviewers: echristo, hfinkel, jlebar, ABataev, tra

Subscribers: guansong, andreybokhanko, tcramer, mkuron, cfe-commits, arpith-jacob, carlo.bertolli, caomhin

Differential Revision: https://reviews.llvm.org/D18171

llvm-svn: 275645
---

diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index c54a2a4..0c38b3a 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -13,6 +13,7 @@
 #include "clang/Basic/Cuda.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 
 namespace llvm {
@@ -27,6 +28,8 @@ namespace opt {
 namespace clang {
 namespace driver {
 
+class ToolChain;
+
 /// Action - Represent an abstract compilation step to perform.
 ///
 /// An action represents an edge in the compilation graph; typically
@@ -50,8 +53,7 @@ public:
   enum ActionClass {
     InputClass = 0,
     BindArchClass,
-    CudaDeviceClass,
-    CudaHostClass,
+    OffloadClass,
     PreprocessJobClass,
     PrecompileJobClass,
     AnalyzeJobClass,
@@ -65,17 +67,13 @@ public:
     VerifyDebugInfoJobClass,
     VerifyPCHJobClass,
 
-    JobClassFirst=PreprocessJobClass,
-    JobClassLast=VerifyPCHJobClass
+    JobClassFirst = PreprocessJobClass,
+    JobClassLast = VerifyPCHJobClass
   };
 
   // The offloading kind determines if this action is binded to a particular
   // programming model. Each entry reserves one bit. We also have a special kind
   // to designate the host offloading tool chain.
-  //
-  // FIXME: This is currently used to indicate that tool chains are used in a
-  // given programming, but will be used here as well once a generic offloading
-  // action is implemented.
   enum OffloadKind {
     OFK_None = 0x00,
     // The host offloading tool chain.
@@ -95,6 +93,19 @@ private:
   ActionList Inputs;
 
 protected:
+  ///
+  /// Offload information.
+  ///
+
+  /// The host offloading kind - a combination of kinds encoded in a mask.
+  /// Multiple programming models may be supported simultaneously by the same
+  /// host.
+  unsigned ActiveOffloadKindMask = 0u;
+  /// Offloading kind of the device.
+  OffloadKind OffloadingDeviceKind = OFK_None;
+  /// The Offloading architecture associated with this action.
+  const char *OffloadingArch = nullptr;
+
   Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {}
   Action(ActionClass Kind, Action *Input, types::ID Type)
       : Action(Kind, ActionList({Input}), Type) {}
@@ -124,6 +135,40 @@ public:
   input_const_range inputs() const {
     return input_const_range(input_begin(), input_end());
   }
+
+  /// Return a string containing the offload kind of the action.
+  std::string getOffloadingKindPrefix() const;
+  /// Return a string that can be used as prefix in order to generate unique
+  /// files for each offloading kind.
+  std::string getOffloadingFileNamePrefix(StringRef NormalizedTriple) const;
+
+  /// Set the device offload info of this action and propagate it to its
+  /// dependences.
+  void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch);
+  /// Append the host offload info of this action and propagate it to its
+  /// dependences.
+  void propagateHostOffloadInfo(unsigned OKinds, const char *OArch);
+  /// Set the offload info of this action to be the same as the provided action,
+  /// and propagate it to its dependences.
+  void propagateOffloadInfo(const Action *A);
+
+  unsigned getOffloadingHostActiveKinds() const {
+    return ActiveOffloadKindMask;
+  }
+  OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; }
+  const char *getOffloadingArch() const { return OffloadingArch; }
+
+  /// Check if this action have any offload kinds. Note that host offload kinds
+  /// are only set if the action is a dependence to a host offload action.
+  bool isHostOffloading(OffloadKind OKind) const {
+    return ActiveOffloadKindMask & OKind;
+  }
+  bool isDeviceOffloading(OffloadKind OKind) const {
+    return OffloadingDeviceKind == OKind;
+  }
+  bool isOffloading(OffloadKind OKind) const {
+    return isHostOffloading(OKind) || isDeviceOffloading(OKind);
+  }
 };
 
 class InputAction : public Action {
@@ -156,39 +201,126 @@ public:
   }
 };
 
-class CudaDeviceAction : public Action {
+/// An offload action combines host or/and device actions according to the
+/// programming model implementation needs and propagates the offloading kind to
+/// its dependences.
+class OffloadAction final : public Action {
   virtual void anchor();
 
-  const CudaArch GpuArch;
+public:
+  /// Type used to communicate device actions. It associates bound architecture,
+  /// toolchain, and offload kind to each action.
+  class DeviceDependences final {
+  public:
+    typedef SmallVector<const ToolChain *, 3> ToolChainList;
+    typedef SmallVector<const char *, 3> BoundArchList;
+    typedef SmallVector<OffloadKind, 3> OffloadKindList;
+
+  private:
+    // Lists that keep the information for each dependency. All the lists are
+    // meant to be updated in sync. We are adopting separate lists instead of a
+    // list of structs, because that simplifies forwarding the actions list to
+    // initialize the inputs of the base Action class.
+
+    /// The dependence actions.
+    ActionList DeviceActions;
+    /// The offloading toolchains that should be used with the action.
+    ToolChainList DeviceToolChains;
+    /// The architectures that should be used with this action.
+    BoundArchList DeviceBoundArchs;
+    /// The offload kind of each dependence.
+    OffloadKindList DeviceOffloadKinds;
+
+  public:
+    /// Add a action along with the associated toolchain, bound arch, and
+    /// offload kind.
+    void add(Action &A, const ToolChain &TC, const char *BoundArch,
+             OffloadKind OKind);
+
+    /// Get each of the individual arrays.
+    const ActionList &getActions() const { return DeviceActions; };
+    const ToolChainList &getToolChains() const { return DeviceToolChains; };
+    const BoundArchList &getBoundArchs() const { return DeviceBoundArchs; };
+    const OffloadKindList &getOffloadKinds() const {
+      return DeviceOffloadKinds;
+    };
+  };
+
+  /// Type used to communicate host actions. It associates bound architecture,
+  /// toolchain, and offload kinds to the host action.
+  class HostDependence final {
+    /// The dependence action.
+    Action &HostAction;
+    /// The offloading toolchain that should be used with the action.
+    const ToolChain &HostToolChain;
+    /// The architectures that should be used with this action.
+    const char *HostBoundArch = nullptr;
+    /// The offload kind of each dependence.
+    unsigned HostOffloadKinds = 0u;
+
+  public:
+    HostDependence(Action &A, const ToolChain &TC, const char *BoundArch,
+                   const unsigned OffloadKinds)
+        : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch),
+          HostOffloadKinds(OffloadKinds){};
+    /// Constructor version that obtains the offload kinds from the device
+    /// dependencies.
+    HostDependence(Action &A, const ToolChain &TC, const char *BoundArch,
+                   const DeviceDependences &DDeps);
+    Action *getAction() const { return &HostAction; };
+    const ToolChain *getToolChain() const { return &HostToolChain; };
+    const char *getBoundArch() const { return HostBoundArch; };
+    unsigned getOffloadKinds() const { return HostOffloadKinds; };
+  };
+
+  typedef llvm::function_ref<void(Action *, const ToolChain *, const char *)>
+      OffloadActionWorkTy;
+
+private:
+  /// The host offloading toolchain that should be used with the action.
+  const ToolChain *HostTC = nullptr;
 
-  /// True when action results are not consumed by the host action (e.g when
-  /// -fsyntax-only or --cuda-device-only options are used).
-  bool AtTopLevel;
+  /// The tool chains associated with the list of actions.
+  DeviceDependences::ToolChainList DevToolChains;
 
 public:
-  CudaDeviceAction(Action *Input, CudaArch Arch, bool AtTopLevel);
+  OffloadAction(const HostDependence &HDep);
+  OffloadAction(const DeviceDependences &DDeps, types::ID Ty);
+  OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps);
 
-  /// Get the CUDA GPU architecture to which this Action corresponds.  Returns
-  /// UNKNOWN if this Action corresponds to multiple architectures.
-  CudaArch getGpuArch() const { return GpuArch; }
+  /// Execute the work specified in \a Work on the host dependence.
+  void doOnHostDependence(const OffloadActionWorkTy &Work) const;
 
-  bool isAtTopLevel() const { return AtTopLevel; }
+  /// Execute the work specified in \a Work on each device dependence.
+  void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const;
 
-  static bool classof(const Action *A) {
-    return A->getKind() == CudaDeviceClass;
-  }
-};
+  /// Execute the work specified in \a Work on each dependence.
+  void doOnEachDependence(const OffloadActionWorkTy &Work) const;
 
-class CudaHostAction : public Action {
-  virtual void anchor();
-  ActionList DeviceActions;
+  /// Execute the work specified in \a Work on each host or device dependence if
+  /// \a IsHostDependenceto is true or false, respectively.
+  void doOnEachDependence(bool IsHostDependence,
+                          const OffloadActionWorkTy &Work) const;
 
-public:
-  CudaHostAction(Action *Input, const ActionList &DeviceActions);
+  /// Return true if the action has a host dependence.
+  bool hasHostDependence() const;
+
+  /// Return the host dependence of this action. This function is only expected
+  /// to be called if the host dependence exists.
+  Action *getHostDependence() const;
+
+  /// Return true if the action has a single device dependence. If \a
+  /// DoNotConsiderHostActions is set, ignore the host dependence, if any, while
+  /// accounting for the number of dependences.
+  bool hasSingleDeviceDependence(bool DoNotConsiderHostActions = false) const;
 
-  const ActionList &getDeviceActions() const { return DeviceActions; }
+  /// Return the single device dependence of this action. This function is only
+  /// expected to be called if a single device dependence exists. If \a
+  /// DoNotConsiderHostActions is set, a host dependence is allowed.
+  Action *
+  getSingleDeviceDependence(bool DoNotConsiderHostActions = false) const;
 
-  static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
+  static bool classof(const Action *A) { return A->getKind() == OffloadClass; }
 };
 
 class JobAction : public Action {
diff --git a/clang/include/clang/Driver/Compilation.h b/clang/include/clang/Driver/Compilation.h
index ea84a93..3f38715 100644
--- a/clang/include/clang/Driver/Compilation.h
+++ b/clang/include/clang/Driver/Compilation.h
@@ -98,12 +98,7 @@ public:
   const Driver &getDriver() const { return TheDriver; }
 
   const ToolChain &getDefaultToolChain() const { return DefaultToolChain; }
-  const ToolChain *getOffloadingHostToolChain() const {
-    auto It = OrderedOffloadingToolchains.find(Action::OFK_Host);
-    if (It != OrderedOffloadingToolchains.end())
-      return It->second;
-    return nullptr;
-  }
+
   unsigned isOffloadingHostKind(Action::OffloadKind Kind) const {
     return ActiveOffloadMask & Kind;
   }
@@ -121,8 +116,8 @@ public:
     return OrderedOffloadingToolchains.equal_range(Kind);
   }
 
-  // Return an offload toolchain of the provided kind. Only one is expected to
-  // exist.
+  /// Return an offload toolchain of the provided kind. Only one is expected to
+  /// exist.
   template <Action::OffloadKind Kind>
   const ToolChain *getSingleOffloadToolChain() const {
     auto TCs = getOffloadToolChains<Kind>();
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 46bf06d..9ecf434 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -394,12 +394,13 @@ public:
   /// BuildJobsForAction - Construct the jobs to perform for the action \p A and
   /// return an InputInfo for the result of running \p A.  Will only construct
   /// jobs for a given (Action, ToolChain, BoundArch) tuple once.
-  InputInfo BuildJobsForAction(Compilation &C, const Action *A,
-                               const ToolChain *TC, const char *BoundArch,
-                               bool AtTopLevel, bool MultipleArchs,
-                               const char *LinkingOutput,
-                               std::map<std::pair<const Action *, std::string>,
-                                        InputInfo> &CachedResults) const;
+  InputInfo
+  BuildJobsForAction(Compilation &C, const Action *A, const ToolChain *TC,
+                     const char *BoundArch, bool AtTopLevel, bool MultipleArchs,
+                     const char *LinkingOutput,
+                     std::map<std::pair<const Action *, std::string>, InputInfo>
+                         &CachedResults,
+                     bool BuildForOffloadDevice) const;
 
   /// Returns the default name for linked images (e.g., "a.out").
   const char *getDefaultImageName() const;
@@ -415,12 +416,11 @@ public:
   /// \param BoundArch - The bound architecture. 
   /// \param AtTopLevel - Whether this is a "top-level" action.
   /// \param MultipleArchs - Whether multiple -arch options were supplied.
-  const char *GetNamedOutputPath(Compilation &C,
-                                 const JobAction &JA,
-                                 const char *BaseInput,
-                                 const char *BoundArch,
-                                 bool AtTopLevel,
-                                 bool MultipleArchs) const;
+  /// \param NormalizedTriple - The normalized triple of the relevant target.
+  const char *GetNamedOutputPath(Compilation &C, const JobAction &JA,
+                                 const char *BaseInput, const char *BoundArch,
+                                 bool AtTopLevel, bool MultipleArchs,
+                                 StringRef NormalizedTriple) const;
 
   /// GetTemporaryPath - Return the pathname of a temporary file to use 
   /// as part of compilation; the file will have the given prefix and suffix.
@@ -467,7 +467,8 @@ private:
       const char *BoundArch, bool AtTopLevel, bool MultipleArchs,
       const char *LinkingOutput,
       std::map<std::pair<const Action *, std::string>, InputInfo>
-          &CachedResults) const;
+          &CachedResults,
+      bool BuildForOffloadDevice) const;
 
 public:
   /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 7982f51..a98b5c1 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Driver/Action.h"
+#include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
@@ -21,8 +22,8 @@ const char *Action::getClassName(ActionClass AC) {
   switch (AC) {
   case InputClass: return "input";
   case BindArchClass: return "bind-arch";
-  case CudaDeviceClass: return "cuda-device";
-  case CudaHostClass: return "cuda-host";
+  case OffloadClass:
+    return "offload";
   case PreprocessJobClass: return "preprocessor";
   case PrecompileJobClass: return "precompiler";
   case AnalyzeJobClass: return "analyzer";
@@ -40,6 +41,82 @@ const char *Action::getClassName(ActionClass AC) {
   llvm_unreachable("invalid class");
 }
 
+void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) {
+  // Offload action set its own kinds on their dependences.
+  if (Kind == OffloadClass)
+    return;
+
+  assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) &&
+         "Setting device kind to a different device??");
+  assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??");
+  OffloadingDeviceKind = OKind;
+  OffloadingArch = OArch;
+
+  for (auto *A : Inputs)
+    A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch);
+}
+
+void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) {
+  // Offload action set its own kinds on their dependences.
+  if (Kind == OffloadClass)
+    return;
+
+  assert(OffloadingDeviceKind == OFK_None &&
+         "Setting a host kind in a device action.");
+  ActiveOffloadKindMask |= OKinds;
+  OffloadingArch = OArch;
+
+  for (auto *A : Inputs)
+    A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch);
+}
+
+void Action::propagateOffloadInfo(const Action *A) {
+  if (unsigned HK = A->getOffloadingHostActiveKinds())
+    propagateHostOffloadInfo(HK, A->getOffloadingArch());
+  else
+    propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(),
+                               A->getOffloadingArch());
+}
+
+std::string Action::getOffloadingKindPrefix() const {
+  switch (OffloadingDeviceKind) {
+  case OFK_None:
+    break;
+  case OFK_Host:
+    llvm_unreachable("Host kind is not an offloading device kind.");
+    break;
+  case OFK_Cuda:
+    return "device-cuda";
+
+    // TODO: Add other programming models here.
+  }
+
+  if (!ActiveOffloadKindMask)
+    return "";
+
+  std::string Res("host");
+  if (ActiveOffloadKindMask & OFK_Cuda)
+    Res += "-cuda";
+
+  // TODO: Add other programming models here.
+
+  return Res;
+}
+
+std::string
+Action::getOffloadingFileNamePrefix(StringRef NormalizedTriple) const {
+  // A file prefix is only generated for device actions and consists of the
+  // offload kind and triple.
+  if (!OffloadingDeviceKind)
+    return "";
+
+  std::string Res("-");
+  Res += getOffloadingKindPrefix();
+  Res += "-";
+  Res += NormalizedTriple;
+  return Res;
+}
+
 void InputAction::anchor() {}
 
 InputAction::InputAction(const Arg &_Input, types::ID _Type)
@@ -51,16 +128,138 @@ void BindArchAction::anchor() {}
 BindArchAction::BindArchAction(Action *Input, const char *_ArchName)
     : Action(BindArchClass, Input), ArchName(_ArchName) {}
 
-void CudaDeviceAction::anchor() {}
+void OffloadAction::anchor() {}
+
+OffloadAction::OffloadAction(const HostDependence &HDep)
+    : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) {
+  OffloadingArch = HDep.getBoundArch();
+  ActiveOffloadKindMask = HDep.getOffloadKinds();
+  HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+                                             HDep.getBoundArch());
+};
+
+OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty)
+    : Action(OffloadClass, DDeps.getActions(), Ty),
+      DevToolChains(DDeps.getToolChains()) {
+  auto &OKinds = DDeps.getOffloadKinds();
+  auto &BArchs = DDeps.getBoundArchs();
+
+  // If all inputs agree on the same kind, use it also for this action.
+  if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); }))
+    OffloadingDeviceKind = OKinds.front();
+
+  // If we have a single dependency, inherit the architecture from it.
+  if (OKinds.size() == 1)
+    OffloadingArch = BArchs.front();
+
+  // Propagate info to the dependencies.
+  for (unsigned i = 0, e = getInputs().size(); i != e; ++i)
+    getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]);
+}
+
+OffloadAction::OffloadAction(const HostDependence &HDep,
+                             const DeviceDependences &DDeps)
+    : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()),
+      DevToolChains(DDeps.getToolChains()) {
+  // We use the kinds of the host dependence for this action.
+  OffloadingArch = HDep.getBoundArch();
+  ActiveOffloadKindMask = HDep.getOffloadKinds();
+  HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+                                             HDep.getBoundArch());
+
+  // Add device inputs and propagate info to the device actions. Do work only if
+  // we have dependencies.
+  for (unsigned i = 0, e = DDeps.getActions().size(); i != e; ++i)
+    if (auto *A = DDeps.getActions()[i]) {
+      getInputs().push_back(A);
+      A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i],
+                                    DDeps.getBoundArchs()[i]);
+    }
+}
+
+void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const {
+  if (!HostTC)
+    return;
+  assert(!getInputs().empty() && "No dependencies for offload action??");
+  auto *A = getInputs().front();
+  Work(A, HostTC, A->getOffloadingArch());
+}
 
-CudaDeviceAction::CudaDeviceAction(Action *Input, clang::CudaArch Arch,
-                                   bool AtTopLevel)
-    : Action(CudaDeviceClass, Input), GpuArch(Arch), AtTopLevel(AtTopLevel) {}
+void OffloadAction::doOnEachDeviceDependence(
+    const OffloadActionWorkTy &Work) const {
+  auto I = getInputs().begin();
+  auto E = getInputs().end();
+  if (I == E)
+    return;
+
+  // We expect to have the same number of input dependences and device tool
+  // chains, except if we also have a host dependence. In that case we have one
+  // more dependence than we have device tool chains.
+  assert(getInputs().size() == DevToolChains.size() + (HostTC ? 1 : 0) &&
+         "Sizes of action dependences and toolchains are not consistent!");
+
+  // Skip host action
+  if (HostTC)
+    ++I;
+
+  auto TI = DevToolChains.begin();
+  for (; I != E; ++I, ++TI)
+    Work(*I, *TI, (*I)->getOffloadingArch());
+}
+
+void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const {
+  doOnHostDependence(Work);
+  doOnEachDeviceDependence(Work);
+}
+
+void OffloadAction::doOnEachDependence(bool IsHostDependence,
+                                       const OffloadActionWorkTy &Work) const {
+  if (IsHostDependence)
+    doOnHostDependence(Work);
+  else
+    doOnEachDeviceDependence(Work);
+}
 
-void CudaHostAction::anchor() {}
+bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; }
 
-CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions)
-    : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {}
+Action *OffloadAction::getHostDependence() const {
+  assert(hasHostDependence() && "Host dependence does not exist!");
+  assert(!getInputs().empty() && "No dependencies for offload action??");
+  return HostTC ? getInputs().front() : nullptr;
+}
+
+bool OffloadAction::hasSingleDeviceDependence(
+    bool DoNotConsiderHostActions) const {
+  if (DoNotConsiderHostActions)
+    return getInputs().size() == (HostTC ? 2 : 1);
+  return !HostTC && getInputs().size() == 1;
+}
+
+Action *
+OffloadAction::getSingleDeviceDependence(bool DoNotConsiderHostActions) const {
+  assert(hasSingleDeviceDependence(DoNotConsiderHostActions) &&
+         "Single device dependence does not exist!");
+  // The previous assert ensures the number of entries in getInputs() is
+  // consistent with what we are doing here.
+  return HostTC ? getInputs()[1] : getInputs().front();
+}
+
+void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC,
+                                           const char *BoundArch,
+                                           OffloadKind OKind) {
+  DeviceActions.push_back(&A);
+  DeviceToolChains.push_back(&TC);
+  DeviceBoundArchs.push_back(BoundArch);
+  DeviceOffloadKinds.push_back(OKind);
+}
+
+OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC,
+                                              const char *BoundArch,
+                                              const DeviceDependences &DDeps)
+    : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch) {
+  for (auto K : DDeps.getOffloadKinds())
+    HostOffloadKinds |= K;
+}
 
 void JobAction::anchor() {}
 
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 78c3125..02f4a99 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -435,7 +435,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       })) {
     const ToolChain &TC = getToolChain(
         C.getInputArgs(),
-        llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit()
+        llvm::Triple(C.getSingleOffloadToolChain<Action::OFK_Host>()
+                             ->getTriple()
+                             .isArch64Bit()
                          ? "nvptx64-nvidia-cuda"
                          : "nvptx-nvidia-cuda"));
     C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda);
@@ -1022,19 +1024,33 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
   } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->input_begin(), Ids) << "}";
-  } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    CudaArch Arch = CDA->getGpuArch();
-    if (Arch != CudaArch::UNKNOWN)
-      os << "'" << CudaArchToString(Arch) << "', ";
-    os << "{" << PrintActions1(C, *CDA->input_begin(), Ids) << "}";
+  } else if (OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+    bool IsFirst = true;
+    OA->doOnEachDependence(
+        [&](Action *A, const ToolChain *TC, const char *BoundArch) {
+          // E.g. for two CUDA device dependences whose bound arch is sm_20 and
+          // sm_35 this will generate:
+          // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device"
+          // (nvptx64-nvidia-cuda:sm_35) {#ID}
+          if (!IsFirst)
+            os << ", ";
+          os << '"';
+          if (TC)
+            os << A->getOffloadingKindPrefix();
+          else
+            os << "host";
+          os << " (";
+          os << TC->getTriple().normalize();
+
+          if (BoundArch)
+            os << ":" << BoundArch;
+          os << ")";
+          os << '"';
+          os << " {" << PrintActions1(C, A, Ids) << "}";
+          IsFirst = false;
+        });
   } else {
-    const ActionList *AL;
-    if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
-      os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}"
-         << ", gpu binaries ";
-      AL = &CHA->getDeviceActions();
-    } else
-      AL = &A->getInputs();
+    const ActionList *AL = &A->getInputs();
 
     if (AL->size()) {
       const char *Prefix = "{";
@@ -1047,10 +1063,24 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
       os << "{}";
   }
 
+  // Append offload info for all options other than the offloading action
+  // itself (e.g. (cuda-device, sm_20) or (cuda-host)).
+  std::string offload_str;
+  llvm::raw_string_ostream offload_os(offload_str);
+  if (!isa<OffloadAction>(A)) {
+    auto S = A->getOffloadingKindPrefix();
+    if (!S.empty()) {
+      offload_os << ", (" << S;
+      if (A->getOffloadingArch())
+        offload_os << ", " << A->getOffloadingArch();
+      offload_os << ")";
+    }
+  }
+
   unsigned Id = Ids.size();
   Ids[A] = Id;
   llvm::errs() << Id << ": " << os.str() << ", "
-               << types::getTypeName(A->getType()) << "\n";
+               << types::getTypeName(A->getType()) << offload_os.str() << "\n";
 
   return Id;
 }
@@ -1378,8 +1408,12 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
       PartialCompilationArg &&
       PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
 
-  if (CompileHostOnly)
-    return C.MakeAction<CudaHostAction>(HostAction, ActionList());
+  if (CompileHostOnly) {
+    OffloadAction::HostDependence HDep(
+        *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+        /*BoundArch=*/nullptr, Action::OFK_Cuda);
+    return C.MakeAction<OffloadAction>(HDep);
+  }
 
   // Collect all cuda_gpu_arch parameters, removing duplicates.
   SmallVector<CudaArch, 4> GpuArchList;
@@ -1408,8 +1442,6 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
     CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
 
   // Build actions for all device inputs.
-  assert(C.getSingleOffloadToolChain<Action::OFK_Cuda>() &&
-         "Missing toolchain for device-side compilation.");
   ActionList CudaDeviceActions;
   C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
   assert(GpuArchList.size() == CudaDeviceActions.size() &&
@@ -1421,6 +1453,8 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
         return a->getKind() != Action::AssembleJobClass;
       });
 
+  const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+
   // Figure out what to do with device actions -- pass them as inputs to the
   // host action or run each of them independently.
   if (PartialCompilation || CompileDeviceOnly) {
@@ -1436,10 +1470,13 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
       return nullptr;
     }
 
-    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-      Actions.push_back(C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I],
-                                                       GpuArchList[I],
-                                                       /* AtTopLevel */ true));
+    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+      OffloadAction::DeviceDependences DDep;
+      DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]),
+               Action::OFK_Cuda);
+      Actions.push_back(
+          C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+    }
     // Kill host action in case of device-only compilation.
     if (CompileDeviceOnly)
       return nullptr;
@@ -1459,19 +1496,23 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
     Action* BackendAction = AssembleAction->getInputs()[0];
     assert(BackendAction->getType() == types::TY_PP_Asm);
 
-    for (const auto& A : {AssembleAction, BackendAction}) {
-      DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
-          A, GpuArchList[I], /* AtTopLevel */ false));
+    for (auto &A : {AssembleAction, BackendAction}) {
+      OffloadAction::DeviceDependences DDep;
+      DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda);
+      DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
     }
   }
-  auto FatbinAction = C.MakeAction<CudaDeviceAction>(
-      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
-      CudaArch::UNKNOWN,
-      /* AtTopLevel = */ false);
+  auto FatbinAction =
+      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
   // Return a new host action that incorporates original host action and all
   // device actions.
-  return C.MakeAction<CudaHostAction>(std::move(HostAction),
-                                      ActionList({FatbinAction}));
+  OffloadAction::HostDependence HDep(
+      *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+      /*BoundArch=*/nullptr, Action::OFK_Cuda);
+  OffloadAction::DeviceDependences DDep;
+  DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
+  return C.MakeAction<OffloadAction>(HDep, DDep);
 }
 
 void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
@@ -1580,6 +1621,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     YcArg = YuArg = nullptr;
   }
 
+  // Track the host offload kinds used on this compilation.
+  unsigned CompilationActiveOffloadHostKinds = 0u;
+
   // Construct the actions to perform.
   ActionList LinkerInputs;
 
@@ -1648,6 +1692,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
             ? phases::Compile
             : FinalPhase;
 
+    // Track the host offload kinds used on this input.
+    unsigned InputActiveOffloadHostKinds = 0u;
+
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -1679,21 +1726,36 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
         Current = buildCudaActions(C, Args, InputArg, Current, Actions);
         if (!Current)
           break;
+
+        // We produced a CUDA action for this input, so the host has to support
+        // CUDA.
+        InputActiveOffloadHostKinds |= Action::OFK_Cuda;
+        CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
       }
 
       if (Current->getType() == types::TY_Nothing)
         break;
     }
 
-    // If we ended with something, add to the output list.
-    if (Current)
+    // If we ended with something, add to the output list. Also, propagate the
+    // offload information to the top-level host action related with the current
+    // input.
+    if (Current) {
+      if (InputActiveOffloadHostKinds)
+        Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
+                                          /*BoundArch=*/nullptr);
       Actions.push_back(Current);
+    }
   }
 
-  // Add a link action if necessary.
-  if (!LinkerInputs.empty())
+  // Add a link action if necessary and propagate the offload information for
+  // the current compilation.
+  if (!LinkerInputs.empty()) {
     Actions.push_back(
         C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
+    Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
+                                             /*BoundArch=*/nullptr);
+  }
 
   // If we are linking, claim any options which are obviously only used for
   // compilation.
@@ -1829,7 +1891,8 @@ void Driver::BuildJobs(Compilation &C) const {
                        /*BoundArch*/ nullptr,
                        /*AtTopLevel*/ true,
                        /*MultipleArchs*/ ArchNames.size() > 1,
-                       /*LinkingOutput*/ LinkingOutput, CachedResults);
+                       /*LinkingOutput*/ LinkingOutput, CachedResults,
+                       /*BuildForOffloadDevice*/ false);
   }
 
   // If the user passed -Qunused-arguments or there were errors, don't warn
@@ -1878,7 +1941,28 @@ void Driver::BuildJobs(Compilation &C) const {
     }
   }
 }
-
+/// Collapse an offloading action looking for a job of the given type. The input
+/// action is changed to the input of the collapsed sequence. If we effectively
+/// had a collapse return the corresponding offloading action, otherwise return
+/// null.
+template <typename T>
+static OffloadAction *collapseOffloadingAction(Action *&CurAction) {
+  if (!CurAction)
+    return nullptr;
+  if (auto *OA = dyn_cast<OffloadAction>(CurAction)) {
+    if (OA->hasHostDependence())
+      if (auto *HDep = dyn_cast<T>(OA->getHostDependence())) {
+        CurAction = HDep;
+        return OA;
+      }
+    if (OA->hasSingleDeviceDependence())
+      if (auto *DDep = dyn_cast<T>(OA->getSingleDeviceDependence())) {
+        CurAction = DDep;
+        return OA;
+      }
+  }
+  return nullptr;
+}
 // Returns a Tool for a given JobAction.  In case the action and its
 // predecessors can be combined, updates Inputs with the inputs of the
 // first combined action. If one of the collapsed actions is a
@@ -1888,34 +1972,39 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
                                     bool EmbedBitcode, const ToolChain *TC,
                                     const JobAction *JA,
                                     const ActionList *&Inputs,
-                                    const CudaHostAction *&CollapsedCHA) {
+                                    ActionList &CollapsedOffloadAction) {
   const Tool *ToolForJob = nullptr;
-  CollapsedCHA = nullptr;
+  CollapsedOffloadAction.clear();
 
   // See if we should look for a compiler with an integrated assembler. We match
   // bottom up, so what we are actually looking for is an assembler job with a
   // compiler input.
 
+  // Look through offload actions between assembler and backend actions.
+  Action *BackendJA = (isa<AssembleJobAction>(JA) && Inputs->size() == 1)
+                          ? *Inputs->begin()
+                          : nullptr;
+  auto *BackendOA = collapseOffloadingAction<BackendJobAction>(BackendJA);
+
   if (TC->useIntegratedAs() && !SaveTemps &&
       !C.getArgs().hasArg(options::OPT_via_file_asm) &&
       !C.getArgs().hasArg(options::OPT__SLASH_FA) &&
-      !C.getArgs().hasArg(options::OPT__SLASH_Fa) &&
-      isa<AssembleJobAction>(JA) && Inputs->size() == 1 &&
-      isa<BackendJobAction>(*Inputs->begin())) {
+      !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA &&
+      isa<BackendJobAction>(BackendJA)) {
     // A BackendJob is always preceded by a CompileJob, and without -save-temps
     // or -fembed-bitcode, they will always get combined together, so instead of
     // checking the backend tool, check if the tool for the CompileJob has an
     // integrated assembler. For -fembed-bitcode, CompileJob is still used to
     // look up tools for BackendJob, but they need to match before we can split
     // them.
-    const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
-    // Compile job may be wrapped in CudaHostAction, extract it if
-    // that's the case and update CollapsedCHA if we combine phases.
-    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
-    JobAction *CompileJA = cast<CompileJobAction>(
-        CHA ? *CHA->input_begin() : *BackendInputs->begin());
-    assert(CompileJA && "Backend job is not preceeded by compile job.");
-    const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+    // Look through offload actions between backend and compile actions.
+    Action *CompileJA = *BackendJA->getInputs().begin();
+    auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+    assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+           "Backend job is not preceeded by compile job.");
+    const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
     if (!Compiler)
       return nullptr;
     // When using -fembed-bitcode, it is required to have the same tool (clang)
@@ -1929,7 +2018,12 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
     if (Compiler->hasIntegratedAssembler()) {
       Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
-      CollapsedCHA = CHA;
+      // Save the collapsed offload actions because they may still contain
+      // device actions.
+      if (CompileOA)
+        CollapsedOffloadAction.push_back(CompileOA);
+      if (BackendOA)
+        CollapsedOffloadAction.push_back(BackendOA);
     }
   }
 
@@ -1939,20 +2033,23 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    // Compile job may be wrapped in CudaHostAction, extract it if
-    // that's the case and update CollapsedCHA if we combine phases.
-    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
-    JobAction *CompileJA =
-        cast<CompileJobAction>(CHA ? *CHA->input_begin() : *Inputs->begin());
-    assert(CompileJA && "Backend job is not preceeded by compile job.");
-    const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+    // Look through offload actions between backend and compile actions.
+    Action *CompileJA = *JA->getInputs().begin();
+    auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+    assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+           "Backend job is not preceeded by compile job.");
+    const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
     if (!Compiler)
       return nullptr;
     if (!Compiler->canEmitIR() ||
         (!SaveTemps && !EmbedBitcode)) {
       Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
-      CollapsedCHA = CHA;
+
+      if (CompileOA)
+        CollapsedOffloadAction.push_back(CompileOA);
     }
   }
 
@@ -1963,12 +2060,21 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
   // See if we should use an integrated preprocessor. We do so when we have
   // exactly one input, since this is the only use case we care about
   // (irrelevant since we don't support combine yet).
-  if (Inputs->size() == 1 && isa<PreprocessJobAction>(*Inputs->begin()) &&
+
+  // Look through offload actions after preprocessing.
+  Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr;
+  auto *PreprocessOA =
+      collapseOffloadingAction<PreprocessJobAction>(PreprocessJA);
+
+  if (PreprocessJA && isa<PreprocessJobAction>(PreprocessJA) &&
       !C.getArgs().hasArg(options::OPT_no_integrated_cpp) &&
       !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps &&
       !C.getArgs().hasArg(options::OPT_rewrite_objc) &&
-      ToolForJob->hasIntegratedCPP())
-    Inputs = &(*Inputs)[0]->getInputs();
+      ToolForJob->hasIntegratedCPP()) {
+    Inputs = &PreprocessJA->getInputs();
+    if (PreprocessOA)
+      CollapsedOffloadAction.push_back(PreprocessOA);
+  }
 
   return ToolForJob;
 }
@@ -1976,8 +2082,8 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
 InputInfo Driver::BuildJobsForAction(
     Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
-    const {
+    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    bool BuildForOffloadDevice) const {
   // The bound arch is not necessarily represented in the toolchain's triple --
   // for example, armv7 and armv7s both map to the same triple -- so we need
   // both in our map.
@@ -1991,9 +2097,9 @@ InputInfo Driver::BuildJobsForAction(
   if (CachedResult != CachedResults.end()) {
     return CachedResult->second;
   }
-  InputInfo Result =
-      BuildJobsForActionNoCache(C, A, TC, BoundArch, AtTopLevel, MultipleArchs,
-                                LinkingOutput, CachedResults);
+  InputInfo Result = BuildJobsForActionNoCache(
+      C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput,
+      CachedResults, BuildForOffloadDevice);
   CachedResults[ActionTC] = Result;
   return Result;
 }
@@ -2001,21 +2107,65 @@ InputInfo Driver::BuildJobsForAction(
 InputInfo Driver::BuildJobsForActionNoCache(
     Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
-    const {
+    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    bool BuildForOffloadDevice) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
-  InputInfoList CudaDeviceInputInfos;
-  if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
-    // Append outputs of device jobs to the input list.
-    for (const Action *DA : CHA->getDeviceActions()) {
-      CudaDeviceInputInfos.push_back(BuildJobsForAction(
-          C, DA, TC, nullptr, AtTopLevel,
-          /*MultipleArchs*/ false, LinkingOutput, CachedResults));
+  InputInfoList OffloadDependencesInputInfo;
+  if (const OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+    // The offload action is expected to be used in four different situations.
+    //
+    // a) Set a toolchain/architecture/kind for a host action:
+    //    Host Action 1 -> OffloadAction -> Host Action 2
+    //
+    // b) Set a toolchain/architecture/kind for a device action;
+    //    Device Action 1 -> OffloadAction -> Device Action 2
+    //
+    // c) Specify a device dependences to a host action;
+    //    Device Action 1  _
+    //                      \
+    //      Host Action 1  ---> OffloadAction -> Host Action 2
+    //
+    // d) Specify a host dependence to a device action.
+    //      Host Action 1  _
+    //                      \
+    //    Device Action 1  ---> OffloadAction -> Device Action 2
+    //
+    // For a) and b), we just return the job generated for the dependence. For
+    // c) and d) we override the current action with the host/device dependence
+    // if the current toolchain is host/device and set the offload dependences
+    // info with the jobs obtained from the device/host dependence(s).
+
+    // If there is a single device option, just generate the job for it.
+    if (OA->hasSingleDeviceDependence()) {
+      InputInfo DevA;
+      OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC,
+                                       const char *DepBoundArch) {
+        DevA =
+            BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel,
+                               /*MultipleArchs*/ !!DepBoundArch, LinkingOutput,
+                               CachedResults, /*BuildForOffloadDevice=*/true);
+      });
+      return DevA;
     }
-    // Override current action with a real host compile action and continue
-    // processing it.
-    A = *CHA->input_begin();
+
+    // If 'Action 2' is host, we generate jobs for the device dependences and
+    // override the current action with the host dependence. Otherwise, we
+    // generate the host dependences and override the action with the device
+    // dependence. The dependences can't therefore be a top-level action.
+    OA->doOnEachDependence(
+        /*IsHostDependence=*/BuildForOffloadDevice,
+        [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+              C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
+              /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
+              /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+                  Action::OFK_None));
+        });
+
+    A = BuildForOffloadDevice
+            ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true)
+            : OA->getHostDependence();
   }
 
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
@@ -2042,41 +2192,34 @@ InputInfo Driver::BuildJobsForActionNoCache(
       TC = &C.getDefaultToolChain();
 
     return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel,
-                              MultipleArchs, LinkingOutput, CachedResults);
+                              MultipleArchs, LinkingOutput, CachedResults,
+                              BuildForOffloadDevice);
   }
 
-  if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    // Initial processing of CudaDeviceAction carries host params.
-    // Call BuildJobsForAction() again, now with correct device parameters.
-    InputInfo II = BuildJobsForAction(
-        C, *CDA->input_begin(), C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
-        CudaArchToString(CDA->getGpuArch()), CDA->isAtTopLevel(),
-        /*MultipleArchs=*/true, LinkingOutput, CachedResults);
-    // Currently II's Action is *CDA->input_begin().  Set it to CDA instead, so
-    // that one can retrieve II's GPU arch.
-    II.setAction(A);
-    return II;
-  }
 
   const ActionList *Inputs = &A->getInputs();
 
   const JobAction *JA = cast<JobAction>(A);
-  const CudaHostAction *CollapsedCHA = nullptr;
+  ActionList CollapsedOffloadActions;
+
   const Tool *T =
       selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA,
-                       Inputs, CollapsedCHA);
+                       Inputs, CollapsedOffloadActions);
   if (!T)
     return InputInfo();
 
-  // If we've collapsed action list that contained CudaHostAction we
-  // need to build jobs for device-side inputs it may have held.
-  if (CollapsedCHA) {
-    for (const Action *DA : CollapsedCHA->getDeviceActions()) {
-      CudaDeviceInputInfos.push_back(BuildJobsForAction(
-          C, DA, TC, "", AtTopLevel,
-          /*MultipleArchs*/ false, LinkingOutput, CachedResults));
-    }
-  }
+  // If we've collapsed action list that contained OffloadAction we
+  // need to build jobs for host/device-side inputs it may have held.
+  for (const auto *OA : CollapsedOffloadActions)
+    cast<OffloadAction>(OA)->doOnEachDependence(
+        /*IsHostDependence=*/BuildForOffloadDevice,
+        [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+              C, DepA, DepTC, DepBoundArch, AtTopLevel,
+              /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults,
+              /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+                  Action::OFK_None));
+        });
 
   // Only use pipes when there is exactly one input.
   InputInfoList InputInfos;
@@ -2086,9 +2229,9 @@ InputInfo Driver::BuildJobsForActionNoCache(
     // FIXME: Clean this up.
     bool SubJobAtTopLevel =
         AtTopLevel && (isa<DsymutilJobAction>(A) || isa<VerifyJobAction>(A));
-    InputInfos.push_back(BuildJobsForAction(C, Input, TC, BoundArch,
-                                            SubJobAtTopLevel, MultipleArchs,
-                                            LinkingOutput, CachedResults));
+    InputInfos.push_back(BuildJobsForAction(
+        C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput,
+        CachedResults, BuildForOffloadDevice));
   }
 
   // Always use the first input as the base input.
@@ -2099,9 +2242,10 @@ InputInfo Driver::BuildJobsForActionNoCache(
   if (JA->getType() == types::TY_dSYM)
     BaseInput = InputInfos[0].getFilename();
 
-  // Append outputs of cuda device jobs to the input list
-  if (CudaDeviceInputInfos.size())
-    InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+  // Append outputs of offload device jobs to the input list
+  if (!OffloadDependencesInputInfo.empty())
+    InputInfos.append(OffloadDependencesInputInfo.begin(),
+                      OffloadDependencesInputInfo.end());
 
   // Determine the place to write output to, if any.
   InputInfo Result;
@@ -2109,7 +2253,8 @@ InputInfo Driver::BuildJobsForActionNoCache(
     Result = InputInfo(A, BaseInput);
   else
     Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch,
-                                             AtTopLevel, MultipleArchs),
+                                             AtTopLevel, MultipleArchs,
+                                             TC->getTriple().normalize()),
                        BaseInput);
 
   if (CCCPrintBindings && !CCGenDiagnostics) {
@@ -2169,7 +2314,8 @@ static const char *MakeCLOutputFilename(const ArgList &Args, StringRef ArgValue,
 const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
                                        const char *BaseInput,
                                        const char *BoundArch, bool AtTopLevel,
-                                       bool MultipleArchs) const {
+                                       bool MultipleArchs,
+                                       StringRef NormalizedTriple) const {
   llvm::PrettyStackTraceString CrashInfo("Computing output path");
   // Output to a user requested destination?
   if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) {
@@ -2255,6 +2401,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
           MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image);
     } else if (MultipleArchs && BoundArch) {
       SmallString<128> Output(getDefaultImageName());
+      Output += JA.getOffloadingFileNamePrefix(NormalizedTriple);
       Output += "-";
       Output.append(BoundArch);
       NamedOutput = C.getArgs().MakeArgString(Output.c_str());
@@ -2271,6 +2418,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
     if (!types::appendSuffixForType(JA.getType()))
       End = BaseName.rfind('.');
     SmallString<128> Suffixed(BaseName.substr(0, End));
+    Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple);
     if (MultipleArchs && BoundArch) {
       Suffixed += "-";
       Suffixed.append(BoundArch);
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index cba8924..e96688c 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -248,8 +248,7 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
 
   case Action::InputClass:
   case Action::BindArchClass:
-  case Action::CudaDeviceClass:
-  case Action::CudaHostClass:
+  case Action::OffloadClass:
   case Action::LipoJobClass:
   case Action::DsymutilJobClass:
   case Action::VerifyDebugInfoJobClass:
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index 63284bc..df4a996 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -296,12 +296,45 @@ static bool forwardToGCC(const Option &O) {
          !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput);
 }
 
+/// Add the C++ include args of other offloading toolchains. If this is a host
+/// job, the device toolchains are added. If this is a device job, the host
+/// toolchains will be added.
+static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C,
+                                                const JobAction &JA,
+                                                const ArgList &Args,
+                                                ArgStringList &CmdArgs) {
+
+  if (JA.isHostOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+        ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+  else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Host>()
+        ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+
+  // TODO: Add support for other programming models here.
+}
+
+/// Add the include args that are specific of each offloading programming model.
+static void addExtraOffloadSpecificIncludeArgs(Compilation &C,
+                                               const JobAction &JA,
+                                               const ArgList &Args,
+                                               ArgStringList &CmdArgs) {
+
+  if (JA.isHostOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Host>()->AddCudaIncludeArgs(
+        Args, CmdArgs);
+  else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Cuda>()->AddCudaIncludeArgs(
+        Args, CmdArgs);
+
+  // TODO: Add support for other programming models here.
+}
+
 void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
                                     const Driver &D, const ArgList &Args,
                                     ArgStringList &CmdArgs,
                                     const InputInfo &Output,
-                                    const InputInfoList &Inputs,
-                                    const ToolChain *AuxToolChain) const {
+                                    const InputInfoList &Inputs) const {
   Arg *A;
   const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();
 
@@ -566,31 +599,27 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
   addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
 
-  // Optional AuxToolChain indicates that we need to include headers
-  // for more than one target. If that's the case, add include paths
-  // from AuxToolChain right after include paths of the same kind for
-  // the current target.
+  // While adding the include arguments, we also attempt to retrieve the
+  // arguments of related offloading toolchains or arguments that are specific
+  // of an offloading programming model.
 
   // Add C++ include arguments, if needed.
   if (types::isCXX(Inputs[0].getType())) {
     getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
   }
 
   // Add system include arguments for all targets but IAMCU.
   if (!IsIAMCU) {
     getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
   } else {
     // For IAMCU add special include arguments.
     getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs);
   }
 
-  // Add CUDA include arguments, if needed.
-  if (types::isCuda(Inputs[0].getType()))
-    getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+  // Add offload include arguments, if needed.
+  addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs);
 }
 
 // FIXME: Move to target hook.
@@ -3799,7 +3828,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // CUDA compilation may have multiple inputs (source file + results of
   // device-side compilations). All other jobs are expected to have exactly one
   // input.
-  bool IsCuda = types::isCuda(Input.getType());
+  bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
   assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
 
   // C++ is not supported for IAMCU.
@@ -3815,21 +3844,21 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
-  const ToolChain *AuxToolChain = nullptr;
   if (IsCuda) {
-    // FIXME: We need a (better) way to pass information about
-    // particular compilation pass we're constructing here. For now we
-    // can check which toolchain we're using and pick the other one to
-    // extract the triple.
-    if (&getToolChain() == C.getSingleOffloadToolChain<Action::OFK_Cuda>())
-      AuxToolChain = C.getOffloadingHostToolChain();
-    else if (&getToolChain() == C.getOffloadingHostToolChain())
-      AuxToolChain = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+    // We have to pass the triple of the host if compiling for a CUDA device and
+    // vice-versa.
+    StringRef NormalizedTriple;
+    if (JA.isDeviceOffloading(Action::OFK_Cuda))
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
+                             ->getTriple()
+                             .normalize();
     else
-      llvm_unreachable("Can't figure out CUDA compilation mode.");
-    assert(AuxToolChain != nullptr && "No aux toolchain.");
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+                             ->getTriple()
+                             .normalize();
+
     CmdArgs.push_back("-aux-triple");
-    CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
+    CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
   }
 
   if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
@@ -4718,8 +4747,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   //
   // FIXME: Support -fpreprocessed
   if (types::getPreprocessedType(InputType) != types::TY_INVALID)
-    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs,
-                            AuxToolChain);
+    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);
 
   // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
   // that "The compiler can only warn and ignore the option if not recognized".
@@ -11193,15 +11221,14 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
-  std::vector<std::string> gpu_archs =
-      Args.getAllArgValues(options::OPT_march_EQ);
-  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
-  const std::string& gpu_arch = gpu_archs[0];
+  // Obtain architecture from the action.
+  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  assert(gpu_arch != CudaArch::UNKNOWN &&
+         "Device action expected to have an architecture.");
 
   // Check that our installation's ptxas supports gpu_arch.
   if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
-    TC.cudaInstallation().CheckCudaVersionSupportsArch(
-        StringToCudaArch(gpu_arch));
+    TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch);
   }
 
   ArgStringList CmdArgs;
@@ -11245,7 +11272,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
   for (const auto& II : Inputs)
@@ -11277,13 +11304,20 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
 
   for (const auto& II : Inputs) {
-    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    auto *A = II.getAction();
+    assert(A->getInputs().size() == 1 &&
+           "Device offload action is expected to have a single input");
+    const char *gpu_arch_str = A->getOffloadingArch();
+    assert(gpu_arch_str &&
+           "Device action expected to have associated a GPU architecture!");
+    CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
+
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
     const char *Arch =
         (II.getType() == types::TY_PP_Asm)
-            ? CudaVirtualArchToString(VirtualArchForCudaArch(A->getGpuArch()))
-            : CudaArchToString(A->getGpuArch());
+            ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+            : gpu_arch_str;
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
                                          Arch + ",file=" + II.getFilename()));
   }
diff --git a/clang/lib/Driver/Tools.h b/clang/lib/Driver/Tools.h
index 2e546fc..02bdb8e 100644
--- a/clang/lib/Driver/Tools.h
+++ b/clang/lib/Driver/Tools.h
@@ -57,8 +57,7 @@ private:
                                const Driver &D, const llvm::opt::ArgList &Args,
                                llvm::opt::ArgStringList &CmdArgs,
                                const InputInfo &Output,
-                               const InputInfoList &Inputs,
-                               const ToolChain *AuxToolChain) const;
+                               const InputInfoList &Inputs) const;
 
   void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const;
diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 6c44932..1e9e57a 100644
--- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -60,25 +60,25 @@ clang::createInvocationFromCommandLine(ArrayRef<const char *> ArgList,
   }
 
   // We expect to get back exactly one command job, if we didn't something
-  // failed. CUDA compilation is an exception as it creates multiple jobs. If
-  // that's the case, we proceed with the first job. If caller needs particular
-  // CUDA job, it should be controlled via --cuda-{host|device}-only option
-  // passed to the driver.
+  // failed. Offload compilation is an exception as it creates multiple jobs. If
+  // that's the case, we proceed with the first job. If caller needs a
+  // particular job, it should be controlled via options (e.g.
+  // --cuda-{host|device}-only for CUDA) passed to the driver.
   const driver::JobList &Jobs = C->getJobs();
-  bool CudaCompilation = false;
+  bool OffloadCompilation = false;
   if (Jobs.size() > 1) {
     for (auto &A : C->getActions()){
       // On MacOSX real actions may end up being wrapped in BindArchAction
       if (isa<driver::BindArchAction>(A))
         A = *A->input_begin();
-      if (isa<driver::CudaDeviceAction>(A)) {
-        CudaCompilation = true;
+      if (isa<driver::OffloadAction>(A)) {
+        OffloadCompilation = true;
         break;
       }
     }
   }
   if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) ||
-      (Jobs.size() > 1 && !CudaCompilation)) {
+      (Jobs.size() > 1 && !OffloadCompilation)) {
     SmallString<256> Msg;
     llvm::raw_svector_ostream OS(Msg);
     Jobs.Print(OS, "; ", true);
diff --git a/clang/test/Driver/cuda_phases.cu b/clang/test/Driver/cuda_phases.cu
new file mode 100644
index 0000000..6cfb61a
--- /dev/null
+++ b/clang/test/Driver/cuda_phases.cu
@@ -0,0 +1,206 @@
+// Tests the phases generated for a CUDA offloading target for different
+// combinations of:
+// - Number of gpu architectures;
+// - Host/device-only compilation;
+// - User-requested final phase - binary or assembly.
+
+// REQUIRES: clang-driver
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+//
+// Test single gpu architecture with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN %s
+// BIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN: 2: compiler, {1}, ir, (host-cuda)
+// BIN: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda)
+// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir
+// BIN: 12: backend, {11}, assembler, (host-cuda)
+// BIN: 13: assembler, {12}, object, (host-cuda)
+// BIN: 14: linker, {13}, image, (host-cuda)
+
+//
+// Test single gpu architecture up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM %s
+// ASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM: 5: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda)
+// ASM: 7: compiler, {6}, ir, (host-cuda)
+// ASM: 8: backend, {7}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN2 %s
+// BIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN2: 2: compiler, {1}, ir, (host-cuda)
+// BIN2: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN2: 10: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35)
+// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35)
+// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35)
+// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35)
+// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object
+// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler
+// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
+// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
+// BIN2: 19: backend, {18}, assembler, (host-cuda)
+// BIN2: 20: assembler, {19}, object, (host-cuda)
+// BIN2: 21: linker, {20}, image, (host-cuda)
+
+//
+// Test two gpu architecturess up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM2 %s
+// ASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
+// ASM2: 10: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda)
+// ASM2: 12: compiler, {11}, ir, (host-cuda)
+// ASM2: 13: backend, {12}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN %s
+// HBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN: 2: compiler, {1}, ir, (host-cuda)
+// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN: 4: backend, {3}, assembler, (host-cuda)
+// HBIN: 5: assembler, {4}, object, (host-cuda)
+// HBIN: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test single gpu architecture up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM %s
+// HASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM: 2: compiler, {1}, ir, (host-cuda)
+// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN2 %s
+// HBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN2: 2: compiler, {1}, ir, (host-cuda)
+// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN2: 4: backend, {3}, assembler, (host-cuda)
+// HBIN2: 5: assembler, {4}, object, (host-cuda)
+// HBIN2: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test two gpu architectures up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM2 %s
+// HASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM2: 2: compiler, {1}, ir, (host-cuda)
+// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM2: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN %s
+// DBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+
+//
+// Test single gpu architecture up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM %s
+// DASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+
+//
+// Test two gpu architectures with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN2 %s
+// DBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+// DBIN2: 6: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35)
+// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35)
+// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35)
+// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35)
+// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object
+
+//
+// Test two gpu architectures up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM2 %s
+// DASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// DASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler