From a22236120f17074532bec70f5916a235beddd475 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 10 Jun 2021 14:24:17 -0500 Subject: [PATCH] [OpenMP] Implement '#pragma omp unroll'. Implementation of the unroll directive introduced in OpenMP 5.1. Follows the approach from D76342 for the tile directive (i.e. AST-based, not using the OpenMPIRBuilder). Tries to use `llvm.loop.unroll.*` metadata where possible, but has to fall back to an AST representation of the outer loop if the partially unrolled generated loop is associated with another directive (because it needs to compute the number of iterations). Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D99459 --- clang/include/clang-c/Index.h | 6 +- clang/include/clang/AST/OpenMPClause.h | 108 +++++ clang/include/clang/AST/RecursiveASTVisitor.h | 14 + clang/include/clang/AST/StmtOpenMP.h | 72 +++ .../clang/Basic/DiagnosticSemaKinds.td | 3 + clang/include/clang/Basic/StmtNodes.td | 1 + clang/include/clang/Sema/Sema.h | 35 +- .../include/clang/Serialization/ASTBitCodes.h | 1 + clang/lib/AST/OpenMPClause.cpp | 42 ++ clang/lib/AST/StmtOpenMP.cpp | 40 +- clang/lib/AST/StmtPrinter.cpp | 5 + clang/lib/AST/StmtProfile.cpp | 11 + clang/lib/Basic/OpenMPKinds.cpp | 6 +- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 5 + clang/lib/CodeGen/CGStmt.cpp | 3 + clang/lib/CodeGen/CGStmtOpenMP.cpp | 28 ++ clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Parse/ParseOpenMP.cpp | 7 +- clang/lib/Sema/SemaExceptionSpec.cpp | 1 + clang/lib/Sema/SemaOpenMP.cpp | 442 ++++++++++++++++-- clang/lib/Sema/TreeTransform.h | 47 ++ clang/lib/Serialization/ASTReader.cpp | 13 + clang/lib/Serialization/ASTReaderStmt.cpp | 12 + clang/lib/Serialization/ASTWriter.cpp | 7 + clang/lib/Serialization/ASTWriterStmt.cpp | 5 + clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 3 +- clang/test/OpenMP/unroll_ast_print.cpp | 127 +++++ clang/test/OpenMP/unroll_codegen_factor.cpp | 65 +++ .../unroll_codegen_for_collapse_outer.cpp | 251 ++++++++++ 
.../OpenMP/unroll_codegen_for_partial.cpp | 187 ++++++++ clang/test/OpenMP/unroll_codegen_full.cpp | 52 +++ .../test/OpenMP/unroll_codegen_heuristic.cpp | 64 +++ .../unroll_codegen_parallel_for_factor.cpp | 210 +++++++++ clang/test/OpenMP/unroll_codegen_partial.cpp | 64 +++ clang/test/OpenMP/unroll_codegen_tile_for.cpp | 245 ++++++++++ .../test/OpenMP/unroll_codegen_unroll_for.cpp | 239 ++++++++++ clang/test/OpenMP/unroll_messages.cpp | 130 ++++++ clang/tools/libclang/CIndex.cpp | 13 + clang/tools/libclang/CXCursor.cpp | 3 + llvm/include/llvm/Frontend/OpenMP/OMP.td | 8 + 40 files changed, 2530 insertions(+), 46 deletions(-) create mode 100644 clang/test/OpenMP/unroll_ast_print.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_factor.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_for_partial.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_full.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_heuristic.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_partial.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_tile_for.cpp create mode 100644 clang/test/OpenMP/unroll_codegen_unroll_for.cpp create mode 100644 clang/test/OpenMP/unroll_messages.cpp diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 87592156dd6b..c7d3b4e10622 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2588,7 +2588,11 @@ enum CXCursorKind { */ CXCursor_OMPMaskedDirective = 292, - CXCursor_LastStmt = CXCursor_OMPMaskedDirective, + /** OpenMP unroll directive. + */ + CXCursor_OMPUnrollDirective = 293, + + CXCursor_LastStmt = CXCursor_OMPUnrollDirective, /** * Cursor that represents the translation unit itself. 
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index e484a89bc84a..aaddcfa307da 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -888,6 +888,114 @@ public: } }; +/// Representation of the 'full' clause of the '#pragma omp unroll' directive. +/// +/// \code +/// #pragma omp unroll full +/// for (int i = 0; i < 64; ++i) +/// \endcode +class OMPFullClause final : public OMPClause { + friend class OMPClauseReader; + + /// Build an empty clause. + explicit OMPFullClause() : OMPClause(llvm::omp::OMPC_full, {}, {}) {} + +public: + /// Build an AST node for a 'full' clause. + /// + /// \param C Context of the AST. + /// \param StartLoc Starting location of the clause. + /// \param EndLoc Ending location of the clause. + static OMPFullClause *Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc); + + /// Build an empty 'full' AST node for deserialization. + /// + /// \param C Context of the AST. + static OMPFullClause *CreateEmpty(const ASTContext &C); + + child_range children() { return {child_iterator(), child_iterator()}; } + const_child_range children() const { + return {const_child_iterator(), const_child_iterator()}; + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_full; + } +}; + +/// Representation of the 'partial' clause of the '#pragma omp unroll' +/// directive. +/// +/// \code +/// #pragma omp unroll partial(4) +/// for (int i = start; i < end; ++i) +/// \endcode +class OMPPartialClause final : public OMPClause { + friend class OMPClauseReader; + + /// Location of '('. + SourceLocation LParenLoc; + + /// Optional argument to the clause (unroll factor). 
+ Stmt *Factor; + + /// Build an empty clause. + explicit OMPPartialClause() : OMPClause(llvm::omp::OMPC_partial, {}, {}) {} + + /// Set the unroll factor. + void setFactor(Expr *E) { Factor = E; } + + /// Sets the location of '('. + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + +public: + /// Build an AST node for a 'partial' clause. + /// + /// \param C Context of the AST. + /// \param StartLoc Location of the 'partial' identifier. + /// \param LParenLoc Location of '('. + /// \param EndLoc Location of ')'. + /// \param Factor Clause argument. + static OMPPartialClause *Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, Expr *Factor); + + /// Build an empty 'partial' AST node for deserialization. + /// + /// \param C Context of the AST. + static OMPPartialClause *CreateEmpty(const ASTContext &C); + + /// Returns the location of '('. + SourceLocation getLParenLoc() const { return LParenLoc; } + + /// Returns the argument of the clause or nullptr if not set. + Expr *getFactor() const { return cast_or_null(Factor); } + + child_range children() { return child_range(&Factor, &Factor + 1); } + const_child_range children() const { + return const_child_range(&Factor, &Factor + 1); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_partial; + } +}; + /// This represents 'collapse' clause in the '#pragma omp ...' /// directive. 
/// diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 73d757f0255a..a29559e5184a 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2841,6 +2841,9 @@ DEF_TRAVERSE_STMT(OMPSimdDirective, DEF_TRAVERSE_STMT(OMPTileDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) +DEF_TRAVERSE_STMT(OMPUnrollDirective, + { TRY_TO(TraverseOMPExecutableDirective(S)); }) + DEF_TRAVERSE_STMT(OMPForDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) @@ -3097,6 +3100,17 @@ bool RecursiveASTVisitor::VisitOMPSizesClause(OMPSizesClause *C) { return true; } +template +bool RecursiveASTVisitor::VisitOMPFullClause(OMPFullClause *C) { + return true; +} + +template +bool RecursiveASTVisitor::VisitOMPPartialClause(OMPPartialClause *C) { + TRY_TO(TraverseStmt(C->getFactor())); + return true; +} + template bool RecursiveASTVisitor::VisitOMPCollapseClause(OMPCollapseClause *C) { diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index 67ba77a5d1b0..9c85df741f48 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -5067,6 +5067,78 @@ public: } }; +/// This represents the '#pragma omp unroll' loop transformation directive. +/// +/// \code +/// #pragma omp unroll +/// for (int i = 0; i < 64; ++i) +/// \endcode +class OMPUnrollDirective final : public OMPLoopBasedDirective { + friend class ASTStmtReader; + friend class OMPExecutableDirective; + + /// Default list of offsets. + enum { + PreInitsOffset = 0, + TransformedStmtOffset, + }; + + explicit OMPUnrollDirective(SourceLocation StartLoc, SourceLocation EndLoc) + : OMPLoopBasedDirective(OMPUnrollDirectiveClass, llvm::omp::OMPD_unroll, + StartLoc, EndLoc, 1) {} + + /// Set the pre-init statements. + void setPreInits(Stmt *PreInits) { + Data->getChildren()[PreInitsOffset] = PreInits; + } + + /// Set the de-sugared statement. 
+ void setTransformedStmt(Stmt *S) { + Data->getChildren()[TransformedStmtOffset] = S; + } + +public: + /// Create a new AST node representation for '#pragma omp unroll'. + /// + /// \param C Context of the AST. + /// \param StartLoc Location of the introducer (e.g. the 'omp' token). + /// \param EndLoc Location of the directive's end (e.g. the tok::eod). + /// \param Clauses The directive's clauses. + /// \param AssociatedStmt The outermost associated loop. + /// \param TransformedStmt The loop nest after unrolling, or nullptr in + /// dependent contexts. + /// \param PreInits Helper preinits statements for the loop nest. + static OMPUnrollDirective * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *AssociatedStmt, + Stmt *TransformedStmt, Stmt *PreInits); + + /// Build an empty '#pragma omp unroll' AST node for deserialization. + /// + /// \param C Context of the AST. + /// \param NumClauses Number of clauses to allocate. + static OMPUnrollDirective *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + + /// Get the de-sugared associated loops after unrolling. + /// + /// This is only used if the unrolled loop becomes an associated loop of + /// another directive, otherwise the loop is emitted directly using loop + /// transformation metadata. When the unrolled loop cannot be used by another + /// directive (e.g. because of the full clause), the transformed stmt can also + /// be nullptr. + Stmt *getTransformedStmt() const { + return Data->getChildren()[TransformedStmtOffset]; + } + + /// Return the pre-init statements. + Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; } + + static bool classof(const Stmt *T) { + return T->getStmtClass() == OMPUnrollDirectiveClass; + } +}; + /// This represents '#pragma omp scan' directive. 
/// /// \code diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index ba55095a8266..af242cffbe7d 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10748,6 +10748,9 @@ def err_omp_interop_var_multiple_actions : Error< def err_omp_dispatch_statement_call : Error<"statement after '#pragma omp dispatch' must be a direct call" " to a target function or an assignment to one">; +def err_omp_unroll_full_variable_trip_count : Error< + "loop to be fully unrolled must have a constant trip count">; +def note_omp_directive_here : Note<"'%0' directive found here">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index 6134e60f3541..508f1fddf1b3 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -224,6 +224,7 @@ def OMPLoopDirective : StmtNode; def OMPParallelDirective : StmtNode; def OMPSimdDirective : StmtNode; def OMPTileDirective : StmtNode; +def OMPUnrollDirective : StmtNode; def OMPForDirective : StmtNode; def OMPForSimdDirective : StmtNode; def OMPSectionsDirective : StmtNode; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d1ad6a1cc4af..6ade9d769126 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -32,6 +32,7 @@ #include "clang/AST/NSAPI.h" #include "clang/AST/PrettyPrinter.h" #include "clang/AST/StmtCXX.h" +#include "clang/AST/StmtOpenMP.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeOrdering.h" #include "clang/Basic/BitmaskEnum.h" @@ -10220,7 +10221,8 @@ private: void DestroyDataSharingAttributesStack(); ExprResult VerifyPositiveIntegerConstantInClause(Expr *Op, OpenMPClauseKind CKind, - bool StrictlyPositive = true); + bool StrictlyPositive = true, + bool SuppressExprDiags = false); 
/// Returns OpenMP nesting level for current directive. unsigned getOpenMPNestingLevel() const; @@ -10238,6 +10240,25 @@ private: /// Pop OpenMP function region for non-capturing function. void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI); + /// Analyzes and checks a loop nest for use by a loop transformation. + /// + /// \param Kind The loop transformation directive kind. + /// \param NumLoops How many nested loops the directive is expecting. + /// \param AStmt Associated statement of the transformation directive. + /// \param LoopHelpers [out] The loop analysis result. + /// \param Body [out] The body code nested in \p NumLoops loop. + /// \param OriginalInits [out] Collection of statements and declarations that + /// must have been executed/declared before entering the + /// loop. + /// + /// \return Whether there was any error. + bool checkTransformableLoopNest( + OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, + SmallVectorImpl &LoopHelpers, + Stmt *&Body, + SmallVectorImpl, 0>> + &OriginalInits); + /// Helper to keep information about the current `omp begin/end declare + variant` nesting. struct OMPDeclareVariantScope { @@ -10543,6 +10564,11 @@ public: StmtResult ActOnOpenMPTileDirective(ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc); + /// Called on well-formed '#pragma omp unroll' after parsing of its clauses + /// and the associated statement. + StmtResult ActOnOpenMPUnrollDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); /// Called on well-formed '\#pragma omp for' after parsing /// of the associated statement. StmtResult @@ -10898,6 +10924,13 @@ public: SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'full' clauses. + OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'partial' clauses. 
+ OMPClause *ActOnOpenMPPartialClause(Expr *FactorExpr, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'collapse' clause. OMPClause *ActOnOpenMPCollapseClause(Expr *NumForLoops, SourceLocation StartLoc, diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index edf8987d61a6..ad2dcc2216ee 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1894,6 +1894,7 @@ enum StmtCode { STMT_OMP_PARALLEL_DIRECTIVE, STMT_OMP_SIMD_DIRECTIVE, STMT_OMP_TILE_DIRECTIVE, + STMT_OMP_UNROLL_DIRECTIVE, STMT_OMP_FOR_DIRECTIVE, STMT_OMP_FOR_SIMD_DIRECTIVE, STMT_OMP_SECTIONS_DIRECTIVE, diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 2f542a16a8e7..50f40395a197 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -942,6 +942,36 @@ OMPSizesClause *OMPSizesClause::CreateEmpty(const ASTContext &C, return new (Mem) OMPSizesClause(NumSizes); } +OMPFullClause *OMPFullClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation EndLoc) { + OMPFullClause *Clause = CreateEmpty(C); + Clause->setLocStart(StartLoc); + Clause->setLocEnd(EndLoc); + return Clause; +} + +OMPFullClause *OMPFullClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPFullClause(); +} + +OMPPartialClause *OMPPartialClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + Expr *Factor) { + OMPPartialClause *Clause = CreateEmpty(C); + Clause->setLocStart(StartLoc); + Clause->setLParenLoc(LParenLoc); + Clause->setLocEnd(EndLoc); + Clause->setFactor(Factor); + return Clause; +} + +OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPPartialClause(); +} + OMPAllocateClause * OMPAllocateClause::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation 
LParenLoc, Expr *Allocator, @@ -1602,6 +1632,18 @@ void OMPClausePrinter::VisitOMPSizesClause(OMPSizesClause *Node) { OS << ")"; } +void OMPClausePrinter::VisitOMPFullClause(OMPFullClause *Node) { OS << "full"; } + +void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) { + OS << "partial"; + + if (Expr *Factor = Node->getFactor()) { + OS << '('; + Factor->printPretty(OS, nullptr, Policy, 0); + OS << ')'; + } +} + void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) { OS << "allocator("; Node->getAllocator()->printPretty(OS, nullptr, Policy, 0); diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index dbb11e77ac51..b0ef2f49ba04 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -129,9 +129,24 @@ bool OMPLoopBasedDirective::doForAllLoops( OnTransformationCallback) { CurStmt = CurStmt->IgnoreContainers(); for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) { - while (auto *Dir = dyn_cast(CurStmt)) { - OnTransformationCallback(Dir); - CurStmt = Dir->getTransformedStmt(); + while (true) { + auto *OrigStmt = CurStmt; + if (auto *Dir = dyn_cast(OrigStmt)) { + OnTransformationCallback(Dir); + CurStmt = Dir->getTransformedStmt(); + } else if (auto *Dir = dyn_cast(OrigStmt)) { + OnTransformationCallback(Dir); + CurStmt = Dir->getTransformedStmt(); + } else { + break; + } + + if (!CurStmt) { + // May happen if the loop transformation does not result in a generated + // loop (such as full unrolling). 
+ CurStmt = OrigStmt; + break; + } } if (auto *CanonLoop = dyn_cast(CurStmt)) CurStmt = CanonLoop->getLoopStmt(); @@ -359,6 +374,25 @@ OMPTileDirective *OMPTileDirective::CreateEmpty(const ASTContext &C, SourceLocation(), SourceLocation(), NumLoops); } +OMPUnrollDirective * +OMPUnrollDirective::Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc, ArrayRef Clauses, + Stmt *AssociatedStmt, Stmt *TransformedStmt, + Stmt *PreInits) { + auto *Dir = createDirective( + C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc); + Dir->setTransformedStmt(TransformedStmt); + Dir->setPreInits(PreInits); + return Dir; +} + +OMPUnrollDirective *OMPUnrollDirective::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + return createEmptyDirective( + C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1, + SourceLocation(), SourceLocation()); +} + OMPForSimdDirective * OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, unsigned CollapsedNum, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 3e275744931a..ed7f5763acde 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -668,6 +668,11 @@ void StmtPrinter::VisitOMPTileDirective(OMPTileDirective *Node) { PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPUnrollDirective(OMPUnrollDirective *Node) { + Indent() << "#pragma omp unroll"; + PrintOMPExecutableDirective(Node); +} + void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) { Indent() << "#pragma omp for"; PrintOMPExecutableDirective(Node); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index dc91043b44a6..ed000c2467fa 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -468,6 +468,13 @@ void OMPClauseProfiler::VisitOMPSizesClause(const OMPSizesClause *C) { Profiler->VisitExpr(E); } +void OMPClauseProfiler::VisitOMPFullClause(const 
OMPFullClause *C) {} + +void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) { + if (const Expr *Factor = C->getFactor()) + Profiler->VisitExpr(Factor); +} + void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) { if (C->getAllocator()) Profiler->VisitStmt(C->getAllocator()); @@ -908,6 +915,10 @@ void StmtProfiler::VisitOMPTileDirective(const OMPTileDirective *S) { VisitOMPLoopBasedDirective(S); } +void StmtProfiler::VisitOMPUnrollDirective(const OMPUnrollDirective *S) { + VisitOMPLoopBasedDirective(S); +} + void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) { VisitOMPLoopDirective(S); } diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index b1c78742fb55..cfdba09eb1ec 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -452,7 +452,8 @@ bool clang::isOpenMPLoopDirective(OpenMPDirectiveKind DKind) { DKind == OMPD_target_teams_distribute || DKind == OMPD_target_teams_distribute_parallel_for || DKind == OMPD_target_teams_distribute_parallel_for_simd || - DKind == OMPD_target_teams_distribute_simd || DKind == OMPD_tile; + DKind == OMPD_target_teams_distribute_simd || DKind == OMPD_tile || + DKind == OMPD_unroll; } bool clang::isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind) { @@ -580,7 +581,7 @@ bool clang::isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind) { } bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) { - return DKind == OMPD_tile; + return DKind == OMPD_tile || DKind == OMPD_unroll; } void clang::getOpenMPCaptureRegions( @@ -668,6 +669,7 @@ void clang::getOpenMPCaptureRegions( CaptureRegions.push_back(OMPD_unknown); break; case OMPD_tile: + case OMPD_unroll: // loop transformations do not introduce captures. 
break; case OMPD_threadprivate: diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 8f65f38747d8..71b34a1578fe 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6678,6 +6678,7 @@ emitNumTeamsForTargetDirective(CodeGenFunction &CGF, case OMPD_task: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: @@ -6996,6 +6997,7 @@ emitNumThreadsForTargetDirective(CodeGenFunction &CGF, case OMPD_task: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: @@ -9603,6 +9605,7 @@ getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) { case OMPD_task: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: @@ -10432,6 +10435,7 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S, case OMPD_task: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: @@ -11144,6 +11148,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall( case OMPD_task: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 29eea3be87ce..85390f2b1464 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -208,6 +208,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OMPTileDirectiveClass: EmitOMPTileDirective(cast(*S)); break; + case Stmt::OMPUnrollDirectiveClass: + EmitOMPUnrollDirective(cast(*S)); + break; case Stmt::OMPForDirectiveClass: EmitOMPForDirective(cast(*S)); break; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 9f2e52d4e523..9dd1edbfd1e5 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp 
@@ -176,6 +176,8 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { PreInits = cast_or_null(LD->getPreInits()); } else if (const auto *Tile = dyn_cast(&S)) { PreInits = cast_or_null(Tile->getPreInits()); + } else if (const auto *Unroll = dyn_cast(&S)) { + PreInits = cast_or_null(Unroll->getPreInits()); } else { llvm_unreachable("Unknown loop-based directive kind."); } @@ -1821,6 +1823,8 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop, if (SimplifiedS == NextLoop) { if (auto *Dir = dyn_cast(SimplifiedS)) SimplifiedS = Dir->getTransformedStmt(); + if (auto *Dir = dyn_cast(SimplifiedS)) + SimplifiedS = Dir->getTransformedStmt(); if (const auto *CanonLoop = dyn_cast(SimplifiedS)) SimplifiedS = CanonLoop->getLoopStmt(); if (const auto *For = dyn_cast(SimplifiedS)) { @@ -2579,6 +2583,28 @@ void CodeGenFunction::EmitOMPTileDirective(const OMPTileDirective &S) { EmitStmt(S.getTransformedStmt()); } +void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) { + // This function is only called if the unrolled loop is not consumed by any + // other loop-associated construct. Such a loop-associated construct will have + // used the transformed AST. + + // Set the unroll metadata for the next emitted loop. 
+ LoopStack.setUnrollState(LoopAttributes::Enable); + + if (S.hasClausesOfKind()) { + LoopStack.setUnrollState(LoopAttributes::Full); + } else if (auto *PartialClause = S.getSingleClause()) { + if (Expr *FactorExpr = PartialClause->getFactor()) { + uint64_t Factor = + FactorExpr->EvaluateKnownConstInt(getContext()).getZExtValue(); + assert(Factor >= 1 && "Only positive factors are valid"); + LoopStack.setUnrollCount(Factor); + } + } + + EmitStmt(S.getAssociatedStmt()); +} + void CodeGenFunction::EmitOMPOuterLoop( bool DynamicOrOrdered, bool IsMonotonic, const OMPLoopDirective &S, CodeGenFunction::OMPPrivateScope &LoopScope, @@ -5762,6 +5788,8 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, case OMPC_safelen: case OMPC_simdlen: case OMPC_sizes: + case OMPC_full: + case OMPC_partial: case OMPC_allocator: case OMPC_allocate: case OMPC_collapse: diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 564063d6b7c8..56120c478589 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3441,6 +3441,7 @@ public: void EmitOMPParallelDirective(const OMPParallelDirective &S); void EmitOMPSimdDirective(const OMPSimdDirective &S); void EmitOMPTileDirective(const OMPTileDirective &S); + void EmitOMPUnrollDirective(const OMPUnrollDirective &S); void EmitOMPForDirective(const OMPForDirective &S); void EmitOMPForSimdDirective(const OMPForSimdDirective &S); void EmitOMPSectionsDirective(const OMPSectionsDirective &S); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index c8d62f7517a3..d3a456fca49c 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2166,6 +2166,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_parallel: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_task: case OMPD_taskyield: case OMPD_barrier: @@ -2403,6 +2404,7 @@ 
Parser::ParseOpenMPDeclarativeOrExecutableDirective(ParsedStmtContext StmtCtx) { case OMPD_parallel: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -2793,6 +2795,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_novariants: case OMPC_nocontext: case OMPC_filter: + case OMPC_partial: // OpenMP [2.5, Restrictions] // At most one num_threads clause can appear on the directive. // OpenMP [2.8.1, simd construct, Restrictions] @@ -2824,7 +2827,8 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, ErrorFound = true; } - if (CKind == OMPC_ordered && PP.LookAhead(/*N=*/0).isNot(tok::l_paren)) + if ((CKind == OMPC_ordered || CKind == OMPC_partial) && + PP.LookAhead(/*N=*/0).isNot(tok::l_paren)) Clause = ParseOpenMPClause(CKind, WrongDirective); else Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective); @@ -2887,6 +2891,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_unified_shared_memory: case OMPC_reverse_offload: case OMPC_dynamic_allocators: + case OMPC_full: // OpenMP [2.7.1, Restrictions, p. 9] // Only one ordered clause can appear on a loop directive. // OpenMP [2.7.1, Restrictions, C/C++, p. 
4] diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index fd6a4940c174..8816c9c1fea0 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1462,6 +1462,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OMPSectionsDirectiveClass: case Stmt::OMPSimdDirectiveClass: case Stmt::OMPTileDirectiveClass: + case Stmt::OMPUnrollDirectiveClass: case Stmt::OMPSingleDirectiveClass: case Stmt::OMPTargetDataDirectiveClass: case Stmt::OMPTargetDirectiveClass: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 27a2ff204adf..2ec487130d30 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -3810,6 +3810,11 @@ public: VisitStmt(S); } + void VisitOMPUnrollDirective(OMPUnrollDirective *S) { + // #pragma omp unroll does not introduce data sharing. + VisitStmt(S); + } + void VisitStmt(Stmt *S) { for (Stmt *C : S->children()) { if (C) { @@ -3976,6 +3981,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { case OMPD_master: case OMPD_masked: case OMPD_tile: + case OMPD_unroll: break; case OMPD_simd: case OMPD_for: @@ -5847,6 +5853,10 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPTileDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); break; + case OMPD_unroll: + Res = ActOnOpenMPUnrollDirective(ClausesWithImplicit, AStmt, StartLoc, + EndLoc); + break; case OMPD_for: Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); @@ -8967,8 +8977,14 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, return false; }, [&SemaRef, &Captures](OMPLoopBasedDirective *Transform) { - Stmt *DependentPreInits = - cast(Transform)->getPreInits(); + Stmt *DependentPreInits; + if (auto *Dir = dyn_cast(Transform)) { + DependentPreInits = Dir->getPreInits(); + } else if (auto *Dir = dyn_cast(Transform)) { + DependentPreInits = 
Dir->getPreInits(); + } else { + llvm_unreachable("Unexpected loop transformation"); + } if (!DependentPreInits) return; for (Decl *C : cast(DependentPreInits)->getDeclGroup()) { @@ -12543,6 +12559,55 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective( Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } +bool Sema::checkTransformableLoopNest( + OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, + SmallVectorImpl &LoopHelpers, + Stmt *&Body, + SmallVectorImpl, 0>> + &OriginalInits) { + OriginalInits.emplace_back(); + bool Result = OMPLoopBasedDirective::doForAllLoops( + AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops, + [this, &LoopHelpers, &Body, &OriginalInits, Kind](unsigned Cnt, + Stmt *CurStmt) { + VarsWithInheritedDSAType TmpDSA; + unsigned SingleNumLoops = + checkOpenMPLoop(Kind, nullptr, nullptr, CurStmt, *this, *DSAStack, + TmpDSA, LoopHelpers[Cnt]); + if (SingleNumLoops == 0) + return true; + assert(SingleNumLoops == 1 && "Expect single loop iteration space"); + if (auto *For = dyn_cast(CurStmt)) { + OriginalInits.back().push_back(For->getInit()); + Body = For->getBody(); + } else { + assert(isa(CurStmt) && + "Expected canonical for or range-based for loops."); + auto *CXXFor = cast(CurStmt); + OriginalInits.back().push_back(CXXFor->getBeginStmt()); + Body = CXXFor->getBody(); + } + OriginalInits.emplace_back(); + return false; + }, + [&OriginalInits](OMPLoopBasedDirective *Transform) { + Stmt *DependentPreInits; + if (auto *Dir = dyn_cast(Transform)) + DependentPreInits = Dir->getPreInits(); + else if (auto *Dir = dyn_cast(Transform)) + DependentPreInits = Dir->getPreInits(); + else + llvm_unreachable("Unhandled loop transformation"); + if (!DependentPreInits) + return; + for (Decl *C : cast(DependentPreInits)->getDeclGroup()) + OriginalInits.back().push_back(C); + }); + assert(OriginalInits.back().empty() && "No preinit after innermost loop"); + OriginalInits.pop_back(); + return Result; +} + 
StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) { @@ -12563,38 +12628,9 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; SmallVector, 0>, 4> - OriginalInits(1); - if (!OMPLoopBasedDirective::doForAllLoops( - AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, - NumLoops, - [this, &LoopHelpers, &Body, &OriginalInits](unsigned Cnt, - Stmt *CurStmt) { - VarsWithInheritedDSAType TmpDSA; - unsigned SingleNumLoops = - checkOpenMPLoop(OMPD_tile, nullptr, nullptr, CurStmt, *this, - *DSAStack, TmpDSA, LoopHelpers[Cnt]); - if (SingleNumLoops == 0) - return true; - assert(SingleNumLoops == 1 && "Expect single loop iteration space"); - if (auto *For = dyn_cast(CurStmt)) { - OriginalInits.back().push_back(For->getInit()); - Body = For->getBody(); - } else { - assert(isa(CurStmt) && - "Expected canonical for or range-based for loops."); - auto *CXXFor = cast(CurStmt); - OriginalInits.back().push_back(CXXFor->getBeginStmt()); - Body = CXXFor->getBody(); - } - OriginalInits.emplace_back(); - return false; - }, - [&OriginalInits](OMPLoopBasedDirective *Transform) { - Stmt *DependentPreInits = - cast(Transform)->getPreInits(); - for (Decl *C : cast(DependentPreInits)->getDeclGroup()) - OriginalInits.back().push_back(C); - })) + OriginalInits; + if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body, + OriginalInits)) return StmtError(); // Delay tiling to when template is completely instantiated. @@ -12783,6 +12819,282 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, buildPreInits(Context, PreInits)); } +StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + // Empty statement should only be possible if there already was an error. 
+ if (!AStmt) + return StmtError(); + + if (checkMutuallyExclusiveClauses(*this, Clauses, {OMPC_partial, OMPC_full})) + return StmtError(); + + const OMPFullClause *FullClause = + OMPExecutableDirective::getSingleClause(Clauses); + const OMPPartialClause *PartialClause = + OMPExecutableDirective::getSingleClause(Clauses); + assert(!(FullClause && PartialClause) && + "mutual exclusivity must have been checked before"); + + constexpr unsigned NumLoops = 1; + Stmt *Body = nullptr; + SmallVector LoopHelpers( + NumLoops); + SmallVector, 0>, NumLoops + 1> + OriginalInits; + if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers, + Body, OriginalInits)) + return StmtError(); + + // Delay unrolling to when template is completely instantiated. + if (CurContext->isDependentContext()) + return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, + nullptr, nullptr); + + OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers.front(); + + if (FullClause) { + if (!VerifyPositiveIntegerConstantInClause( + LoopHelper.NumIterations, OMPC_full, /*StrictlyPositive=*/false, + /*SuppressExprDigs=*/true) + .isUsable()) { + Diag(AStmt->getBeginLoc(), diag::err_omp_unroll_full_variable_trip_count); + Diag(FullClause->getBeginLoc(), diag::note_omp_directive_here) + << "#pragma omp unroll full"; + return StmtError(); + } + } + + // The generated loop may only be passed to other loop-associated directive + // when a partial clause is specified. Without the requirement it is + // sufficient to generate loop unroll metadata at code-generation. + if (!PartialClause) + return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, + nullptr, nullptr); + + // Otherwise, we need to provide a de-sugared/transformed AST that can be + // associated with another loop directive. 
+ // + // The canonical loop analysis return by checkTransformableLoopNest assumes + // the following structure to be the same loop without transformations or + // directives applied: \code OriginalInits; LoopHelper.PreInits; + // LoopHelper.Counters; + // for (; IV < LoopHelper.NumIterations; ++IV) { + // LoopHelper.Updates; + // Body; + // } + // \endcode + // where IV is a variable declared and initialized to 0 in LoopHelper.PreInits + // and referenced by LoopHelper.IterationVarRef. + // + // The unrolling directive transforms this into the following loop: + // \code + // OriginalInits; \ + // LoopHelper.PreInits; > NewPreInits + // LoopHelper.Counters; / + // for (auto UIV = 0; UIV < LoopHelper.NumIterations; UIV+=Factor) { + // #pragma clang loop unroll_count(Factor) + // for (IV = UIV; IV < UIV + Factor && UIV < LoopHelper.NumIterations; ++IV) + // { + // LoopHelper.Updates; + // Body; + // } + // } + // \endcode + // where UIV is a new logical iteration counter. IV must be the same VarDecl + // as the original LoopHelper.IterationVarRef because LoopHelper.Updates + // references it. If the partially unrolled loop is associated with another + // loop directive (like an OMPForDirective), it will use checkOpenMPLoop to + // analyze this loop, i.e. the outer loop must fulfill the constraints of an + // OpenMP canonical loop. The inner loop is not an associable canonical loop + // and only exists to defer its unrolling to LLVM's LoopUnroll instead of + // doing it in the frontend (by adding loop metadata). NewPreInits becomes a + // property of the OMPLoopBasedDirective instead of statements in + // CompoundStatement. This is to allow the loop to become a non-outermost loop + // of a canonical loop nest where these PreInits are emitted before the + // outermost directive. + + // Determine the PreInit declarations. 
+ SmallVector PreInits; + assert(OriginalInits.size() == 1 && + "Expecting a single-dimensional loop iteration space"); + for (auto &P : OriginalInits[0]) { + if (auto *D = P.dyn_cast()) + PreInits.push_back(D); + else if (auto *PI = dyn_cast_or_null(P.dyn_cast())) + PreInits.append(PI->decl_begin(), PI->decl_end()); + } + if (auto *PI = cast_or_null(LoopHelper.PreInits)) + PreInits.append(PI->decl_begin(), PI->decl_end()); + // Gather declarations for the data members used as counters. + for (Expr *CounterRef : LoopHelper.Counters) { + auto *CounterDecl = cast(CounterRef)->getDecl(); + if (isa(CounterDecl)) + PreInits.push_back(CounterDecl); + } + + auto *IterationVarRef = cast(LoopHelper.IterationVarRef); + QualType IVTy = IterationVarRef->getType(); + assert(LoopHelper.Counters.size() == 1 && + "Expecting a single-dimensional loop iteration space"); + auto *OrigVar = cast(LoopHelper.Counters.front()); + + // Determine the unroll factor. + uint64_t Factor; + SourceLocation FactorLoc; + if (Expr *FactorVal = PartialClause->getFactor()) { + Factor = + FactorVal->getIntegerConstantExpr(Context).getValue().getZExtValue(); + FactorLoc = FactorVal->getExprLoc(); + } else { + // TODO: Use a better profitability model. + Factor = 2; + } + assert(Factor > 0 && "Expected positive unroll factor"); + auto MakeFactorExpr = [this, Factor, IVTy, FactorLoc]() { + return IntegerLiteral::Create( + Context, llvm::APInt(Context.getIntWidth(IVTy), Factor), IVTy, + FactorLoc); + }; + + // Iteration variable SourceLocations. + SourceLocation OrigVarLoc = OrigVar->getExprLoc(); + SourceLocation OrigVarLocBegin = OrigVar->getBeginLoc(); + SourceLocation OrigVarLocEnd = OrigVar->getEndLoc(); + + // Internal variable names. 
+ std::string OrigVarName = OrigVar->getNameInfo().getAsString(); + std::string OuterIVName = (Twine(".unrolled.iv.") + OrigVarName).str(); + std::string InnerIVName = (Twine(".unroll_inner.iv.") + OrigVarName).str(); + std::string InnerTripCountName = + (Twine(".unroll_inner.tripcount.") + OrigVarName).str(); + + // Create the iteration variable for the unrolled loop. + VarDecl *OuterIVDecl = + buildVarDecl(*this, {}, IVTy, OuterIVName, nullptr, OrigVar); + auto MakeOuterRef = [this, OuterIVDecl, IVTy, OrigVarLoc]() { + return buildDeclRefExpr(*this, OuterIVDecl, IVTy, OrigVarLoc); + }; + + // Iteration variable for the inner loop: Reuse the iteration variable created + // by checkOpenMPLoop. + auto *InnerIVDecl = cast(IterationVarRef->getDecl()); + InnerIVDecl->setDeclName(&PP.getIdentifierTable().get(InnerIVName)); + auto MakeInnerRef = [this, InnerIVDecl, IVTy, OrigVarLoc]() { + return buildDeclRefExpr(*this, InnerIVDecl, IVTy, OrigVarLoc); + }; + + // Make a copy of the NumIterations expression for each use: By the AST + // constraints, every expression object in a DeclContext must be unique. + CaptureVars CopyTransformer(*this); + auto MakeNumIterations = [&CopyTransformer, &LoopHelper]() -> Expr * { + return AssertSuccess( + CopyTransformer.TransformExpr(LoopHelper.NumIterations)); + }; + + // Inner For init-statement: auto .unroll_inner.iv = .unrolled.iv + ExprResult LValueConv = DefaultLvalueConversion(MakeOuterRef()); + AddInitializerToDecl(InnerIVDecl, LValueConv.get(), /*DirectInit=*/false); + StmtResult InnerInit = new (Context) + DeclStmt(DeclGroupRef(InnerIVDecl), OrigVarLocBegin, OrigVarLocEnd); + if (!InnerInit.isUsable()) + return StmtError(); + + // Inner For cond-expression: + // \code + // .unroll_inner.iv < .unrolled.iv + Factor && + // .unroll_inner.iv < NumIterations + // \endcode + // This conjunction of two conditions allows ScalarEvolution to derive the + // maximum trip count of the inner loop. 
+ ExprResult EndOfTile = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), + BO_Add, MakeOuterRef(), MakeFactorExpr()); + if (!EndOfTile.isUsable()) + return StmtError(); + ExprResult InnerCond1 = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), + BO_LE, MakeInnerRef(), EndOfTile.get()); + if (!InnerCond1.isUsable()) + return StmtError(); + ExprResult InnerCond2 = + BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LE, MakeInnerRef(), + MakeNumIterations()); + if (!InnerCond2.isUsable()) + return StmtError(); + ExprResult InnerCond = + BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LAnd, + InnerCond1.get(), InnerCond2.get()); + if (!InnerCond.isUsable()) + return StmtError(); + + // Inner For incr-statement: ++.unroll_inner.iv + ExprResult InnerIncr = BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(), + UO_PreInc, MakeInnerRef()); + if (!InnerIncr.isUsable()) + return StmtError(); + + // Inner For statement. + SmallVector InnerBodyStmts; + InnerBodyStmts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end()); + InnerBodyStmts.push_back(Body); + CompoundStmt *InnerBody = CompoundStmt::Create( + Context, InnerBodyStmts, Body->getBeginLoc(), Body->getEndLoc()); + ForStmt *InnerFor = new (Context) + ForStmt(Context, InnerInit.get(), InnerCond.get(), nullptr, + InnerIncr.get(), InnerBody, LoopHelper.Init->getBeginLoc(), + LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc()); + + // Unroll metadata for the inner loop. + // This needs to take into account the remainder portion of the unrolled loop, + // hence `unroll(full)` does not apply here, even though the LoopUnroll pass + // supports multiple loop exits. Instead, unroll using a factor equivalent to + // the maximum trip count, which will also generate a remainder loop. 
Just + // `unroll(enable)` (which could have been useful if the user has not + // specified a concrete factor; even though the outer loop cannot be + // influenced anymore, would avoid more code bloat than necessary) will refuse + // the loop because "Won't unroll; remainder loop could not be generated when + // assuming runtime trip count". Even if it did work, it must not choose a + // larger unroll factor than the maximum loop length, or it would always just + // execute the remainder loop. + LoopHintAttr *UnrollHintAttr = + LoopHintAttr::CreateImplicit(Context, LoopHintAttr::UnrollCount, + LoopHintAttr::Numeric, MakeFactorExpr()); + AttributedStmt *InnerUnrolled = + AttributedStmt::Create(Context, StartLoc, {UnrollHintAttr}, InnerFor); + + // Outer For init-statement: auto .unrolled.iv = 0 + AddInitializerToDecl( + OuterIVDecl, ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(), + /*DirectInit=*/false); + StmtResult OuterInit = new (Context) + DeclStmt(DeclGroupRef(OuterIVDecl), OrigVarLocBegin, OrigVarLocEnd); + if (!OuterInit.isUsable()) + return StmtError(); + + // Outer For cond-expression: .unrolled.iv < NumIterations + ExprResult OuterConde = + BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, MakeOuterRef(), + MakeNumIterations()); + if (!OuterConde.isUsable()) + return StmtError(); + + // Outer For incr-statement: .unrolled.iv += Factor + ExprResult OuterIncr = + BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign, + MakeOuterRef(), MakeFactorExpr()); + if (!OuterIncr.isUsable()) + return StmtError(); + + // Outer For statement. 
+ ForStmt *OuterFor = new (Context) + ForStmt(Context, OuterInit.get(), OuterConde.get(), nullptr, + OuterIncr.get(), InnerUnrolled, LoopHelper.Init->getBeginLoc(), + LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc()); + + return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, + OuterFor, buildPreInits(Context, PreInits)); +} + OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, SourceLocation StartLoc, SourceLocation LParenLoc, @@ -12843,6 +13155,9 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_filter: Res = ActOnOpenMPFilterClause(Expr, StartLoc, LParenLoc, EndLoc); break; + case OMPC_partial: + Res = ActOnOpenMPPartialClause(Expr, StartLoc, LParenLoc, EndLoc); + break; case OMPC_device: case OMPC_if: case OMPC_default: @@ -13045,6 +13360,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_teams: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_sections: case OMPD_section: @@ -13123,6 +13439,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_teams: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13204,6 +13521,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13283,6 +13601,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13363,6 +13682,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: @@ -13442,6 +13762,7 @@ static OpenMPDirectiveKind 
getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13522,6 +13843,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13603,6 +13925,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13886,17 +14209,33 @@ OMPClause *Sema::ActOnOpenMPNumThreadsClause(Expr *NumThreads, ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E, OpenMPClauseKind CKind, - bool StrictlyPositive) { + bool StrictlyPositive, + bool SuppressExprDiags) { if (!E) return ExprError(); if (E->isValueDependent() || E->isTypeDependent() || E->isInstantiationDependent() || E->containsUnexpandedParameterPack()) return E; + llvm::APSInt Result; - ExprResult ICE = - VerifyIntegerConstantExpression(E, &Result, /*FIXME*/ AllowFold); + ExprResult ICE; + if (SuppressExprDiags) { + // Use a custom diagnoser that suppresses 'note' diagnostics about the + // expression. 
+ struct SuppressedDiagnoser : public Sema::VerifyICEDiagnoser { + SuppressedDiagnoser() : VerifyICEDiagnoser(/*Suppress=*/true) {} + Sema::SemaDiagnosticBuilder diagnoseNotICE(Sema &S, + SourceLocation Loc) override { + llvm_unreachable("Diagnostic suppressed"); + } + } Diagnoser; + ICE = VerifyIntegerConstantExpression(E, &Result, Diagnoser, AllowFold); + } else { + ICE = VerifyIntegerConstantExpression(E, &Result, /*FIXME*/ AllowFold); + } if (ICE.isInvalid()) return ExprError(); + if ((StrictlyPositive && !Result.isStrictlyPositive()) || (!StrictlyPositive && !Result.isNonNegative())) { Diag(E->getExprLoc(), diag::err_omp_negative_expression_in_clause) @@ -14301,6 +14640,29 @@ OMPClause *Sema::ActOnOpenMPSizesClause(ArrayRef SizeExprs, SizeExprs); } +OMPClause *Sema::ActOnOpenMPFullClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return OMPFullClause::Create(Context, StartLoc, EndLoc); +} + +OMPClause *Sema::ActOnOpenMPPartialClause(Expr *FactorExpr, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + if (FactorExpr) { + // If an argument is specified, it must be a constant (or an unevaluated + // template expression). 
+ ExprResult FactorResult = VerifyPositiveIntegerConstantInClause( + FactorExpr, OMPC_partial, /*StrictlyPositive=*/true); + if (FactorResult.isInvalid()) + return nullptr; + FactorExpr = FactorResult.get(); + } + + return OMPPartialClause::Create(Context, StartLoc, LParenLoc, EndLoc, + FactorExpr); +} + OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( OpenMPClauseKind Kind, ArrayRef Argument, Expr *Expr, SourceLocation StartLoc, SourceLocation LParenLoc, @@ -14603,6 +14965,12 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, /*LParenLoc=*/SourceLocation(), /*VarLoc=*/SourceLocation(), EndLoc); break; + case OMPC_full: + Res = ActOnOpenMPFullClause(StartLoc, EndLoc); + break; + case OMPC_partial: + Res = ActOnOpenMPPartialClause(nullptr, StartLoc, /*LParenLoc=*/{}, EndLoc); + break; case OMPC_if: case OMPC_final: case OMPC_num_threads: diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 506e0fc0ba05..21dc8c5d893b 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1633,6 +1633,20 @@ public: return getSema().ActOnOpenMPSizesClause(Sizes, StartLoc, LParenLoc, EndLoc); } + /// Build a new OpenMP 'full' clause. + OMPClause *RebuildOMPFullClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPFullClause(StartLoc, EndLoc); + } + + /// Build a new OpenMP 'partial' clause. + OMPClause *RebuildOMPPartialClause(Expr *Factor, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPPartialClause(Factor, StartLoc, LParenLoc, + EndLoc); + } + /// Build a new OpenMP 'allocator' clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. 
@@ -8521,6 +8535,17 @@ TreeTransform::TransformOMPTileDirective(OMPTileDirective *D) { return Res; } +template +StmtResult +TreeTransform::TransformOMPUnrollDirective(OMPUnrollDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().StartOpenMPDSABlock(D->getDirectiveKind(), DirName, + nullptr, D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { @@ -9220,6 +9245,28 @@ OMPClause *TreeTransform::TransformOMPSizesClause(OMPSizesClause *C) { C->getLParenLoc(), C->getEndLoc()); } +template +OMPClause *TreeTransform::TransformOMPFullClause(OMPFullClause *C) { + if (!getDerived().AlwaysRebuild()) + return C; + return RebuildOMPFullClause(C->getBeginLoc(), C->getEndLoc()); +} + +template +OMPClause * +TreeTransform::TransformOMPPartialClause(OMPPartialClause *C) { + ExprResult T = getDerived().TransformExpr(C->getFactor()); + if (T.isInvalid()) + return nullptr; + Expr *Factor = T.get(); + bool Changed = Factor != C->getFactor(); + + if (!Changed && !getDerived().AlwaysRebuild()) + return C; + return RebuildOMPPartialClause(Factor, C->getBeginLoc(), C->getLParenLoc(), + C->getEndLoc()); +} + template OMPClause * TreeTransform::TransformOMPCollapseClause(OMPCollapseClause *C) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index c8290ac5c31b..55e0d084ea4b 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11720,6 +11720,12 @@ OMPClause *OMPClauseReader::readClause() { C = OMPSizesClause::CreateEmpty(Context, NumSizes); break; } + case llvm::omp::OMPC_full: + C = OMPFullClause::CreateEmpty(Context); + break; + case llvm::omp::OMPC_partial: + C = OMPPartialClause::CreateEmpty(Context); + break; case llvm::omp::OMPC_allocator: C = new (Context) OMPAllocatorClause(); break; @@ 
-12032,6 +12038,13 @@ void OMPClauseReader::VisitOMPSizesClause(OMPSizesClause *C) { C->setLParenLoc(Record.readSourceLocation()); } +void OMPClauseReader::VisitOMPFullClause(OMPFullClause *C) {} + +void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) { + C->setFactor(Record.readSubExpr()); + C->setLParenLoc(Record.readSourceLocation()); +} + void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) { C->setAllocator(Record.readExpr()); C->setLParenLoc(Record.readSourceLocation()); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index c1f66709c851..62d7a99b52c7 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2321,6 +2321,10 @@ void ASTStmtReader::VisitOMPTileDirective(OMPTileDirective *D) { VisitOMPLoopBasedDirective(D); } +void ASTStmtReader::VisitOMPUnrollDirective(OMPUnrollDirective *D) { + VisitOMPLoopBasedDirective(D); +} + void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); D->setHasCancel(Record.readBool()); @@ -3201,6 +3205,14 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; } + case STMT_OMP_UNROLL_DIRECTIVE: { + unsigned NumLoops = Record[ASTStmtReader::NumStmtFields]; + assert(NumLoops == 1 && "Unroll directive accepts only a single loop"); + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1]; + S = OMPUnrollDirective::CreateEmpty(Context, NumClauses); + break; + } + case STMT_OMP_FOR_DIRECTIVE: { unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields]; unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1]; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 5f038b318e26..4cdcf53775de 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6057,6 +6057,13 @@ void OMPClauseWriter::VisitOMPSizesClause(OMPSizesClause *C) { 
Record.AddSourceLocation(C->getLParenLoc()); } +void OMPClauseWriter::VisitOMPFullClause(OMPFullClause *C) {} + +void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) { + Record.AddStmt(C->getFactor()); + Record.AddSourceLocation(C->getLParenLoc()); +} + void OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) { Record.AddStmt(C->getAllocator()); Record.AddSourceLocation(C->getLParenLoc()); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 10be9b894c27..2bb5e4f3563d 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2222,6 +2222,11 @@ void ASTStmtWriter::VisitOMPTileDirective(OMPTileDirective *D) { Code = serialization::STMT_OMP_TILE_DIRECTIVE; } +void ASTStmtWriter::VisitOMPUnrollDirective(OMPUnrollDirective *D) { + VisitOMPLoopBasedDirective(D); + Code = serialization::STMT_OMP_UNROLL_DIRECTIVE; +} + void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); Record.writeBool(D->hasCancel()); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index d40161f289f8..66332d3b848c 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1297,7 +1297,8 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OMPInteropDirectiveClass: case Stmt::OMPDispatchDirectiveClass: case Stmt::OMPMaskedDirectiveClass: - case Stmt::CapturedStmtClass: { + case Stmt::CapturedStmtClass: + case Stmt::OMPUnrollDirectiveClass: { const ExplodedNode *node = Bldr.generateSink(S, Pred, Pred->getState()); Engine.addAbortedBlock(node, currBldrCtx->getBlock()); break; diff --git a/clang/test/OpenMP/unroll_ast_print.cpp b/clang/test/OpenMP/unroll_ast_print.cpp new file mode 100644 index 000000000000..63e7b1dbe6ed --- /dev/null +++ b/clang/test/OpenMP/unroll_ast_print.cpp @@ -0,0 +1,127 @@ +// Check no 
warnings/errors +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -fsyntax-only -verify %s +// expected-no-diagnostics + +// Check AST and unparsing +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -ast-dump %s | FileCheck %s --check-prefix=DUMP +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s --check-prefix=PRINT --match-full-lines + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT --match-full-lines + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +void body(...); + + +// PRINT-LABEL: void func_unroll() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll +void func_unroll() { + // PRINT: #pragma omp unroll + // DUMP: OMPUnrollDirective + #pragma omp unroll + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_full() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_full +void func_unroll_full() { + // PRINT: #pragma omp unroll full + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPFullClause + #pragma omp unroll full + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_partial() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_partial +void func_unroll_partial() { + // PRINT: #pragma omp unroll partial + // DUMP: OMPUnrollDirective + // DUMP-NEXT: 
OMPPartialClause + // DUMP-NEXT: <<>> + #pragma omp unroll partial + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_partial_factor() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_partial_factor +void func_unroll_partial_factor() { + // PRINT: #pragma omp unroll partial(4) + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 4 + // DUMP-NEXT: IntegerLiteral {{.*}} 4 + #pragma omp unroll partial(4) + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_partial_factor_for() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_partial_factor_for +void func_unroll_partial_factor_for() { + // PRINT: #pragma omp for + // DUMP: OMPForDirective + #pragma omp for + // PRINT: #pragma omp unroll partial(2) + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + #pragma omp unroll partial(2) + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: template void unroll_templated() { +// DUMP-LABEL: FunctionTemplateDecl {{.*}} unroll_templated +template +void unroll_templated() { + // PRINT: #pragma omp unroll partial(Factor) + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + // DUMP-NEXT: DeclRefExpr {{.*}} 'Factor' 'int' + #pragma omp unroll partial(Factor) + // PRINT-NEXT: for (T i = Start; i < End; i += Step) + // DUMP-NEXT: ForStmt + for (T i = Start; i < End; i += Step) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} +void unroll_template() { + unroll_templated(); +} + +#endif diff --git a/clang/test/OpenMP/unroll_codegen_factor.cpp 
b/clang/test/OpenMP/unroll_codegen_factor.cpp new file mode 100644 index 000000000000..e5280e3439c6 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_factor.cpp @@ -0,0 +1,65 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP0]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP1]], %[[TMP2]] +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[I]], 
align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP3]], i32 %[[TMP4]], i32 %[[TMP5]], i32 %[[TMP6]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP8]], %[[TMP7]] +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp unroll partial(4) + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]], ![[LOOPPROP5:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 4} +// IR: ![[LOOPPROP5]] = !{!"llvm.loop.unroll.enable"} diff --git a/clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp b/clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp new file mode 100644 index 000000000000..879db7b911db --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp @@ -0,0 +1,251 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i64, align 8 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[J:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_4:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_5:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_7:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_10:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i64, align 8 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_J:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i64, align 8 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i64, align 8 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i64, align 8 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I22:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_J23:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_J:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1_1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1_1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* 
%[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[J]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP5]], i32* %[[DOTCAPTURE_EXPR_4]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP6]], i32* %[[DOTCAPTURE_EXPR_5]], align 4 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_5]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_4]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP8]], %[[TMP9]] +// IR-NEXT: %[[SUB8:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB8]], %[[TMP10]] +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP11]] +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB9]], i32* %[[DOTCAPTURE_EXPR_7]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_7]], align 4 +// IR-NEXT: %[[ADD11:.+]] = add i32 %[[TMP12]], 1 +// IR-NEXT: store i32 %[[ADD11]], i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB13:.+]] = sub i32 %[[TMP13]], %[[TMP14]] +// IR-NEXT: %[[SUB14:.+]] = sub i32 %[[SUB13]], 1 +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD15:.+]] = add i32 %[[SUB14]], %[[TMP15]] +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* 
%[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[ADD15]], %[[TMP16]] +// IR-NEXT: %[[CONV:.+]] = zext i32 %[[DIV16]] to i64 +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[TMP17]], -1 +// IR-NEXT: %[[DIV18:.+]] = udiv i32 %[[SUB17]], 2 +// IR-NEXT: %[[CONV19:.+]] = zext i32 %[[DIV18]] to i64 +// IR-NEXT: %[[MUL:.+]] = mul nsw i64 %[[CONV]], %[[CONV19]] +// IR-NEXT: %[[SUB20:.+]] = sub nsw i64 %[[MUL]], 1 +// IR-NEXT: store i64 %[[SUB20]], i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: store i32 %[[TMP18]], i32* %[[I]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_J]], align 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP19]], %[[TMP20]] +// IR-NEXT: br i1 %[[CMP]], label %[[LAND_LHS_TRUE:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_LHS_TRUE]]: +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 0, %[[TMP21]] +// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i64 0, i64* %[[DOTOMP_LB]], align 8 +// IR-NEXT: %[[TMP22:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: store i64 %[[TMP22]], i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: store i64 1, i64* %[[DOTOMP_STRIDE]], align 8 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i64* %[[DOTOMP_LB]], i64* %[[DOTOMP_UB]], i64* %[[DOTOMP_STRIDE]], i64 1, i64 1) +// IR-NEXT: %[[TMP23:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: %[[TMP24:.+]] = 
load i64, i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: %[[CMP24:.+]] = icmp sgt i64 %[[TMP23]], %[[TMP24]] +// IR-NEXT: br i1 %[[CMP24]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP25:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP26:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i64 [ %[[TMP25]], %[[COND_TRUE]] ], [ %[[TMP26]], %[[COND_FALSE]] ] +// IR-NEXT: store i64 %[[COND]], i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: %[[TMP27:.+]] = load i64, i64* %[[DOTOMP_LB]], align 8 +// IR-NEXT: store i64 %[[TMP27]], i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP28:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP29:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: %[[CMP25:.+]] = icmp sle i64 %[[TMP28]], %[[TMP29]] +// IR-NEXT: br i1 %[[CMP25]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[CONV26:.+]] = sext i32 %[[TMP30]] to i64 +// IR-NEXT: %[[TMP31:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB27:.+]] = sub i32 %[[TMP32]], -1 +// IR-NEXT: %[[DIV28:.+]] = udiv i32 %[[SUB27]], 2 +// IR-NEXT: %[[MUL29:.+]] = mul i32 1, %[[DIV28]] +// IR-NEXT: %[[CONV30:.+]] = zext i32 %[[MUL29]] to i64 +// IR-NEXT: %[[DIV31:.+]] = sdiv i64 %[[TMP31]], %[[CONV30]] +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[CONV32:.+]] = sext i32 %[[TMP33]] to i64 +// IR-NEXT: %[[MUL33:.+]] = 
mul nsw i64 %[[DIV31]], %[[CONV32]] +// IR-NEXT: %[[ADD34:.+]] = add nsw i64 %[[CONV26]], %[[MUL33]] +// IR-NEXT: %[[CONV35:.+]] = trunc i64 %[[ADD34]] to i32 +// IR-NEXT: store i32 %[[CONV35]], i32* %[[I22]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP35:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB36:.+]] = sub i32 %[[TMP36]], -1 +// IR-NEXT: %[[DIV37:.+]] = udiv i32 %[[SUB36]], 2 +// IR-NEXT: %[[MUL38:.+]] = mul i32 1, %[[DIV37]] +// IR-NEXT: %[[CONV39:.+]] = zext i32 %[[MUL38]] to i64 +// IR-NEXT: %[[DIV40:.+]] = sdiv i64 %[[TMP35]], %[[CONV39]] +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB41:.+]] = sub i32 %[[TMP37]], -1 +// IR-NEXT: %[[DIV42:.+]] = udiv i32 %[[SUB41]], 2 +// IR-NEXT: %[[MUL43:.+]] = mul i32 1, %[[DIV42]] +// IR-NEXT: %[[CONV44:.+]] = zext i32 %[[MUL43]] to i64 +// IR-NEXT: %[[MUL45:.+]] = mul nsw i64 %[[DIV40]], %[[CONV44]] +// IR-NEXT: %[[SUB46:.+]] = sub nsw i64 %[[TMP34]], %[[MUL45]] +// IR-NEXT: %[[MUL47:.+]] = mul nsw i64 %[[SUB46]], 2 +// IR-NEXT: %[[ADD48:.+]] = add nsw i64 0, %[[MUL47]] +// IR-NEXT: %[[CONV49:.+]] = trunc i64 %[[ADD48]] to i32 +// IR-NEXT: store i32 %[[CONV49]], i32* %[[DOTUNROLLED_IV_J23]], align 4 +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTUNROLLED_IV_J23]], align 4 +// IR-NEXT: store i32 %[[TMP38]], i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[DOTUNROLLED_IV_J23]], align 4 +// IR-NEXT: %[[ADD50:.+]] = add i32 %[[TMP40]], 2 +// IR-NEXT: %[[CMP51:.+]] = icmp ule i32 %[[TMP39]], %[[ADD50]] +// IR-NEXT: br i1 %[[CMP51]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// 
IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[TMP42:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_7]], align 4 +// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP42]], 1 +// IR-NEXT: %[[CMP53:.+]] = icmp ule i32 %[[TMP41]], %[[ADD52]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP43:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP53]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP43]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_4]], align 4 +// IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[TMP46:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[MUL54:.+]] = mul i32 %[[TMP45]], %[[TMP46]] +// IR-NEXT: %[[ADD55:.+]] = add i32 %[[TMP44]], %[[MUL54]] +// IR-NEXT: store i32 %[[ADD55]], i32* %[[J]], align 4 +// IR-NEXT: %[[TMP47:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP48:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP49:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP50:.+]] = load i32, i32* %[[I22]], align 4 +// IR-NEXT: %[[TMP51:.+]] = load i32, i32* %[[J]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP47]], i32 %[[TMP48]], i32 %[[TMP49]], i32 %[[TMP50]], i32 %[[TMP51]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP52:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP52]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP53:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[ADD56:.+]] = add nsw i64 %[[TMP53]], 1 +// IR-NEXT: store i64 %[[ADD56]], i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp for collapse(2) + for (int i = start; i < end; i+=step) { + #pragma omp unroll partial + for (int j = start; j < end; j+=step) + body(start, end, step, i, j); + } +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} diff --git a/clang/test/OpenMP/unroll_codegen_for_partial.cpp b/clang/test/OpenMP/unroll_codegen_for_partial.cpp new file mode 100644 index 000000000000..9a6e480e18f4 --- /dev/null +++ 
b/clang/test/OpenMP/unroll_codegen_for_partial.cpp @@ -0,0 +1,187 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* 
%[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP4]], %[[TMP5]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP6]] +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP7]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP8]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP9]], -1 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 2 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP10]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* 
%[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 %[[TMP11]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP12]], %[[TMP13]] +// IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP14]], %[[COND_TRUE]] ], [ %[[TMP15]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP16]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP18]], 1 +// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP17]], %[[ADD14]] +// IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: 
%[[MUL:.+]] = mul i32 %[[TMP19]], 2 +// IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD16]], i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: store i32 %[[TMP20]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP22]], 2 +// IR-NEXT: %[[CMP18:.+]] = icmp ule i32 %[[TMP21]], %[[ADD17]] +// IR-NEXT: br i1 %[[CMP18]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD19:.+]] = add i32 %[[TMP24]], 1 +// IR-NEXT: %[[CMP20:.+]] = icmp ule i32 %[[TMP23]], %[[ADD19]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP25:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP20]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP25]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL21:.+]] = mul i32 %[[TMP27]], %[[TMP28]] +// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP26]], %[[MUL21]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* 
%[[I]], align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP29]], i32 %[[TMP30]], i32 %[[TMP31]], i32 %[[TMP32]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP33]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: store i32 %[[ADD23]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + int i; + #pragma omp for + #pragma omp unroll partial + for (i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} diff --git a/clang/test/OpenMP/unroll_codegen_full.cpp b/clang/test/OpenMP/unroll_codegen_full.cpp new file mode 100644 index 000000000000..cf5cac94a32b --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_full.cpp @@ -0,0 +1,52 @@ +// 
Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 7, i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP0]], 17 +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP1]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP2]], 3 +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func() { + #pragma omp unroll full + for (int i = 7; i < 17; i += 3) + body(i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.full"} diff --git a/clang/test/OpenMP/unroll_codegen_heuristic.cpp b/clang/test/OpenMP/unroll_codegen_heuristic.cpp new file mode 100644 index 000000000000..d66b53a118b6 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_heuristic.cpp @@ -0,0 +1,64 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP0]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP1]], %[[TMP2]] +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP3]], i32 %[[TMP4]], i32 %[[TMP5]], i32 %[[TMP6]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP8]], %[[TMP7]] +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp unroll + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.enable"} diff --git a/clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp b/clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp new file mode 100644 index 000000000000..cee8fc212c03 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp @@ -0,0 +1,210 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %[[END_ADDR]], i32* %[[STEP_ADDR]], i32* %[[START_ADDR]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp parallel for + #pragma omp unroll partial(7) + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + + +// IR-LABEL: @.omp_outlined.( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[DOTGLOBAL_TID__ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[DOTBOUND_TID__ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = 
alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32* %[[DOTGLOBAL_TID_:.+]], i32** %[[DOTGLOBAL_TID__ADDR]], align 8 +// IR-NEXT: store i32* %[[DOTBOUND_TID_:.+]], i32** %[[DOTBOUND_TID__ADDR]], align 8 +// IR-NEXT: store i32* %[[END:.+]], i32** %[[END_ADDR]], align 8 +// IR-NEXT: store i32* %[[STEP:.+]], i32** %[[STEP_ADDR]], align 8 +// IR-NEXT: store i32* %[[START:.+]], i32** %[[START_ADDR]], align 8 +// IR-NEXT: %[[TMP0:.+]] = load i32*, i32** %[[END_ADDR]], align 8 +// IR-NEXT: %[[TMP1:.+]] = load i32*, i32** %[[STEP_ADDR]], align 8 +// IR-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[START_ADDR]], align 8 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[TMP2]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[TMP0]], align 4 +// IR-NEXT: store i32 %[[TMP5]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[TMP1]], align 4 +// IR-NEXT: store i32 %[[TMP6]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP7]], %[[TMP8]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP9]] +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP10]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* 
%[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP11]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP12]], -6 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 7 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 %[[TMP14]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: %[[TMP15:.+]] = load i32*, i32** %[[DOTGLOBAL_TID__ADDR]], align 8 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[TMP15]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP16]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP17]], %[[TMP18]] +// IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_UB]], 
align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP19]], %[[COND_TRUE]] ], [ %[[TMP20]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP21]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP23]], 1 +// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP22]], %[[ADD14]] +// IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP24]], 7 +// IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD16]], i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: store i32 %[[TMP25]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP27]], 7 +// IR-NEXT: %[[CMP18:.+]] = icmp ule i32 %[[TMP26]], %[[ADD17]] +// IR-NEXT: br i1 %[[CMP18]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD19:.+]] = add i32 %[[TMP29]], 1 +// IR-NEXT: %[[CMP20:.+]] = icmp ule i32 
%[[TMP28]], %[[ADD19]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP30:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP20]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP30]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL21:.+]] = mul i32 %[[TMP32]], %[[TMP33]] +// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP31]], %[[MUL21]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[TMP2]], align 4 +// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[TMP0]], align 4 +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[TMP1]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP34]], i32 %[[TMP35]], i32 %[[TMP36]], i32 %[[TMP37]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP38]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP39]], 1 +// IR-NEXT: store i32 %[[ADD23]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: 
[[OMP_LOOP_EXIT]]: +// IR-NEXT: %[[TMP40:.+]] = load i32*, i32** %[[DOTGLOBAL_TID__ADDR]], align 8 +// IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[TMP40]], align 4 +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP41]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: ret void +// IR-NEXT: } + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 7} diff --git a/clang/test/OpenMP/unroll_codegen_partial.cpp b/clang/test/OpenMP/unroll_codegen_partial.cpp new file mode 100644 index 000000000000..9f18d7331269 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_partial.cpp @@ -0,0 +1,64 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP0]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP1]], %[[TMP2]] +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP3]], i32 %[[TMP4]], i32 %[[TMP5]], i32 %[[TMP6]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP8]], %[[TMP7]] +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp unroll partial + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.enable"} diff --git a/clang/test/OpenMP/unroll_codegen_tile_for.cpp b/clang/test/OpenMP/unroll_codegen_tile_for.cpp new file mode 100644 index 000000000000..4f7bc347c359 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_tile_for.cpp @@ -0,0 +1,245 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV__UNROLLED_IV_I18:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTTILE_0_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* 
%[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -1 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 2 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1 +// IR-NEXT: store i32 %[[ADD13]], i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -3 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 4 +// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1 +// IR-NEXT: store i32 %[[SUB17]], i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I]], 
align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: store i32 %[[TMP14]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: %[[CMP19:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]] +// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP17]], %[[COND_TRUE]] ], [ %[[TMP18]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP19]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD20:.+]] = add i32 
%[[TMP21]], 1 +// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 %[[TMP20]], %[[ADD20]] +// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 4 +// IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 1 +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 4 +// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[ADD23]], %[[ADD24]] +// IR-NEXT: br i1 %[[CMP25]], label %[[COND_TRUE26:.+]], label %[[COND_FALSE28:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE26]]: +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], 1 +// IR-NEXT: br label %[[COND_END30:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE28]]: +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 4 +// IR-NEXT: br label %[[COND_END30]] +// IR-EMPTY: +// IR-NEXT: [[COND_END30]]: +// IR-NEXT: %[[COND31:.+]] = phi i32 [ %[[ADD27]], %[[COND_TRUE26]] ], [ %[[ADD29]], %[[COND_FALSE28]] ] +// IR-NEXT: %[[CMP32:.+]] = icmp ult i32 %[[TMP24]], %[[COND31]] +// IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END45:.+]] +// IR-EMPTY: +// IR-NEXT: 
[[FOR_BODY]]: +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 2 +// IR-NEXT: %[[ADD34:.+]] = add i32 0, %[[MUL33]] +// IR-NEXT: store i32 %[[ADD34]], i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND35:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND35]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 2 +// IR-NEXT: %[[CMP37:.+]] = icmp ule i32 %[[TMP31]], %[[ADD36]] +// IR-NEXT: br i1 %[[CMP37]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD38:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: %[[CMP39:.+]] = icmp ule i32 %[[TMP33]], %[[ADD38]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP35:.+]] = phi i1 [ false, %[[FOR_COND35]] ], [ %[[CMP39]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP35]], label %[[FOR_BODY40:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY40]]: +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL41:.+]] = mul i32 %[[TMP37]], %[[TMP38]] +// IR-NEXT: %[[ADD42:.+]] = add i32 %[[TMP36]], %[[MUL41]] +// IR-NEXT: store i32 %[[ADD42]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[END_ADDR]], align 
4 +// IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP42:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP39]], i32 %[[TMP40]], i32 %[[TMP41]], i32 %[[TMP42]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP43]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND35]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[FOR_INC43:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC43]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[INC44:.+]] = add i32 %[[TMP44]], 1 +// IR-NEXT: store i32 %[[INC44]], i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP5:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END45]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD46:.+]] = add i32 %[[TMP45]], 1 +// IR-NEXT: store i32 %[[ADD46]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp for + #pragma omp tile sizes(4) + #pragma omp unroll 
partial + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} +// IR: ![[LOOP5]] = distinct !{![[LOOP5]], ![[LOOPPROP3]]} diff --git a/clang/test/OpenMP/unroll_codegen_unroll_for.cpp b/clang/test/OpenMP/unroll_codegen_unroll_for.cpp new file mode 100644 index 000000000000..1d82b6eb77d9 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_unroll_for.cpp @@ -0,0 +1,239 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV__UNROLLED_IV_I18:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, 
i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -1 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 2 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1 +// IR-NEXT: store i32 %[[ADD13]], i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -1 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 2 +// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1 +// IR-NEXT: store i32 %[[SUB17]], i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: store i32 0, i32* 
%[[DOTUNROLLED_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: store i32 %[[TMP14]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: %[[CMP19:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]] +// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_14]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP17]], %[[COND_TRUE]] ], [ %[[TMP18]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP19]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 
+// IR-NEXT: %[[ADD20:.+]] = add i32 %[[TMP21]], 1 +// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 %[[TMP20]], %[[ADD20]] +// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 2 +// IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 2 +// IR-NEXT: %[[CMP24:.+]] = icmp ule i32 %[[TMP24]], %[[ADD23]] +// IR-NEXT: br i1 %[[CMP24]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[ADD25:.+]] = add i32 %[[TMP27]], 1 +// IR-NEXT: %[[CMP26:.+]] = icmp ule i32 %[[TMP26]], %[[ADD25]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP28:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP26]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP28]], label %[[FOR_BODY:.+]], label %[[FOR_END41:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[MUL27:.+]] = mul i32 %[[TMP29]], 2 +// IR-NEXT: %[[ADD28:.+]] = add i32 0, %[[MUL27]] +// IR-NEXT: store i32 %[[ADD28]], i32* 
%[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND29:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND29]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[ADD30:.+]] = add i32 %[[TMP32]], 2 +// IR-NEXT: %[[CMP31:.+]] = icmp ule i32 %[[TMP31]], %[[ADD30]] +// IR-NEXT: br i1 %[[CMP31]], label %[[LAND_RHS32:.+]], label %[[LAND_END35:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS32]]: +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD33:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: %[[CMP34:.+]] = icmp ule i32 %[[TMP33]], %[[ADD33]] +// IR-NEXT: br label %[[LAND_END35]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END35]]: +// IR-NEXT: %[[TMP35:.+]] = phi i1 [ false, %[[FOR_COND29]] ], [ %[[CMP34]], %[[LAND_RHS32]] ] +// IR-NEXT: br i1 %[[TMP35]], label %[[FOR_BODY36:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY36]]: +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL37:.+]] = mul i32 %[[TMP37]], %[[TMP38]] +// IR-NEXT: %[[ADD38:.+]] = add i32 %[[TMP36]], %[[MUL37]] +// IR-NEXT: store i32 %[[ADD38]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP42:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP39]], i32 %[[TMP40]], i32 %[[TMP41]], i32 %[[TMP42]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP43]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND29]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[FOR_INC39:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC39]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[INC40:.+]] = add i32 %[[TMP44]], 1 +// IR-NEXT: store i32 %[[INC40]], i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP5:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END41]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD42:.+]] = add i32 %[[TMP45]], 1 +// IR-NEXT: store i32 %[[ADD42]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp for + #pragma omp unroll partial + #pragma omp unroll partial + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct 
!{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} +// IR: ![[LOOP5]] = distinct !{![[LOOP5]], ![[LOOPPROP3]], ![[LOOPPROP4]]} diff --git a/clang/test/OpenMP/unroll_messages.cpp b/clang/test/OpenMP/unroll_messages.cpp new file mode 100644 index 000000000000..d167ce3194fd --- /dev/null +++ b/clang/test/OpenMP/unroll_messages.cpp @@ -0,0 +1,130 @@ +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++17 -fopenmp -fopenmp-version=51 -fsyntax-only -Wuninitialized -verify %s + +void func(int n) { + // expected-error@+2 {{statement after '#pragma omp unroll' must be a for loop}} + #pragma omp unroll + func(n); + + // expected-error@+2 {{statement after '#pragma omp unroll' must be a for loop}} + #pragma omp unroll + ; + + // expected-error@+2 {{the loop condition expression depends on the current loop control variable}} + #pragma omp unroll + for (int i = 0; i < 2*(i-4); ++i) {} + + // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', '>=', or '!=') of loop variable 'i'}} + #pragma omp unroll + for (int i = 0; i/3 < 7; ++i) {} + + // expected-warning@+1 {{extra tokens at the end of '#pragma omp unroll' are ignored}} + #pragma omp unroll foo + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial( + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(4 + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(4+ + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this 
'('}} + #pragma omp unroll partial(for) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{integral constant expression must have integral or unscoped enumeration type, not 'void (int)'}} + #pragma omp unroll partial(func) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} + #pragma omp unroll partial() + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(4,4) + for (int i = 0; i < n; ++i) {} + + // expected-error@+3 {{expression is not an integral constant expression}} expected-note@+3 {{read of non-const variable 'a' is not allowed in a constant expression}} + // expected-note@+1 {{declared here}} + int a; + #pragma omp unroll partial(a) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{argument to 'partial' clause must be a strictly positive integer value}} + #pragma omp unroll partial(0) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{directive '#pragma omp unroll' cannot contain more than one 'partial' clause}} + #pragma omp unroll partial partial + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{directive '#pragma omp unroll' cannot contain more than one 'partial' clause}} + #pragma omp unroll partial(4) partial + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{directive '#pragma omp unroll' cannot contain more than one 'full' clause}} + #pragma omp unroll full full + for (int i = 0; i < 128; ++i) {} + + // expected-error@+1 {{'full' and 'partial' clause are mutually exclusive and may not appear on the same directive}} expected-note@+1 {{'partial' clause is specified here}} + #pragma omp unroll partial full + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{'partial' and 'full' clause are mutually exclusive and may not appear on the same directive}} expected-note@+1 {{'full' clause is specified here}} + #pragma omp unroll full partial + for (int i = 0; i < n; ++i) {} + + // 
expected-error@+2 {{loop to be fully unrolled must have a constant trip count}} expected-note@+1 {{'#pragma omp unroll full' directive found here}} + #pragma omp unroll full + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp for' must be a for loop}} + #pragma omp for + #pragma omp unroll + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp for' must be a for loop}} + #pragma omp for + #pragma omp unroll full + for (int i = 0; i < 128; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp unroll' must be a for loop}} + #pragma omp unroll + #pragma omp unroll + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp tile' must be a for loop}} + #pragma omp tile sizes(4) + #pragma omp unroll + for (int i = 0; i < n; ++i) {} + + // expected-error@+4 {{expected 2 for loops after '#pragma omp for', but found only 1}} + // expected-note@+1 {{as specified in 'collapse' clause}} + #pragma omp for collapse(2) + for (int i = 0; i < n; ++i) { + #pragma omp unroll full + for (int j = 0; j < 128; ++j) {} + } +} + + +template +void templated_func(int n) { + // expected-error@+1 {{argument to 'partial' clause must be a strictly positive integer value}} + #pragma omp unroll partial(Factor) + for (T i = 0; i < n; ++i) {} + + // expected-error@+2 {{loop to be fully unrolled must have a constant trip count}} expected-note@+1 {{'#pragma omp unroll full' directive found here}} + #pragma omp unroll full + for (int i = 0; i < n; i-=Factor) {} +} + +void template_inst(int n) { + // expected-note@+1 {{in instantiation of function template specialization 'templated_func' requested here}} + templated_func(n); +} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 89f7063b7364..e3d34e1cf59b 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2045,6 +2045,7 @@ public: void VisitOMPParallelDirective(const 
OMPParallelDirective *D); void VisitOMPSimdDirective(const OMPSimdDirective *D); void VisitOMPTileDirective(const OMPTileDirective *D); + void VisitOMPUnrollDirective(const OMPUnrollDirective *D); void VisitOMPForDirective(const OMPForDirective *D); void VisitOMPForSimdDirective(const OMPForSimdDirective *D); void VisitOMPSectionsDirective(const OMPSectionsDirective *D); @@ -2223,6 +2224,12 @@ void OMPClauseEnqueue::VisitOMPSizesClause(const OMPSizesClause *C) { Visitor->AddStmt(E); } +void OMPClauseEnqueue::VisitOMPFullClause(const OMPFullClause *C) {} + +void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) { + Visitor->AddStmt(C->getFactor()); +} + void OMPClauseEnqueue::VisitOMPAllocatorClause(const OMPAllocatorClause *C) { Visitor->AddStmt(C->getAllocator()); } @@ -2896,6 +2903,10 @@ void EnqueueVisitor::VisitOMPTileDirective(const OMPTileDirective *D) { VisitOMPLoopBasedDirective(D); } +void EnqueueVisitor::VisitOMPUnrollDirective(const OMPUnrollDirective *D) { + VisitOMPLoopBasedDirective(D); +} + void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) { VisitOMPLoopDirective(D); } @@ -5575,6 +5586,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OMPSimdDirective"); case CXCursor_OMPTileDirective: return cxstring::createRef("OMPTileDirective"); + case CXCursor_OMPUnrollDirective: + return cxstring::createRef("OMPUnrollDirective"); case CXCursor_OMPForDirective: return cxstring::createRef("OMPForDirective"); case CXCursor_OMPForSimdDirective: diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index 7c6b7bcd0593..6fb47300efb8 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -652,6 +652,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OMPTileDirectiveClass: K = CXCursor_OMPTileDirective; break; + case Stmt::OMPUnrollDirectiveClass: + K = CXCursor_OMPUnrollDirective; + break; case 
Stmt::OMPForDirectiveClass: K = CXCursor_OMPForDirective; break; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 4db8bcbab27b..3dc6194c7830 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -68,6 +68,8 @@ def OMPC_Private : Clause<"private"> { let flangClass = "OmpObjectList"; } def OMPC_Sizes: Clause<"sizes"> { let clangClass = "OMPSizesClause"; } +def OMPC_Full: Clause<"full"> { let clangClass = "OMPFullClause"; } +def OMPC_Partial: Clause<"partial"> { let clangClass = "OMPPartialClause"; } def OMPC_FirstPrivate : Clause<"firstprivate"> { let clangClass = "OMPFirstprivateClause"; let flangClass = "OmpObjectList"; @@ -400,6 +402,12 @@ def OMP_Tile : Directive<"tile"> { VersionedClause, ]; } +def OMP_Unroll : Directive<"unroll"> { + let allowedOnceClauses = [ + VersionedClause, + VersionedClause, + ]; +} def OMP_For : Directive<"for"> { let allowedClauses = [ VersionedClause, -- 2.34.1