From df7a8e2bc80d15de84f3be277ff3df25769ab84b Mon Sep 17 00:00:00 2001 From: Alexander Musman Date: Thu, 22 Jan 2015 08:49:35 +0000 Subject: [PATCH] =?utf8?q?Support=20=E2=80=98omp=20for=E2=80=99=20with=20s?= =?utf8?q?tatic=20chunked=20schedule=20kind.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Differential Revision: http://reviews.llvm.org/D7006 llvm-svn: 226795 --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 6 +++ clang/lib/CodeGen/CGOpenMPRuntime.h | 6 +++ clang/lib/CodeGen/CGStmtOpenMP.cpp | 92 ++++++++++++++++++++++++++++++++++- clang/lib/CodeGen/CodeGenFunction.h | 6 +++ clang/test/OpenMP/for_codegen.cpp | 61 ++++++++++++++++++++++- 5 files changed, 168 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 22ee00f..90bc5db 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -848,6 +848,12 @@ bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind, return Schedule == OMP_sch_static; } +bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const { + auto Schedule = getRuntimeSchedule(ScheduleKind, /* Chunked */ false); + assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here"); + return Schedule != OMP_sch_static; +} + void CGOpenMPRuntime::EmitOMPForInit(CodeGenFunction &CGF, SourceLocation Loc, OpenMPScheduleClauseKind ScheduleKind, unsigned IVSize, bool IVSigned, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 6daf817..5c40d53 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -320,6 +320,12 @@ public: virtual bool isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind, bool Chunked) const; + /// \brief Check if the specified \a ScheduleKind is dynamic. + /// This kind of worksharing directive needs an outer loop. 
+ /// \param ScheduleKind Schedule Kind specified in the 'schedule' clause. + /// + virtual bool isDynamic(OpenMPScheduleClauseKind ScheduleKind) const; + /// \brief Call the appropriate runtime routine to initialize it before start /// of loop. /// diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 101c3e7..60958d0 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -500,6 +500,89 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) { DI->EmitLexicalBlockEnd(Builder, S.getSourceRange().getEnd()); } +void CodeGenFunction::EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind, + const OMPLoopDirective &S, + OMPPrivateScope &LoopScope, + llvm::Value *LB, llvm::Value *UB, + llvm::Value *ST, llvm::Value *IL, + llvm::Value *Chunk) { + auto &RT = CGM.getOpenMPRuntime(); + assert(!RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) && + "static non-chunked schedule does not need outer loop"); + if (RT.isDynamic(ScheduleKind)) { + ErrorUnsupported(&S, "OpenMP loop with dynamic schedule"); + return; + } + + // Emit outer loop. + // + // OpenMP [2.7.1, Loop Construct, Description, table 2-1] + // When schedule(static, chunk_size) is specified, iterations are divided into + // chunks of size chunk_size, and the chunks are assigned to the threads in + // the team in a round-robin fashion in the order of the thread number. 
+ // + // while(UB = min(UB, GlobalUB), idx = LB, idx <= UB) { + // while (idx <= UB) { BODY; ++idx; } // inner loop + // LB = LB + ST; + // UB = UB + ST; + // } + // + const Expr *IVExpr = S.getIterationVariable(); + const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); + const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); + + RT.EmitOMPForInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, IL, + LB, UB, ST, Chunk); + auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end"); + + // Start the loop with a block that tests the condition. + auto CondBlock = createBasicBlock("omp.dispatch.cond"); + EmitBlock(CondBlock); + LoopStack.push(CondBlock); + + llvm::Value *BoolCondVal = nullptr; + // UB = min(UB, GlobalUB) + EmitIgnoredExpr(S.getEnsureUpperBound()); + // IV = LB + EmitIgnoredExpr(S.getInit()); + // IV <= UB + BoolCondVal = EvaluateExprAsBool(S.getCond(false)); + + // If there are any cleanups between here and the loop-exit scope, + // create a block to stage a loop exit along. + auto ExitBlock = LoopExit.getBlock(); + if (LoopScope.requiresCleanups()) + ExitBlock = createBasicBlock("omp.dispatch.cleanup"); + + auto LoopBody = createBasicBlock("omp.dispatch.body"); + Builder.CreateCondBr(BoolCondVal, LoopBody, ExitBlock); + if (ExitBlock != LoopExit.getBlock()) { + EmitBlock(ExitBlock); + EmitBranchThroughCleanup(LoopExit); + } + EmitBlock(LoopBody); + + // Create a block for the increment. + auto Continue = getJumpDestInCurrentScope("omp.dispatch.inc"); + BreakContinueStack.push_back(BreakContinue(LoopExit, Continue)); + + EmitOMPInnerLoop(S, LoopScope); + + EmitBlock(Continue.getBlock()); + BreakContinueStack.pop_back(); + // Emit "LB = LB + Stride", "UB = UB + Stride". + EmitIgnoredExpr(S.getNextLowerBound()); + EmitIgnoredExpr(S.getNextUpperBound()); + + EmitBranch(CondBlock); + LoopStack.pop(); + // Emit the fall-through block. + EmitBlock(LoopExit.getBlock()); + + // Tell the runtime we are done. 
+ RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind); +} + /// \brief Emit a helper variable and return corresponding lvalue. static LValue EmitOMPHelperVar(CodeGenFunction &CGF, const DeclRefExpr *Helper) { @@ -581,8 +664,13 @@ void CodeGenFunction::EmitOMPWorksharingLoop(const OMPLoopDirective &S) { EmitOMPInnerLoop(S, LoopScope); // Tell the runtime we are done. RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind); - } else - ErrorUnsupported(&S, "OpenMP loop with requested schedule"); + } else { + // Emit the outer loop, which requests its work chunk [LB..UB] from + // runtime and runs the inner loop to process it. + EmitOMPForOuterLoop(ScheduleKind, S, LoopScope, LB.getAddress(), + UB.getAddress(), ST.getAddress(), IL.getAddress(), + Chunk); + } } // We're now done with the loop, so jump to the continuation block. EmitBranch(ContBlock); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 44e01c8..ea63c84 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -27,6 +27,7 @@ #include "clang/AST/Type.h" #include "clang/Basic/ABI.h" #include "clang/Basic/CapturedStmt.h" +#include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/TargetInfo.h" #include "clang/Frontend/CodeGenOptions.h" #include "llvm/ADT/ArrayRef.h" @@ -2052,6 +2053,11 @@ private: bool SeparateIter = false); void EmitOMPSimdFinal(const OMPLoopDirective &S); void EmitOMPWorksharingLoop(const OMPLoopDirective &S); + void EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind, + const OMPLoopDirective &S, + OMPPrivateScope &LoopScope, llvm::Value *LB, + llvm::Value *UB, llvm::Value *ST, llvm::Value *IL, + llvm::Value *Chunk); public: diff --git a/clang/test/OpenMP/for_codegen.cpp b/clang/test/OpenMP/for_codegen.cpp index 757de65..badc5bd 100644 --- a/clang/test/OpenMP/for_codegen.cpp +++ b/clang/test/OpenMP/for_codegen.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s 
-fexceptions -fcxx-exceptions -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s // @@ -87,5 +87,64 @@ void static_not_chunked(float *a, float *b, float *c, float *d) { // CHECK: ret void } +// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}}) +void static_chunked(float *a, float *b, float *c, float *d) { +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]]) + #pragma omp for schedule(static, 5) +// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5) +// UB = min(UB, GlobalUB) +// CHECK: [[UB:%.+]] = load i32* [[OMP_UB]] +// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288 +// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]] +// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ] +// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]] +// CHECK-NEXT: [[LB:%.+]] = load i32* [[OMP_LB]] +// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]] + +// Outer loop header +// CHECK: [[O_IV:%.+]] = load i32* [[OMP_IV]] +// CHECK-NEXT: [[O_UB:%.+]] = load i32* [[OMP_UB]] +// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]] +// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]] + +// Loop header +// CHECK: [[O_LOOP1_BODY]] +// CHECK: [[IV:%.+]] = load i32* 
[[OMP_IV]] +// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]] +// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]] +// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]] + for (unsigned i = 131071; i <= 2147483647; i += 127) { +// CHECK: [[LOOP1_BODY]] +// Start of body: calculate i from IV: +// CHECK: [[IV1_1:%.+]] = load i32* [[OMP_IV]] +// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127 +// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]] +// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]] +// ... loop body ... +// End of body: store into a[i]: +// CHECK: store float [[RESULT:%.+]], float* {{%.+}} + a[i] = b[i] * c[i] * d[i]; +// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}} +// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1 +// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]] +// CHECK-NEXT: br label %{{.+}} + } +// CHECK: [[LOOP1_END]] +// Update the counters, adding stride +// CHECK: [[LB:%.+]] = load i32* [[OMP_LB]] +// CHECK-NEXT: [[ST:%.+]] = load i32* [[OMP_ST]] +// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]] +// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]] +// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]] +// CHECK-NEXT: [[ST:%.+]] = load i32* [[OMP_ST]] +// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]] +// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]] + +// CHECK: [[O_LOOP1_END]] +// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]]) +// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]]) +// CHECK: ret void +} + #endif // HEADER -- 2.7.4