--- /dev/null
+//===--- PerfMonitor.h --- Monitor time spent in scops --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PERF_MONITOR_H
+#define PERF_MONITOR_H
+
+#include "polly/CodeGen/IRBuilder.h"
+
+namespace llvm {
+class Function;
+class Module;
+class Value;
+class Instruction;
+} // namespace llvm
+
+namespace polly {
+
+/// Insert run-time instrumentation that measures the cycles spent in scops.
+///
+/// The monitor adds a set of global counters to a module, brackets code
+/// regions with time-stamp counter reads, and registers a reporting function
+/// with atexit() that prints the collected numbers when the program
+/// terminates. Measurements are only generated for x86_64 targets; on other
+/// targets the region markers insert nothing and the final report only states
+/// that the monitoring is not supported.
+class PerfMonitor {
+public:
+  /// Create a new performance monitor.
+  ///
+  /// @param M The module for which to generate the performance monitor.
+  PerfMonitor(llvm::Module *M);
+
+  /// Initialize the performance monitor.
+  ///
+  /// Ensure that all global variables, functions, and callbacks needed to
+  /// manage the performance monitor are initialized and registered.
+  void initialize();
+
+  /// Mark the beginning of a timing region.
+  ///
+  /// @param InsertBefore The instruction before which the timing region
+  ///                     starts.
+  void insertRegionStart(llvm::Instruction *InsertBefore);
+
+  /// Mark the end of a timing region.
+  ///
+  /// @param InsertBefore The instruction before which the timing region ends.
+  void insertRegionEnd(llvm::Instruction *InsertBefore);
+
+private:
+  /// The module the performance monitor is generated for.
+  llvm::Module *M;
+
+  /// The builder used to emit all instrumentation code.
+  PollyIRBuilder Builder;
+
+  /// Indicates if performance profiling is supported on this architecture.
+  bool Supported;
+
+  /// The cycle counter at the beginning of the program execution.
+  llvm::Value *CyclesTotalStartPtr;
+
+  /// The total number of cycles spent within scops.
+  llvm::Value *CyclesInScopsPtr;
+
+  /// The value of the cycle counter at the beginning of the last scop.
+  llvm::Value *CyclesInScopStartPtr;
+
+  /// A memory location which serves as argument of the RDTSCP function.
+  ///
+  /// The value written to this location is currently not used.
+  llvm::Value *RDTSCPWriteLocation;
+
+  /// A global variable, that keeps track if the performance monitor
+  /// initialization has already been run.
+  llvm::Value *AlreadyInitializedPtr;
+
+  /// Create function "__polly_perf_init".
+  ///
+  /// The generated function returns early if the initialization has already
+  /// been run. Otherwise it marks the monitor as initialized, registers
+  /// @p FinalReporting with atexit() and, if profiling is supported, stores
+  /// the current cycle counter as the start of the program execution.
+  ///
+  /// @param FinalReporting The function to register with atexit().
+  ///
+  /// @returns The newly created initialization function.
+  llvm::Function *insertInitFunction(llvm::Function *FinalReporting);
+
+  /// Add Function @p Fn to list of global constructors
+  ///
+  /// If no global constructors are available in this current module, insert
+  /// a new list of global constructors containing @p Fn as only global
+  /// constructor. Otherwise, append @p Fn to the list of global constructors.
+  ///
+  /// All functions listed as global constructors are executed before the
+  /// main() function is called.
+  ///
+  /// @param Fn Function to add to global constructors
+  void addToGlobalConstructors(llvm::Function *Fn);
+
+  /// Add global variables to module.
+  ///
+  /// Insert a set of global variables that are used to track performance,
+  /// into the module (or obtain references to them if they already exist).
+  void addGlobalVariables();
+
+  /// Get a reference to the intrinsic "i64 @llvm.x86.rdtscp(i8*)".
+  ///
+  /// The rdtscp function returns the current value of the processor's
+  /// time-stamp counter as well as the current CPU identifier. On modern x86
+  /// systems, the returned value is independent of the dynamic clock frequency
+  /// and consistent across multiple cores. It can consequently be used to get
+  /// accurate and low-overhead timing information. Even though the counter is
+  /// wrapping, it can be reliably used even for measuring longer time
+  /// intervals, as on a 1 GHz processor the 64-bit counter only wraps roughly
+  /// every 584 years.
+  ///
+  /// The RDTSCP instruction is "pseudo" serializing:
+  ///
+  /// "The RDTSCP instruction waits until all previous instructions have been
+  /// executed before reading the counter. However, subsequent instructions may
+  /// begin execution before the read operation is performed."
+  ///
+  /// To ensure that no later instructions are scheduled before the RDTSCP
+  /// instruction it is often recommended to schedule a cpuid call after the
+  /// RDTSCP instruction. We do not do this yet, trading some imprecision in
+  /// our timing for a reduced overhead in our timing.
+  ///
+  /// @returns A reference to the declaration of @llvm.x86.rdtscp.
+  llvm::Function *getRDTSCP();
+
+  /// Get a reference to "int atexit(void (*function)(void))" function.
+  ///
+  /// This function allows to register function pointers that must be executed
+  /// when the program is terminated.
+  ///
+  /// @returns A reference to @atexit().
+  llvm::Function *getAtExit();
+
+  /// Create function "__polly_perf_final".
+  ///
+  /// This function finalizes the performance measurements and prints the
+  /// results to stdout. It is expected to be registered with 'atexit()'.
+  ///
+  /// @returns The newly created reporting function.
+  llvm::Function *insertFinalReporting();
+};
+} // namespace polly
+
+#endif
CodeGen/Utils.cpp
CodeGen/RuntimeDebugBuilder.cpp
CodeGen/CodegenCleanup.cpp
+ CodeGen/PerfMonitor.cpp
${GPGPU_CODEGEN_FILES}
Exchange/JSONExporter.cpp
Support/GICHelper.cpp
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
+#include "polly/CodeGen/PerfMonitor.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::cat(PollyCategory));
+// Hidden command-line switch, off by default. When it is set, code generation
+// wraps the generated region in PerfMonitor start/end markers.
+static cl::opt<bool>
+ PerfMonitoring("polly-codegen-perf-monitoring",
+ cl::desc("Add run-time performance monitoring"), cl::Hidden,
+ cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
namespace {
class CodeGeneration : public ScopPass {
public:
IslNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, S,
StartBlock);
+ // Optionally instrument the code with run-time cycle counting.
+ if (PerfMonitoring) {
+ PerfMonitor P(EnteringBB->getParent()->getParent());
+ P.initialize();
+ // The timed region starts right before the terminator of the split block.
+ P.insertRegionStart(SplitBlock->getTerminator());
+
+ // NOTE(review): this walks first successor -> unique successor -> unique
+ // successor and treats the result as the merge block. It assumes each hop
+ // has exactly one successor; getUniqueSuccessor() yields null otherwise and
+ // the chain would dereference it -- confirm the CFG shape is guaranteed.
+ BasicBlock *MergeBlock = SplitBlock->getTerminator()
+ ->getSuccessor(0)
+ ->getUniqueSuccessor()
+ ->getUniqueSuccessor();
+ // The timed region ends right before the terminator of the merge block.
+ P.insertRegionEnd(MergeBlock->getTerminator());
+ }
+
// First generate code for the hoisted invariant loads and transitively the
// parameters they reference. Afterwards, for the remaining parameters that
// might reference the hoisted loads. Finally, build the runtime check
--- /dev/null
+//===------ PerfMonitor.cpp - Generate a run-time performance monitor. -======//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/CodeGen/PerfMonitor.h"
+#include "polly/CodeGen/RuntimeDebugBuilder.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+using namespace polly;
+
+/// Return the module's declaration of atexit(), adding it on demand.
+///
+/// A missing declaration is created with external linkage and the type
+/// "i32 (i8*)".
+Function *PerfMonitor::getAtExit() {
+  const char *Name = "atexit";
+
+  // Reuse a declaration that is already part of the module.
+  if (Function *Existing = M->getFunction(Name))
+    return Existing;
+
+  FunctionType *AtExitTy = FunctionType::get(
+      Builder.getInt32Ty(), {Builder.getInt8PtrTy()}, /*isVarArg=*/false);
+  return Function::Create(AtExitTy, Function::ExternalLinkage, Name, M);
+}
+
+/// Append @p Fn to the module's "llvm.global_ctors" list.
+///
+/// Entries of an already existing list are carried over into a freshly
+/// created array, the old global is erased, and @p Fn is added as last
+/// element with priority 10 and a null associated-data pointer.
+void PerfMonitor::addToGlobalConstructors(Function *Fn) {
+  const char *Name = "llvm.global_ctors";
+  std::vector<Constant *> Ctors;
+
+  // Collect the constructors registered so far. The array type encodes the
+  // element count, so the old global cannot be extended in place.
+  if (GlobalVariable *OldList = M->getGlobalVariable(Name)) {
+    Constant *OldEntries = OldList->getInitializer();
+    for (Value *Entry : OldEntries->operand_values())
+      Ctors.push_back(cast<Constant>(Entry));
+    OldList->eraseFromParent();
+  }
+
+  // Each list element is a { priority, function, associated data } triple.
+  auto *Int8PtrTy = Builder.getInt8PtrTy();
+  StructType *EntryTy = StructType::get(Builder.getInt32Ty(), Fn->getType(),
+                                        Int8PtrTy, nullptr);
+  Constant *NewEntry =
+      ConstantStruct::get(EntryTy, Builder.getInt32(10), Fn,
+                          ConstantPointerNull::get(Int8PtrTy), nullptr);
+  Ctors.push_back(NewEntry);
+
+  // The module takes ownership of the newly created global.
+  ArrayType *ListTy = ArrayType::get(EntryTy, Ctors.size());
+  new GlobalVariable(*M, ListTy, true, GlobalValue::AppendingLinkage,
+                     ConstantArray::get(ListTy, Ctors), Name, nullptr,
+                     GlobalVariable::NotThreadLocal);
+}
+
+/// Return the module's declaration of "llvm.x86.rdtscp", adding it on demand.
+///
+/// A missing declaration is created with external linkage and the type
+/// "i64 (i8*)".
+Function *PerfMonitor::getRDTSCP() {
+  const char *Name = "llvm.x86.rdtscp";
+
+  // Reuse a declaration that is already part of the module.
+  if (Function *Existing = M->getFunction(Name))
+    return Existing;
+
+  FunctionType *RDTSCPTy = FunctionType::get(
+      Builder.getInt64Ty(), {Builder.getInt8PtrTy()}, /*isVarArg=*/false);
+  return Function::Create(RDTSCPTy, Function::ExternalLinkage, Name, M);
+}
+
+/// Create a performance monitor for module @p M.
+///
+/// Profiling is marked as supported only if the target triple of @p M names
+/// the x86_64 architecture. The pointers to the tracking globals start out as
+/// null instead of indeterminate values; they are filled in by
+/// addGlobalVariables(), which initialize() calls.
+///
+/// @param M The module for which to generate the performance monitor.
+PerfMonitor::PerfMonitor(Module *M)
+    : M(M), Builder(M->getContext()),
+      Supported(Triple(M->getTargetTriple()).getArch() ==
+                llvm::Triple::x86_64),
+      CyclesTotalStartPtr(nullptr), CyclesInScopsPtr(nullptr),
+      CyclesInScopStartPtr(nullptr), RDTSCPWriteLocation(nullptr),
+      AlreadyInitializedPtr(nullptr) {}
+
+/// Look up or create the globals that hold the monitor's run-time state.
+///
+/// For every tracked value the module is searched for a global of the
+/// expected name. If none exists, a weak, initial-exec thread-local global
+/// with the listed initial value is created. The resulting global is stored
+/// in the corresponding member pointer.
+void PerfMonitor::addGlobalVariables() {
+  struct GlobalDesc {
+    const char *Name;
+    Constant *InitialValue;
+    Value **Location;
+  };
+
+  const GlobalDesc Globals[] = {
+      {"__polly_perf_cycles_total_start", Builder.getInt64(0),
+       &CyclesTotalStartPtr},
+      {"__polly_perf_initialized", Builder.getInt1(false),
+       &AlreadyInitializedPtr},
+      {"__polly_perf_cycles_in_scops", Builder.getInt64(0), &CyclesInScopsPtr},
+      {"__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
+       &CyclesInScopStartPtr},
+      {"__polly_perf_write_loation", Builder.getInt32(0),
+       &RDTSCPWriteLocation},
+  };
+
+  for (const GlobalDesc &G : Globals) {
+    GlobalVariable *GV = M->getGlobalVariable(G.Name);
+
+    // NOTE(review): the third argument marks the global as constant, yet the
+    // instrumentation emitted by this class stores to these globals --
+    // confirm that this is intended.
+    if (!GV)
+      GV = new GlobalVariable(*M, G.InitialValue->getType(), true,
+                              GlobalValue::WeakAnyLinkage, G.InitialValue,
+                              G.Name, nullptr,
+                              GlobalVariable::InitialExecTLSModel);
+
+    *G.Location = GV;
+  }
+}
+
+// Symbol names of the two functions this file generates: the initializer that
+// is added to the global constructors and the reporting function that is
+// registered with atexit().
+static const char *InitFunctionName = "__polly_perf_init";
+static const char *FinalReportingFunctionName = "__polly_perf_final";
+
+/// Emit the function that prints the collected timing information.
+///
+/// The function has weak ODR linkage, takes no arguments and returns void. If
+/// profiling is supported it reads the cycle counter, derives the total
+/// number of elapsed cycles, and prints it together with the cycles spent in
+/// scops. Otherwise it only prints that the feature is not supported.
+///
+/// @returns The newly created function.
+Function *PerfMonitor::insertFinalReporting() {
+  // Create the function and position the builder in its only basic block.
+  FunctionType *ReportTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+  Function *ReportFn = Function::Create(ReportTy, Function::WeakODRLinkage,
+                                        FinalReportingFunctionName, M);
+  Builder.SetInsertPoint(
+      BasicBlock::Create(M->getContext(), "start", ReportFn));
+
+  if (Supported) {
+    // Total cycles = counter now - counter stored by the init function.
+    Function *ReadCounter = getRDTSCP();
+    Value *Now = Builder.CreateCall(
+        ReadCounter, Builder.CreatePointerCast(RDTSCPWriteLocation,
+                                               Builder.getInt8PtrTy()));
+    Value *ProgramStart = Builder.CreateLoad(CyclesTotalStartPtr, true);
+    Value *TotalCycles = Builder.CreateSub(Now, ProgramStart);
+    Value *ScopCycles = Builder.CreateLoad(CyclesInScopsPtr, true);
+
+    // Emit the report.
+    RuntimeDebugBuilder::createCPUPrinter(Builder,
+                                          "Polly runtime information\n");
+    RuntimeDebugBuilder::createCPUPrinter(Builder,
+                                          "-------------------------\n");
+    RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", TotalCycles,
+                                          "\n");
+    RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", ScopCycles,
+                                          "\n");
+  } else {
+    RuntimeDebugBuilder::createCPUPrinter(
+        Builder, "Polly runtime information generation not supported\n");
+  }
+
+  Builder.CreateRetVoid();
+  return ReportFn;
+}
+
+/// Set up everything the monitor needs inside the module.
+///
+/// The member pointers to the tracking globals are always refreshed. The
+/// reporting and initialization functions are only generated, and the
+/// initializer only added to the global constructors, if the module does not
+/// contain the initialization function yet.
+void PerfMonitor::initialize() {
+  addGlobalVariables();
+
+  // The init function already exists: the module has been set up before.
+  if (M->getFunction(InitFunctionName))
+    return;
+
+  Function *ReportFn = insertFinalReporting();
+  Function *SetupFn = insertInitFunction(ReportFn);
+  addToGlobalConstructors(SetupFn);
+}
+
+/// Emit the function that initializes the performance monitor at run time.
+///
+/// The function has weak ODR linkage, takes no arguments and returns void. On
+/// its first execution it sets the "already initialized" flag, registers
+/// @p FinalReporting with atexit() and, if profiling is supported, stores the
+/// current cycle counter as program start. Later executions return
+/// immediately.
+///
+/// @param FinalReporting The function to register with atexit().
+///
+/// @returns The newly created function.
+Function *PerfMonitor::insertInitFunction(Function *FinalReporting) {
+  // Create the function together with its three basic blocks.
+  auto &Ctx = M->getContext();
+  FunctionType *InitTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+  Function *InitFn =
+      Function::Create(InitTy, Function::WeakODRLinkage, InitFunctionName, M);
+  BasicBlock *EntryBB = BasicBlock::Create(Ctx, "start", InitFn);
+  BasicBlock *EarlyReturnBB = BasicBlock::Create(Ctx, "earlyreturn", InitFn);
+  BasicBlock *InitBB = BasicBlock::Create(Ctx, "initbb", InitFn);
+
+  // Return right away if this function has been run before.
+  //
+  // If profiling has been enabled in multiple translation units, each of them
+  // adds the initializer to its global constructors list. When translation
+  // units are merged these lists are just appended, so the initializer shows
+  // up several times. To avoid repeated initialization (and especially to
+  // avoid registering the reporting function with atexit() more than once),
+  // we bail out if the initializer is run more than once.
+  Builder.SetInsertPoint(EntryBB);
+  Value *AlreadyRun = Builder.CreateLoad(AlreadyInitializedPtr);
+  Builder.CreateCondBr(AlreadyRun, EarlyReturnBB, InitBB);
+
+  Builder.SetInsertPoint(EarlyReturnBB);
+  Builder.CreateRetVoid();
+
+  // Remember that the initialization has been performed.
+  Builder.SetInsertPoint(InitBB);
+  Builder.CreateStore(Builder.getInt1(true), AlreadyInitializedPtr);
+
+  // Register the final reporting function with atexit().
+  Value *ReportFnPtr =
+      Builder.CreatePointerCast(FinalReporting, Builder.getInt8PtrTy());
+  Function *AtExit = getAtExit();
+  Builder.CreateCall(AtExit, {ReportFnPtr});
+
+  if (Supported) {
+    // Read the current cycle counter and store the result for later.
+    Function *ReadCounter = getRDTSCP();
+    Value *Now = Builder.CreateCall(
+        ReadCounter, Builder.CreatePointerCast(RDTSCPWriteLocation,
+                                               Builder.getInt8PtrTy()));
+    Builder.CreateStore(Now, CyclesTotalStartPtr, true);
+  }
+  Builder.CreateRetVoid();
+
+  return InitFn;
+}
+
+/// Emit code that records the cycle counter at the start of a timed region.
+///
+/// Nothing is emitted if profiling is not supported for the target.
+///
+/// @param InsertBefore The instruction in front of which the code is placed.
+void PerfMonitor::insertRegionStart(Instruction *InsertBefore) {
+  if (Supported) {
+    Builder.SetInsertPoint(InsertBefore);
+
+    // Remember the current counter value; insertRegionEnd() reads it back.
+    Function *ReadCounter = getRDTSCP();
+    Value *Now = Builder.CreateCall(
+        ReadCounter, Builder.CreatePointerCast(RDTSCPWriteLocation,
+                                               Builder.getInt8PtrTy()));
+    Builder.CreateStore(Now, CyclesInScopStartPtr, true);
+  }
+}
+
+/// Emit code that closes a timed region and accumulates its cycle count.
+///
+/// The cycles elapsed since the matching region start are added to the total
+/// number of cycles spent in scops. Nothing is emitted if profiling is not
+/// supported for the target.
+///
+/// @param InsertBefore The instruction in front of which the code is placed.
+void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
+  if (Supported) {
+    Builder.SetInsertPoint(InsertBefore);
+    Function *ReadCounter = getRDTSCP();
+
+    // Cycles spent in this region = counter now - counter at region start.
+    Value *RegionStart = Builder.CreateLoad(CyclesInScopStartPtr, true);
+    Value *Now = Builder.CreateCall(
+        ReadCounter, Builder.CreatePointerCast(RDTSCPWriteLocation,
+                                               Builder.getInt8PtrTy()));
+    Value *RegionCycles = Builder.CreateSub(Now, RegionStart);
+
+    // Add them to the running total.
+    Value *TotalSoFar = Builder.CreateLoad(CyclesInScopsPtr, true);
+    Value *NewTotal = Builder.CreateAdd(TotalSoFar, RegionCycles);
+    Builder.CreateStore(NewTotal, CyclesInScopsPtr, true);
+  }
+}
--- /dev/null
+; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: -S < %s | FileCheck %s
+
+; void f(long A[], long N) {
+; long i;
+; if (true)
+; for (i = 0; i < N; ++i)
+; A[i] = i;
+; }
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test input: a single counted loop that stores the induction variable to
+; A[indvar], preceded and followed by a fence.
+define void @f(i64* %A, i64 %N) nounwind {
+entry:
+ fence seq_cst
+ br label %next
+
+next:
+ br i1 true, label %for.i, label %return
+
+; Loop body: A[indvar] = indvar; leaves the loop once indvar + 1 == N.
+for.i:
+ %indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
+ %scevgep = getelementptr i64, i64* %A, i64 %indvar
+ store i64 %indvar, i64* %scevgep
+ %indvar.next = add nsw i64 %indvar, 1
+ %exitcond = icmp eq i64 %indvar.next, %N
+ br i1 %exitcond, label %return, label %for.i
+
+return:
+ fence seq_cst
+ ret void
+}
+
+; CHECK: @__polly_perf_cycles_total_start = weak thread_local(initialexec) constant i64 0
+; CHECK-NEXT: @__polly_perf_initialized = weak thread_local(initialexec) constant i1 false
+; CHECK-NEXT: @__polly_perf_cycles_in_scops = weak thread_local(initialexec) constant i64 0
+; CHECK-NEXT: @__polly_perf_cycles_in_scop_start = weak thread_local(initialexec) constant i64 0
+; CHECK-NEXT: @__polly_perf_write_loation = weak thread_local(initialexec) constant i32 0
+
+; CHECK: polly.split_new_and_old: ; preds = %entry
+; CHECK-NEXT: %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT: store volatile i64 %0, i64* @__polly_perf_cycles_in_scop_start
+
+; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
+; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
+; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT: %7 = sub i64 %6, %5
+; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
+; CHECK-NEXT: %9 = add i64 %8, %7
+; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
+; CHECK-NEXT: br label %return
+
+
+; CHECK: define weak_odr void @__polly_perf_final() {
+; CHECK-NEXT: start:
+; CHECK-NEXT: %0 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT: %1 = load volatile i64, i64* @__polly_perf_cycles_total_start
+; CHECK-NEXT: %2 = sub i64 %0, %1
+; CHECK-NEXT: %3 = load volatile i64, i64* @__polly_perf_cycles_in_scops
+; CHECK-NEXT: %4 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @1, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @0, i32 0, i32 0))
+; CHECK-NEXT: %5 = call i32 @fflush(i8* null)
+; CHECK-NEXT: %6 = call i32 (...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @3, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([27 x i8], [27 x i8] addrspace(4)* @2, i32 0, i32 0))
+; CHECK-NEXT: %7 = call i32 @fflush(i8* null)
+; CHECK-NEXT: %8 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @6, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @4, i32 0, i32 0), i64 %2, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @5, i32 0, i32 0))
+; CHECK-NEXT: %9 = call i32 @fflush(i8* null)
+; CHECK-NEXT: %10 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0))
+; CHECK-NEXT: %11 = call i32 @fflush(i8* null)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+
+; CHECK: define weak_odr void @__polly_perf_init() {
+; CHECK-NEXT: start:
+; CHECK-NEXT: %0 = load i1, i1* @__polly_perf_initialized
+; CHECK-NEXT: br i1 %0, label %earlyreturn, label %initbb
+
+; CHECK: earlyreturn: ; preds = %start
+; CHECK-NEXT: ret void
+
+; CHECK: initbb: ; preds = %start
+; CHECK-NEXT: store i1 true, i1* @__polly_perf_initialized
+; CHECK-NEXT: %1 = call i32 @atexit(i8* bitcast (void ()* @__polly_perf_final to i8*))
+; CHECK-NEXT: %2 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
+; CHECK-NEXT: store volatile i64 %2, i64* @__polly_perf_cycles_total_start
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }