[llvm-profdata] Speed up merging by using a thread pool

author Vedant Kumar <vsk@apple.com>

Mon, 18 Jul 2016 22:02:39 +0000 (22:02 +0000)

committer Vedant Kumar <vsk@apple.com>

Mon, 18 Jul 2016 22:02:39 +0000 (22:02 +0000)
author Vedant Kumar <vsk@apple.com>
Mon, 18 Jul 2016 22:02:39 +0000 (22:02 +0000)
committer Vedant Kumar <vsk@apple.com>
Mon, 18 Jul 2016 22:02:39 +0000 (22:02 +0000)
diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst

index f5508b5..bae0ff7 100644 (file)
--- a/llvm/docs/CommandGuide/llvm-profdata.rst
+++ b/llvm/docs/CommandGuide/llvm-profdata.rst
@@ -106,6 +106,11 @@ OPTIONS
   conjunction with -instr. Defaults to false, since it can inhibit compiler
   optimization during PGO.
  
+.. option:: -num-threads=N, -j=N
+
+ Use N threads to perform profile merging. When N=0, llvm-profdata auto-detects
+ an appropriate number of threads to use. This is the default.
+
  EXAMPLES
  ^^^^^^^^
  Basic Usage
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h

index 7d29273..f7780fb 100644 (file)
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -47,6 +47,8 @@ public:
    /// for this function and the hash and number of counts match, each counter is
    /// summed. Optionally scale counts by \p Weight.
    Error addRecord(InstrProfRecord &&I, uint64_t Weight = 1);
+  /// Merge existing function counts from the given writer.
+  Error mergeRecordsFromWriter(InstrProfWriter &&IPW);
    /// Write the profile to \c OS
    void write(raw_fd_ostream &OS);
    /// Write the profile in text format to \c OS
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp

index e25299e..7fabcdb 100644 (file)
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -182,6 +182,14 @@ Error InstrProfWriter::addRecord(InstrProfRecord &&I, uint64_t Weight) {
    return Dest.takeError();
  }
  
+/// Fold every function record held by \p IPW into this writer.
+///
+/// Each record is moved out of \p IPW and passed to addRecord() with a weight
+/// of 1. Merging stops at the first record addRecord() rejects and that error
+/// is returned; otherwise the result is success.
+Error InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW) {
+  for (auto &NameEntry : IPW.FunctionData) {
+    for (auto &HashEntry : NameEntry.getValue()) {
+      Error MergeErr = addRecord(std::move(HashEntry.second), 1);
+      if (MergeErr)
+        return MergeErr;
+    }
+  }
+  return Error::success();
+}
+
  bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
    if (!Sparse)
      return true;
diff --git a/llvm/test/tools/llvm-profdata/multiple-inputs.test b/llvm/test/tools/llvm-profdata/multiple-inputs.test

index 40d1108..399438a 100644 (file)
--- a/llvm/test/tools/llvm-profdata/multiple-inputs.test
+++ b/llvm/test/tools/llvm-profdata/multiple-inputs.test
@@ -51,3 +51,43 @@ DISJOINT-2: Block counts: [2, 3]
  DISJOINT: Total functions: 2
  DISJOINT: Maximum function count: 1
  DISJOINT: Maximum internal block count: 3
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     -num-threads 2 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     -j 3 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
+FOO4: foo:
+FOO4: Counters: 3
+FOO4: Function count: 4
+FOO4: Block counts: [8, 12]
+FOO4: Total functions: 1
+FOO4: Maximum function count: 4
+FOO4: Maximum internal block count: 12
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext -j 2 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext -j 3 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
+RUN:                     %p/Inputs/foo3-1.proftext -j 1 -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
+FOO5: foo:
+FOO5: Counters: 3
+FOO5: Function count: 5
+FOO5: Block counts: [10, 15]
+FOO5: Total functions: 1
+FOO5: Maximum function count: 5
+FOO5: Maximum internal block count: 15
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp

index 8e4b4c3..e57b19f 100644 (file)
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -29,6 +29,7 @@
  #include "llvm/Support/Path.h"
  #include "llvm/Support/PrettyStackTrace.h"
  #include "llvm/Support/Signals.h"
+#include "llvm/Support/ThreadPool.h"
  #include "llvm/Support/raw_ostream.h"
  #include <algorithm>
  
@@ -117,9 +118,68 @@ struct WeightedFile {
  };
  typedef SmallVector<WeightedFile, 5> WeightedFileVector;
  
+/// Merge state for one worker: the writer that accumulates records, plus any
+/// hard error recorded while filling it.
+struct WriterContext {
+  /// Held by loadInput() for the duration of a load into this context.
+  std::mutex Lock;
+  /// Accumulates the records loaded into this context.
+  InstrProfWriter Writer;
+  /// Deferred hard error; success until a load or merge stores a failure.
+  Error Err;
+  /// Name reported alongside \c Err (loadInput() stores the input filename).
+  StringRef ErrWhence;
+  /// Lock guarding \c WriterErrorCodes; supplied by the creator.
+  std::mutex &ErrLock;
+  /// Error codes already reported; supplied by the creator.
+  SmallSet<instrprof_error, 4> &WriterErrorCodes;
+
+  WriterContext(bool IsSparse, std::mutex &SharedErrLock,
+                SmallSet<instrprof_error, 4> &SharedErrorCodes)
+      : Writer(IsSparse), Err(Error::success()), ErrWhence(""),
+        ErrLock(SharedErrLock), WriterErrorCodes(SharedErrorCodes) {}
+};
+
+/// Load an input into a writer context.
+///
+/// Reads the profile named by \p Input.Filename and adds each of its records
+/// to \p WC->Writer, scaled by \p Input.Weight. \p WC->Lock is held for the
+/// whole call, so loads into the same context run one at a time.
+///
+/// Hard failures (reader creation, mixing IR-level and Clang-generated
+/// profiles, a reader error after iteration) are stored in \p WC->Err instead
+/// of exiting, with \p WC->ErrWhence set to the input's filename. Once a
+/// context holds such an error, later calls on it return without loading.
+/// Per-record addRecord() failures are reported via handleMergeWriterError()
+/// and do not stop the load.
+static void loadInput(const WeightedFile &Input, WriterContext *WC) {
+  std::unique_lock<std::mutex> CtxGuard{WC->Lock};
+
+  // If there's a pending hard error, don't do more work.
+  if (WC->Err)
+    return;
+
+  // Record the filename up front so any hard error below is attributed to
+  // this input.
+  WC->ErrWhence = Input.Filename;
+
+  auto ReaderOrErr = InstrProfReader::create(Input.Filename);
+  if ((WC->Err = ReaderOrErr.takeError()))
+    return;
+
+  auto Reader = std::move(ReaderOrErr.get());
+  bool IsIRProfile = Reader->isIRLevelProfile();
+  if (WC->Writer.setIsIRLevelProfile(IsIRProfile)) {
+    WC->Err = make_error<StringError>(
+        "Merge IR generated profile with Clang generated profile.",
+        std::error_code());
+    return;
+  }
+
+  for (auto &I : *Reader) {
+    if (Error E = WC->Writer.addRecord(std::move(I), Input.Weight)) {
+      // Only show hint the first time an error occurs.
+      instrprof_error IPE = InstrProfError::take(std::move(E));
+      // The set of seen error codes is reached through a reference held by
+      // the context, so it is updated under the accompanying ErrLock.
+      std::unique_lock<std::mutex> ErrGuard{WC->ErrLock};
+      bool firstTime = WC->WriterErrorCodes.insert(IPE).second;
+      // NOTE(review): I was handed to addRecord() via std::move above; this
+      // assumes I.Name is still readable afterwards -- confirm.
+      handleMergeWriterError(make_error<InstrProfError>(IPE), Input.Filename,
+                             I.Name, firstTime);
+    }
+  }
+  // A reader error after iteration is a hard error: defer it to the caller.
+  if (Reader->hasError())
+    WC->Err = Reader->getError();
+}
+
+/// Merge the \p Src writer context into \p Dst.
+///
+/// Moves the records out of \p Src->Writer into \p Dst->Writer. A failure
+/// reported by the merge is stored in \p Dst->Err; \p Src->Err is left
+/// untouched so the caller can still find it on \p Src.
+static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) {
+  // A merge round may queue more than one task with the same destination:
+  // with an odd number of contexts the leftover context is merged into
+  // Contexts[0] alongside the regular pairing. Serialize those tasks on the
+  // destination's lock so its writer is never mutated concurrently.
+  std::unique_lock<std::mutex> DstGuard{Dst->Lock};
+
+  // If there's a pending hard error, don't do more work. This matches
+  // loadInput() and keeps the pending error from being assigned over.
+  if (Dst->Err)
+    return;
+
+  if (Error E = Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer)))
+    Dst->Err = std::move(E);
+}
+
  static void mergeInstrProfile(const WeightedFileVector &Inputs,
                                StringRef OutputFilename,
-                              ProfileFormat OutputFormat, bool OutputSparse) {
+                              ProfileFormat OutputFormat, bool OutputSparse,
+                              unsigned NumThreads) {
    if (OutputFilename.compare("-") == 0)
      exitWithError("Cannot write indexed profdata format to stdout.");
  
@@ -131,30 +191,57 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
    if (EC)
      exitWithErrorCode(EC, OutputFilename);
  
-  InstrProfWriter Writer(OutputSparse);
+  std::mutex ErrorLock;
    SmallSet<instrprof_error, 4> WriterErrorCodes;
-  for (const auto &Input : Inputs) {
-    auto ReaderOrErr = InstrProfReader::create(Input.Filename);
-    if (Error E = ReaderOrErr.takeError())
-      exitWithError(std::move(E), Input.Filename);
-
-    auto Reader = std::move(ReaderOrErr.get());
-    bool IsIRProfile = Reader->isIRLevelProfile();
-    if (Writer.setIsIRLevelProfile(IsIRProfile))
-      exitWithError("Merge IR generated profile with Clang generated profile.");
-
-    for (auto &I : *Reader) {
-      if (Error E = Writer.addRecord(std::move(I), Input.Weight)) {
-        // Only show hint the first time an error occurs.
-        instrprof_error IPE = InstrProfError::take(std::move(E));
-        bool firstTime = WriterErrorCodes.insert(IPE).second;
-        handleMergeWriterError(make_error<InstrProfError>(IPE), Input.Filename,
-                               I.Name, firstTime);
-      }
+
+  // If NumThreads is not specified, auto-detect a good default.
+  if (NumThreads == 0)
+    NumThreads = std::max(1U, std::min(std::thread::hardware_concurrency(),
+                                       unsigned(Inputs.size() / 2)));
+
+  // Initialize the writer contexts.
+  SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
+  for (unsigned I = 0; I < NumThreads; ++I)
+    Contexts.emplace_back(llvm::make_unique<WriterContext>(
+        OutputSparse, ErrorLock, WriterErrorCodes));
+
+  if (NumThreads == 1) {
+    for (const auto &Input : Inputs)
+      loadInput(Input, Contexts[0].get());
+  } else {
+    ThreadPool Pool(NumThreads);
+
+    // Load the inputs in parallel (N/NumThreads serial steps).
+    unsigned Ctx = 0;
+    for (const auto &Input : Inputs) {
+      Pool.async(loadInput, Input, Contexts[Ctx].get());
+      Ctx = (Ctx + 1) % NumThreads;
      }
-    if (Reader->hasError())
-      exitWithError(Reader->getError(), Input.Filename);
+    Pool.wait();
+
+    // Merge the writer contexts together (lg(NumThreads) serial steps).
+    unsigned Mid = Contexts.size() / 2;
+    unsigned End = Contexts.size();
+    assert(Mid > 0 && "Expected more than one context");
+    do {
+      for (unsigned I = 0; I < Mid; ++I)
+        Pool.async(mergeWriterContexts, Contexts[I].get(),
+                   Contexts[I + Mid].get());
+      if (End & 1)
+        Pool.async(mergeWriterContexts, Contexts[0].get(),
+                   Contexts[End - 1].get());
+      Pool.wait();
+      End = Mid;
+      Mid /= 2;
+    } while (Mid > 0);
    }
+
+  // Handle deferred hard errors encountered during merging.
+  for (std::unique_ptr<WriterContext> &WC : Contexts)
+    if (WC->Err)
+      exitWithError(std::move(WC->Err), WC->ErrWhence);
+
+  InstrProfWriter &Writer = Contexts[0]->Writer;
    if (OutputFormat == PF_Text)
      Writer.writeText(Output);
    else
@@ -288,6 +375,11 @@ static int merge_main(int argc, const char *argv[]) {
                   clEnumValEnd));
    cl::opt<bool> OutputSparse("sparse", cl::init(false),
        cl::desc("Generate a sparse profile (only meaningful for -instr)"));
+  cl::opt<unsigned> NumThreads(
+      "num-threads", cl::init(0),
+      cl::desc("Number of merge threads to use (default: autodetect)"));
+  cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
+                        cl::aliasopt(NumThreads));
  
    cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
  
@@ -314,7 +406,7 @@ static int merge_main(int argc, const char *argv[]) {
  
    if (ProfileKind == instr)
      mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat,
-                      OutputSparse);
+                      OutputSparse, NumThreads);
    else
      mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat);
  
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp

index c13f312..4efb17e 100644 (file)
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -204,6 +204,31 @@ TEST_F(InstrProfTest, get_profile_summary) {
    delete PSFromMD;
  }
  
+// Records merged in from a second writer must be readable, unchanged, next to
+// the record the destination writer already held.
+TEST_F(InstrProfTest, test_writer_merge) {
+  // Destination writer: one function with a single counter.
+  InstrProfRecord DstRecord("func1", 0x1234, {42});
+  NoError(Writer.addRecord(std::move(DstRecord)));
+
+  // Source writer: a different function with two zero counters.
+  InstrProfWriter SrcWriter;
+  InstrProfRecord SrcRecord("func2", 0x1234, {0, 0});
+  NoError(SrcWriter.addRecord(std::move(SrcRecord)));
+
+  NoError(Writer.mergeRecordsFromWriter(std::move(SrcWriter)));
+
+  // Write the merged profile out and read it back for the checks below.
+  readProfile(Writer.writeBuffer());
+
+  Expected<InstrProfRecord> Func1 =
+      Reader->getInstrProfRecord("func1", 0x1234);
+  ASSERT_TRUE(NoError(Func1.takeError()));
+  ASSERT_EQ(1U, Func1->Counts.size());
+  ASSERT_EQ(42U, Func1->Counts[0]);
+
+  Expected<InstrProfRecord> Func2 =
+      Reader->getInstrProfRecord("func2", 0x1234);
+  ASSERT_TRUE(NoError(Func2.takeError()));
+  ASSERT_EQ(2U, Func2->Counts.size());
+  ASSERT_EQ(0U, Func2->Counts[0]);
+  ASSERT_EQ(0U, Func2->Counts[1]);
+}
+
  static const char callee1[] = "callee1";
  static const char callee2[] = "callee2";
  static const char callee3[] = "callee3";
author	Vedant Kumar <vsk@apple.com>
	Mon, 18 Jul 2016 22:02:39 +0000 (22:02 +0000)
committer	Vedant Kumar <vsk@apple.com>
	Mon, 18 Jul 2016 22:02:39 +0000 (22:02 +0000)
llvm/docs/CommandGuide/llvm-profdata.rst		patch \| blob \| history
llvm/include/llvm/ProfileData/InstrProfWriter.h		patch \| blob \| history
llvm/lib/ProfileData/InstrProfWriter.cpp		patch \| blob \| history
llvm/test/tools/llvm-profdata/multiple-inputs.test		patch \| blob \| history
llvm/tools/llvm-profdata/llvm-profdata.cpp		patch \| blob \| history
llvm/unittests/ProfileData/InstrProfTest.cpp		patch \| blob \| history