[llvm-exegesis] Add a library to cluster benchmark results.
authorClement Courbet <courbet@google.com>
Mon, 7 May 2018 08:20:00 +0000 (08:20 +0000)
committerClement Courbet <courbet@google.com>
Mon, 7 May 2018 08:20:00 +0000 (08:20 +0000)
Reviewers: gchatelet

Subscribers: mgorny, tschuett, llvm-commits

Differential Revision: https://reviews.llvm.org/D46432

llvm-svn: 331622

llvm/tools/llvm-exegesis/lib/CMakeLists.txt
llvm/tools/llvm-exegesis/lib/Clustering.cpp [new file with mode: 0644]
llvm/tools/llvm-exegesis/lib/Clustering.h [new file with mode: 0644]
llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp [new file with mode: 0644]

index b6c4ae1..5ace962 100644 (file)
@@ -2,6 +2,7 @@ add_library(LLVMExegesis
   STATIC
   BenchmarkResult.cpp
   BenchmarkRunner.cpp
+  Clustering.cpp
   InMemoryAssembler.cpp
   InstructionSnippetGenerator.cpp
   Latency.cpp
diff --git a/llvm/tools/llvm-exegesis/lib/Clustering.cpp b/llvm/tools/llvm-exegesis/lib/Clustering.cpp
new file mode 100644 (file)
index 0000000..2745844
--- /dev/null
@@ -0,0 +1,170 @@
+//===-- Clustering.cpp ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Clustering.h"
+#include <string>
+#include <unordered_set>
+
+namespace exegesis {
+
+// The clustering problem has the following characteristics:
+//  (A) - Low dimension (dimensions are typically proc resource units,
+//    typically < 10).
+//  (B) - Number of points : ~thousands (points are measurements of an MCInst)
+//  (C) - Number of clusters: ~tens.
+//  (D) - The number of clusters is not known /a priory/.
+//  (E) - The amount of noise is relatively small.
+// The problem is rather small. In terms of algorithms, (D) disqualifies
+// k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable.
+//
+// We've used DBSCAN here because it's simple to implement. This is a pretty
+// straightforward and inefficient implementation of the pseudocode in [2].
+//
+// [1] https://en.wikipedia.org/wiki/DBSCAN
+// [2] https://en.wikipedia.org/wiki/OPTICS_algorithm
+
+namespace {
+
+// Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
+// including Q).
+std::vector<size_t> rangeQuery(const std::vector<InstructionBenchmark> &Points,
+                               const size_t Q, const double EpsilonSquared) {
+  std::vector<size_t> Neighbors;
+  const auto &QMeasurements = Points[Q].Measurements;
+  for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) {
+    if (P == Q)
+      continue;
+    const auto &PMeasurements = Points[P].Measurements;
+    if (PMeasurements.empty()) // Error point.
+      continue;
+    double DistanceSquared = 0;
+    for (size_t I = 0, E = QMeasurements.size(); I < E; ++I) {
+      const auto Diff = PMeasurements[I].Value - QMeasurements[I].Value;
+      DistanceSquared += Diff * Diff;
+    }
+    if (DistanceSquared <= EpsilonSquared) {
+      Neighbors.push_back(P);
+    }
+  }
+  return Neighbors;
+}
+
+} // namespace
+
+InstructionBenchmarkClustering::InstructionBenchmarkClustering()
+    : NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {}
+
+llvm::Error InstructionBenchmarkClustering::validateAndSetup(
+    const std::vector<InstructionBenchmark> &Points) {
+  ClusterIdForPoint_.resize(Points.size());
+  // Mark erroneous measurements out.
+  // All points must have the same number of dimensions, in the same order.
+  const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr;
+  for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) {
+    const auto &Point = Points[P];
+    if (!Point.Error.empty()) {
+      ClusterIdForPoint_[P] = ClusterId::error();
+      ErrorCluster_.PointIndices.push_back(P);
+      continue;
+    }
+    const auto *CurMeasurement = &Point.Measurements;
+    if (LastMeasurement) {
+      if (LastMeasurement->size() != CurMeasurement->size()) {
+        return llvm::make_error<llvm::StringError>(
+            "inconsistent measurement dimensions",
+            llvm::inconvertibleErrorCode());
+      }
+      for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) {
+        if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) {
+          return llvm::make_error<llvm::StringError>(
+              "inconsistent measurement dimensions keys",
+              llvm::inconvertibleErrorCode());
+        }
+      }
+    }
+    LastMeasurement = CurMeasurement;
+  }
+  if (LastMeasurement) {
+    NumDimensions_ = LastMeasurement->size();
+  }
+  return llvm::Error::success();
+}
+
+void InstructionBenchmarkClustering::dbScan(
+    const std::vector<InstructionBenchmark> &Points, const size_t MinPts,
+    const double EpsilonSquared) {
+  for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) {
+    if (!ClusterIdForPoint_[P].isUndef())
+      continue; // Previously processed in inner loop.
+    const auto Neighbors = rangeQuery(Points, P, EpsilonSquared);
+    if (Neighbors.size() + 1 < MinPts) { // Density check.
+      // The region around P is not dense enough to create a new cluster, mark
+      // as noise for now.
+      ClusterIdForPoint_[P] = ClusterId::noise();
+      continue;
+    }
+
+    // Create a new cluster, add P.
+    Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size()));
+    Cluster &CurrentCluster = Clusters_.back();
+    ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */
+    CurrentCluster.PointIndices.push_back(P);
+
+    // Process P's neighbors.
+    std::unordered_set<size_t> ToProcess(Neighbors.begin(), Neighbors.end());
+    while (!ToProcess.empty()) {
+      // Retrieve a point from the set.
+      const size_t Q = *ToProcess.begin();
+      ToProcess.erase(Q);
+
+      if (ClusterIdForPoint_[Q].isNoise()) {
+        // Change noise point to border point.
+        ClusterIdForPoint_[Q] = CurrentCluster.Id;
+        CurrentCluster.PointIndices.push_back(Q);
+        continue;
+      }
+      if (!ClusterIdForPoint_[Q].isUndef()) {
+        continue; // Previously processed.
+      }
+      // Add Q to the current custer.
+      ClusterIdForPoint_[Q] = CurrentCluster.Id;
+      CurrentCluster.PointIndices.push_back(Q);
+      // And extend to the neighbors of Q if the region is dense enough.
+      const auto Neighbors = rangeQuery(Points, Q, EpsilonSquared);
+      if (Neighbors.size() + 1 >= MinPts) {
+        ToProcess.insert(Neighbors.begin(), Neighbors.end());
+      }
+    }
+  }
+
+  // Add noisy points to noise cluster.
+  for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) {
+    if (ClusterIdForPoint_[P].isNoise()) {
+      NoiseCluster_.PointIndices.push_back(P);
+    }
+  }
+}
+
+llvm::Expected<InstructionBenchmarkClustering>
+InstructionBenchmarkClustering::create(
+    const std::vector<InstructionBenchmark> &Points, const size_t MinPts,
+    const double Epsilon) {
+  InstructionBenchmarkClustering Clustering;
+  if (auto Error = Clustering.validateAndSetup(Points)) {
+    return Error;
+  }
+  if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) {
+    return Clustering; // Nothing to cluster.
+  }
+
+  Clustering.dbScan(Points, MinPts, Epsilon * Epsilon);
+  return Clustering;
+}
+
+} // namespace exegesis
diff --git a/llvm/tools/llvm-exegesis/lib/Clustering.h b/llvm/tools/llvm-exegesis/lib/Clustering.h
new file mode 100644 (file)
index 0000000..aa4ef67
--- /dev/null
@@ -0,0 +1,103 @@
+//===-- Clustering.h --------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Utilities to compute benchmark result clusters.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
+#define LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
+
+#include "BenchmarkResult.h"
+#include "llvm/Support/Error.h"
+#include <vector>
+
+namespace exegesis {
+
+class InstructionBenchmarkClustering {
+public:
+  // Clusters `Points` using DBSCAN with the given parameters. See the cc file
+  // for more explanations on the algorithm.
+  static llvm::Expected<InstructionBenchmarkClustering>
+  create(const std::vector<InstructionBenchmark> &Points, size_t MinPts,
+         double Epsilon);
+
+  class ClusterId {
+  public:
+    static ClusterId noise() { return ClusterId(kNoise); }
+    static ClusterId error() { return ClusterId(kError); }
+    static ClusterId makeValid(int Id) {
+      assert(Id >= 0);
+      return ClusterId(Id);
+    }
+    ClusterId() : Id_(kUndef) {}
+    bool operator==(const ClusterId &O) const { return Id_ == O.Id_; }
+
+    bool isValid() const { return Id_ >= 0; }
+    bool isUndef() const { return Id_ == kUndef; }
+    bool isNoise() const { return Id_ == kNoise; }
+    bool isError() const { return Id_ == kError; }
+
+    // Precondition: isValid().
+    size_t getId() const {
+      assert(isValid());
+      return static_cast<size_t>(Id_);
+    }
+
+  private:
+    explicit ClusterId(int Id) : Id_(Id) {}
+    static constexpr const int kUndef = -1;
+    static constexpr const int kNoise = -2;
+    static constexpr const int kError = -3;
+    int Id_;
+  };
+
+  struct Cluster {
+    Cluster() = delete;
+    explicit Cluster(const ClusterId &Id) : Id(Id) {}
+
+    const ClusterId Id;
+    // Indices of benchmarks within the cluster.
+    std::vector<int> PointIndices;
+  };
+
+  ClusterId getClusterIdForPoint(size_t P) const {
+    return ClusterIdForPoint_[P];
+  }
+
+  const Cluster &getCluster(ClusterId Id) const {
+    assert(!Id.isUndef() && "unlabeled cluster");
+    if (Id.isNoise()) {
+      return NoiseCluster_;
+    }
+    if (Id.isError()) {
+      return ErrorCluster_;
+    }
+    return Clusters_[Id.getId()];
+  }
+
+  const std::vector<Cluster> &getValidClusters() const { return Clusters_; }
+
+private:
+  InstructionBenchmarkClustering();
+  llvm::Error validateAndSetup(const std::vector<InstructionBenchmark> &Points);
+  void dbScan(const std::vector<InstructionBenchmark> &Points, size_t MinPts,
+              double EpsilonSquared);
+  int NumDimensions_ = 0;
+  // ClusterForPoint_[P] is the cluster id for Points[P].
+  std::vector<ClusterId> ClusterIdForPoint_;
+  std::vector<Cluster> Clusters_;
+  Cluster NoiseCluster_;
+  Cluster ErrorCluster_;
+};
+
+} // namespace exegesis
+
+#endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
index a08d569..6f12bb1 100644 (file)
@@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(LLVMExegesisTests
   BenchmarkResultTest.cpp
+  ClusteringTest.cpp
   OperandGraphTest.cpp
   PerfHelperTest.cpp
   )
diff --git a/llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp b/llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp
new file mode 100644 (file)
index 0000000..b89c277
--- /dev/null
@@ -0,0 +1,86 @@
+//===-- ClusteringTest.cpp --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Clustering.h"
+#include "BenchmarkResult.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace exegesis {
+
+namespace {
+
+using testing::Field;
+using testing::UnorderedElementsAre;
+using testing::UnorderedElementsAreArray;
+
+TEST(ClusteringTest, Clusters3D) {
+  std::vector<InstructionBenchmark> Points(6);
+
+  // Cluster around (x=0, y=1, z=2): points {0, 3}.
+  Points[0].Measurements = {{"x", 0.01, ""}, {"y", 1.02, ""}, {"z", 1.98, "A"}};
+  Points[3].Measurements = {{"x", -0.01, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}};
+  // Cluster around (x=1, y=1, z=2): points {1, 4}.
+  Points[1].Measurements = {{"x", 1.01, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}};
+  Points[4].Measurements = {{"x", 0.99, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}};
+  // Cluster around (x=0, y=0, z=0): points {5}, marked as noise.
+  Points[5].Measurements = {{"x", 0.0, ""}, {"y", 0.01, ""}, {"z", -0.02, ""}};
+  // Error cluster: points {2}
+  Points[2].Error = "oops";
+
+  auto HasPoints = [](const std::vector<int> &Indices) {
+    return Field(&InstructionBenchmarkClustering::Cluster::PointIndices,
+                 UnorderedElementsAreArray(Indices));
+  };
+
+  auto Clustering = InstructionBenchmarkClustering::create(Points, 2, 0.25);
+  ASSERT_TRUE((bool)Clustering);
+  EXPECT_THAT(Clustering.get().getValidClusters(),
+              UnorderedElementsAre(HasPoints({0, 3}), HasPoints({1, 4})));
+  EXPECT_THAT(Clustering.get().getCluster(
+                  InstructionBenchmarkClustering::ClusterId::noise()),
+              HasPoints({5}));
+  EXPECT_THAT(Clustering.get().getCluster(
+                  InstructionBenchmarkClustering::ClusterId::error()),
+              HasPoints({2}));
+
+  EXPECT_EQ(Clustering.get().getClusterIdForPoint(2),
+            InstructionBenchmarkClustering::ClusterId::error());
+  EXPECT_EQ(Clustering.get().getClusterIdForPoint(5),
+            InstructionBenchmarkClustering::ClusterId::noise());
+  EXPECT_EQ(Clustering.get().getClusterIdForPoint(0),
+            Clustering.get().getClusterIdForPoint(3));
+  EXPECT_EQ(Clustering.get().getClusterIdForPoint(1),
+            Clustering.get().getClusterIdForPoint(4));
+}
+
+TEST(ClusteringTest, Clusters3D_InvalidSize) {
+  std::vector<InstructionBenchmark> Points(6);
+  Points[0].Measurements = {{"x", 0.01, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}};
+  Points[1].Measurements = {{"y", 1.02, ""}, {"z", 1.98, ""}};
+  auto Error =
+      InstructionBenchmarkClustering::create(Points, 2, 0.25).takeError();
+  ASSERT_TRUE((bool)Error);
+  consumeError(std::move(Error));
+}
+
+TEST(ClusteringTest, Clusters3D_InvalidOrder) {
+  std::vector<InstructionBenchmark> Points(6);
+  Points[0].Measurements = {{"x", 0.01, ""}, {"y", 1.02, ""}};
+  Points[1].Measurements = {{"y", 1.02, ""}, {"x", 1.98, ""}};
+  auto Error =
+      InstructionBenchmarkClustering::create(Points, 2, 0.25).takeError();
+  ASSERT_TRUE((bool)Error);
+  consumeError(std::move(Error));
+}
+
+} // namespace
+} // namespace exegesis