From: Vy Nguyen Date: Mon, 27 Jul 2020 16:38:05 +0000 (-0400) Subject: Reland [llvm-exegesis] Add benchmark latency option on X86 that uses LBR for more... X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ee7caa75939afb75547c00744c5df4d04d45e517;p=platform%2Fupstream%2Fllvm.git Reland [llvm-exegesis] Add benchmark latency option on X86 that uses LBR for more precise measurements. Starting with Skylake, the LBR contains the precise number of cycles between the two consecutive branches. Making use of this will hopefully make the measurements more precise than the existing methods of using RDTSC. Differential Revision: https://reviews.llvm.org/D77422 New change: check for existence of field `cycles` in perf_branch_entry before enabling this mode. This should prevent compilation errors when building for older kernels whose headers don't support it. --- diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index 321cdf5..8cc1a23 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -192,10 +192,24 @@ OPTIONS .. option:: -mode=[latency|uops|inverse_throughput|analysis] - Specify the run mode. Note that if you pick `analysis` mode, you also need - to specify at least one of the `-analysis-clusters-output-file=` and - `-analysis-inconsistencies-output-file=`. + Specify the run mode. Note that some modes have additional requirements and options. + `latency` mode can make use of either RDTSC or LBR. + `latency[LBR]` is only available on X86 (at least `Skylake`). + To run in this mode, a positive value must be specified for `x86-lbr-sample-period` and `--repetition-mode=loop` must be used. + + In `analysis` mode, you also need to specify at least one of the + `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`. + +.. option:: -x86-lbr-sample-period= + + Specify the LBR sampling period - how many branches before we take a sample. 
+ When a positive value is specified for this option and when the mode is `latency`, + we will use LBRs for measuring. + On choosing the "right" sampling period, a small value is preferred, but throttling + could occur if the sampling is too frequent. A prime number should be used to + avoid consistently skipping certain blocks. + .. option:: -repetition-mode=[duplicate|loop|min] Specify the repetition mode. `duplicate` will create a large, straight line diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att b/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att new file mode 100644 index 0000000..8f85b39 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/Inputs/mov_add.att @@ -0,0 +1,4 @@ +# LLVM-EXEGESIS-LIVEIN RDI +# LLVM-EXEGESIS-DEFREG XMM1 42 +movq $2, %rdi +addq $0x10, %rdi \ No newline at end of file diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg new file mode 100644 index 0000000..431967c --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/lit.local.cfg @@ -0,0 +1,31 @@ +import subprocess +import lit.util + +if not ('X86' in config.root.targets): + # We need support for X86. + config.unsupported = True + +elif not ('x86_64' in config.root.host_triple): + # We need to be running on an X86 host. + config.unsupported = True + +else: + # We need libpfm to be installed and the host to be at least skylake. 
+ llvm_exegesis_exe = lit.util.which('llvm-exegesis', config.llvm_tools_dir) + if not llvm_exegesis_exe: + print('llvm-exegesis not found') + config.unsupported = True + else: + try: + with open(os.devnull, 'w') as quiet: + check_llvm_exegesis_uops_result = subprocess.call( + [llvm_exegesis_exe, '-allowed-host-cpu', 'skylake', '-allowed-host-cpu', 'skylake-avx512', '-mode', 'uops', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + check_llvm_exegesis_latency_result = subprocess.call( + [llvm_exegesis_exe, '-allowed-host-cpu', 'skylake', '-allowed-host-cpu', 'skylake-avx512', '-mode', 'latency', '-snippets-file', '/dev/null'], stdout=quiet, stderr=quiet) + except OSError: + print('could not exec llvm-exegesis') + config.unsupported = True + if not check_llvm_exegesis_uops_result == 0: + config.unsupported = True + if not check_llvm_exegesis_latency_result == 0: + config.unsupported = True diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s new file mode 100644 index 0000000..5f72e8f --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s @@ -0,0 +1,18 @@ +# RUN: llvm-exegesis -mode=latency --repetition-mode=loop --x86-lbr-sample-period=521 --snippets-file=%p/Inputs/mov_add.att + + +CHECK: --- +CHECK-NEXT: mode: latency +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: 'MOV64ri32 RDI i_0x2' +CHECK-NEXT: 'ADD64ri8 RDI RDI i_0x10' +CHECK-NEXT: config: '' +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: {{.*}} +CHECK-NEXT: num_repetitions: 10000 +CHECK-NEXT: measurements: +CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}} +CHECK-LAST: ... 
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index bdef8f8..f015147 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -55,7 +55,6 @@ private: static void accumulateCounterValues(const llvm::SmallVector &NewValues, llvm::SmallVector *Result) { - const size_t NumValues = std::max(NewValues.size(), Result->size()); if (NumValues > Result->size()) Result->resize(NumValues, 0); @@ -106,10 +105,10 @@ private: if (Crashed) return make_error("snippet crashed while running"); } - auto ValueOrError = Counter->readOrError(); + + auto ValueOrError = Counter->readOrError(Function.getFunctionBytes()); if (!ValueOrError) return ValueOrError.takeError(); - accumulateCounterValues(ValueOrError.get(), &CounterValues); } return CounterValues; diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp index cba4846..58e1f4d 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -128,7 +128,8 @@ int64_t Counter::read() const { return -1; } -llvm::Expected> Counter::readOrError() const { +llvm::Expected> +Counter::readOrError(StringRef /*unused*/) const { int64_t Count = 0; ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count)); if (ReadSize != sizeof(Count)) @@ -152,7 +153,8 @@ void Counter::stop() {} int64_t Counter::read() const { return 42; } -llvm::Expected> Counter::readOrError() const { +llvm::Expected> +Counter::readOrError(StringRef /*unused*/) const { return llvm::make_error("Not implemented", llvm::errc::io_error); } diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h index d41b090..19a3559 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h @@ -59,8 +59,9 @@ public: // e.g. 
"snb_ep::INSTRUCTION_RETIRED:e=0:i=0:c=0:t=0:u=1:k=0:mg=0:mh=1" StringRef getPfmEventString() const; -private: - const std::string EventString; +protected: + PerfEvent() = default; + std::string EventString; std::string FullQualifiedEventString; perf_event_attr *Attr; }; @@ -87,11 +88,17 @@ public: int64_t read() const; /// Returns the current value of the counter or error if it cannot be read. - virtual llvm::Expected> readOrError() const; + /// FunctionBytes: The benchmark function being executed. + /// This is used to filter out the measurements to ensure they are only + /// within the benchmarked code. + /// If empty (or not specified), then no filtering will be done. + /// Not all counters choose to use this. + virtual llvm::Expected> + readOrError(StringRef FunctionBytes = StringRef()) const; virtual int numValues() const; -private: +protected: PerfEvent Event; #ifdef HAVE_LIBPFM int FileDescriptor = -1; diff --git a/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt index 912877d..ce3bbd5 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/X86/CMakeLists.txt @@ -6,6 +6,7 @@ include_directories( add_library(LLVMExegesisX86 STATIC Target.cpp + X86Counter.cpp ) llvm_update_compile_flags(LLVMExegesisX86) diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 7a84f93..9f045fa 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -14,15 +14,40 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" +#include "X86Counter.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Sequence.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include +#include +#include + namespace llvm { namespace exegesis { +static 
cl::OptionCategory + BenchmarkOptions("llvm-exegesis benchmark x86-options"); + +// If a positive value is specified, we are going to use the LBR in +// latency-mode. +// +// Note: +// - A small value is preferred, but too low a value could result in +// throttling. +// - A prime number is preferred to avoid always skipping certain blocks. +// +static cl::opt LbrSamplingPeriod( + "x86-lbr-sample-period", + cl::desc("The sample period (nbranches/sample), used for LBR sampling"), + cl::cat(BenchmarkOptions), cl::init(0)); + +// FIXME: Validate that repetition-mode is loop if LBR is requested. + // Returns a non-null reason if we cannot handle the memory references in this // instruction. static const char *isInvalidMemoryInstr(const Instruction &Instr) { @@ -568,10 +593,32 @@ void ConstantInliner::initStack(unsigned Bytes) { #include "X86GenExegesis.inc" namespace { + class ExegesisX86Target : public ExegesisTarget { public: ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {} + Expected> + createCounter(StringRef CounterName, const LLVMState &State) const override { + // If LbrSamplingPeriod was provided, then ignore the + // CounterName because we only have one for LBR. 
+ if (LbrSamplingPeriod > 0) { + // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without + // __linux__ (for now) +#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \ + defined(__linux__) + return std::make_unique( + X86LbrPerfEvent(LbrSamplingPeriod)); +#else + return llvm::make_error( + "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, " + "or running on Linux.", + llvm::errc::invalid_argument); +#endif + } + return ExegesisTarget::createCounter(CounterName, State); + } + private: void addTargetSpecificPasses(PassManagerBase &PM) const override; diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp new file mode 100644 index 0000000..57b4938 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp @@ -0,0 +1,212 @@ +//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "X86Counter.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. +#ifdef __linux__ +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" + +#ifdef HAVE_LIBPFM +#include "perfmon/perf_event.h" +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif // HAVE_LIBPFM + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) +namespace llvm { +namespace exegesis { + +static constexpr size_t kBufferPages = 8; +static const size_t kDataBufferSize = kBufferPages * getpagesize(); + +// Waits for the LBR perf events. 
+static int pollLbrPerfEvent(const int FileDescriptor) { + struct pollfd PollFd; + PollFd.fd = FileDescriptor; + PollFd.events = POLLIN; + PollFd.revents = 0; + return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */); +} + +// Copies the data-buffer into Buf, given the pointer to the mmapped buffer. +static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail, + size_t DataSize) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page. + char *Start = reinterpret_cast(MMappedBuffer) + getpagesize(); + // The LBR buffer is a cyclic buffer, we copy data to another buffer. + uint64_t Offset = Tail % kDataBufferSize; + size_t CopySize = kDataBufferSize - Offset; + memcpy(Buf, Start + Offset, CopySize); + if (CopySize >= DataSize) + return; + + memcpy(Buf + CopySize, Start, Offset); + return; +} + +// Parses the given data-buffer for stats and fills the CycleArray. +// Returns success once the last branch entry of a sample record has been +// read; returns an error if the buffer is exhausted first. +static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize, + const void *From, const void *To, + llvm::SmallVector *CycleArray) { + assert(From != nullptr && To != nullptr); + const char *DataPtr = DataBuf; + while (DataPtr < DataBuf + DataSize) { + struct perf_event_header Header; + memcpy(&Header, DataPtr, sizeof(struct perf_event_header)); + if (Header.type != PERF_RECORD_SAMPLE) { + // Ignores non-sample records. + DataPtr += Header.size; + continue; + } + DataPtr += sizeof(Header); + uint64_t Count = llvm::support::endian::read64(DataPtr, support::native); + DataPtr += sizeof(Count); + + struct perf_branch_entry Entry; + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + + // Read the perf_branch_entry array. + for (uint64_t i = 0; i < Count; ++i) { + const uint64_t BlockStart = From == nullptr + ? std::numeric_limits::min() + : reinterpret_cast(From); + const uint64_t BlockEnd = To == nullptr + ? 
std::numeric_limits::max() + : reinterpret_cast(To); + + if (BlockStart <= Entry.from && BlockEnd >= Entry.to) + CycleArray->push_back(Entry.cycles); + + if (i == Count - 1) + // We've reached the last entry. + return llvm::Error::success(); + + // Advance to next entry + DataPtr += sizeof(Entry); + memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry)); + } + } + return llvm::make_error("Unable to parse databuffer.", + llvm::errc::io_error); +} + +X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) { + assert(SamplingPeriod > 0 && "SamplingPeriod must be positive"); + EventString = "BR_INST_RETIRED.NEAR_TAKEN"; + Attr = new perf_event_attr(); + Attr->size = sizeof(*Attr); + Attr->type = PERF_TYPE_RAW; + // FIXME This is SKL's encoding. Not sure if it'll change. + Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN + Attr->sample_type = PERF_SAMPLE_BRANCH_STACK; + // Don't need to specify "USER" because we've already excluded HV and Kernel. + Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY; + Attr->sample_period = SamplingPeriod; + Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH. + Attr->disabled = 1; + Attr->exclude_kernel = 1; + Attr->exclude_hv = 1; + Attr->read_format = PERF_FORMAT_GROUP; + + FullQualifiedEventString = EventString; +} + +X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent) + : Counter(std::move(NewEvent)) { + // First page is reserved for perf_event_mmap_page. Data buffer starts on + // the next page, so we allocate one more page. 
+ MMappedBuffer = mmap(nullptr, (kBufferPages + 1) * getpagesize(), + PROT_READ | PROT_WRITE, MAP_SHARED, FileDescriptor, 0); + if (MMappedBuffer == MAP_FAILED) + llvm::errs() << "Failed to mmap buffer."; +} + +X86LbrCounter::~X86LbrCounter() { close(FileDescriptor); } + +void X86LbrCounter::start() { + ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */); +} + +llvm::Expected> +X86LbrCounter::readOrError(StringRef FunctionBytes) const { + // The max number of time-outs/retries before we give up. + static constexpr int kMaxTimeouts = 160; + + // Disable the event before reading + ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0); + + // Parses the LBR buffer and fills CycleArray with the sequence of cycle + // counts from the buffer. + llvm::SmallVector CycleArray; + std::unique_ptr DataBuf(new char[kDataBufferSize]); + int NumTimeouts = 0; + int PollResult = 0; + + // Find the boundary of the function so that we could filter the LBRs + // to keep only the relevant records. + if (FunctionBytes.empty()) + return llvm::make_error("Empty function bytes", + llvm::errc::invalid_argument); + const void *From = reinterpret_cast(FunctionBytes.data()); + const void *To = reinterpret_cast(FunctionBytes.data() + + FunctionBytes.size()); + while (PollResult <= 0) { + PollResult = pollLbrPerfEvent(FileDescriptor); + if (PollResult > 0) + break; + if (PollResult == -1) + return llvm::make_error("Cannot poll LBR perf event.", + llvm::errc::io_error); + if (NumTimeouts++ >= kMaxTimeouts) + return llvm::make_error( + "LBR polling still timed out after max number of attempts.", + llvm::errc::device_or_resource_busy); + } + + struct perf_event_mmap_page Page; + memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page)); + + const uint64_t DataTail = Page.data_tail; + const uint64_t DataHead = Page.data_head; + // We're supposed to use a barrier after reading data_head. 
+ std::atomic_thread_fence(std::memory_order_acq_rel); + const size_t DataSize = DataHead - DataTail; + if (DataSize > kDataBufferSize) + return llvm::make_error( + "DataSize larger than buffer size.", llvm::errc::invalid_argument); + + copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize); + llvm::Error error = + parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray); + if (!error) + return CycleArray; + return std::move(error); +} + +} // namespace exegesis +} // namespace llvm + +#endif // defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) +#endif // __linux__ diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h new file mode 100644 index 0000000..9406201 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.h @@ -0,0 +1,55 @@ +//===-- X86Counter.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Perf counter that reads the LBRs for measuring the benchmarked block's +/// throughput. +/// +/// More info at: https://lwn.net/Articles/680985 +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H +#define LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H + +#include "../PerfHelper.h" +#include "llvm/Support/Error.h" + +// FIXME: Use appropriate wrappers for poll.h and mman.h +// to support Windows and remove this linux-only guard. 
+#if defined(__linux__) && defined(HAVE_LIBPFM) && \ + defined(LIBPFM_HAS_FIELD_CYCLES) + +namespace llvm { +namespace exegesis { + +class X86LbrPerfEvent : public pfm::PerfEvent { +public: + X86LbrPerfEvent(unsigned SamplingPeriod); +}; + +class X86LbrCounter : public pfm::Counter { +public: + explicit X86LbrCounter(pfm::PerfEvent &&Event); + + virtual ~X86LbrCounter(); + + void start() override; + + llvm::Expected> + readOrError(StringRef FunctionBytes) const override; + +private: + void *MMappedBuffer = nullptr; +}; + +} // namespace exegesis +} // namespace llvm + +#endif // defined(__linux__) && defined(HAVE_LIBPFM) && + // defined(LIBPFM_HAS_FIELD_CYCLES) + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_LIB_X86_X86COUNTER_H diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 507015b..8eeda48 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -160,6 +160,12 @@ static cl::opt cl::desc(""), cl::cat(AnalysisOptions), cl::init("")); +static cl::list + AllowedHostCpus("allowed-host-cpu", + cl::desc("If specified, only run the benchmark if the host " + "CPU matches the names"), + cl::cat(Options), cl::ZeroOrMore); + static cl::opt AnalysisDisplayUnstableOpcodes( "analysis-display-unstable-clusters", cl::desc("if there is more than one benchmark for an opcode, said " @@ -296,6 +302,13 @@ void benchmarkMain() { const LLVMState State(CpuName); + llvm::StringRef ActualCpu = State.getTargetMachine().getTargetCPU(); + for (auto Begin = AllowedHostCpus.begin(); Begin != AllowedHostCpus.end(); + ++Begin) { + if (ActualCpu != *Begin) + ExitWithError(llvm::Twine("Unexpected host CPU ").concat(ActualCpu)); + } + const std::unique_ptr Runner = ExitOnErr(State.getExegesisTarget().createBenchmarkRunner( BenchmarkMode, State, ResultAggMode));