From 70b37f4c03cd189c94167dc22d9f5303c8773092 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Wed, 16 Jun 2021 14:39:14 +0100
Subject: [PATCH] [MCA][InstrBuilder] Always check for implicit uses of resource units (PR50725).

When instructions are issued to the underlying pipeline resources, the
mca::ResourceManager should also check for the presence of extra uses induced
by the explicit consumption of multiple partially overlapping group resources.

Fixes PR50725
---
Review note: in ResourceManager::checkAvailability the busy-resource mask now
accumulates the mask of the busy unit (`Use`) rather than its resource-state
index, matching the `BusyResourceMask |= E.first` handling of explicit uses.

 llvm/include/llvm/MCA/Instruction.h                |  3 ++
 llvm/lib/MCA/HardwareUnits/ResourceManager.cpp     | 13 ++++++-
 llvm/lib/MCA/InstrBuilder.cpp                      | 43 +++++++++++++++++++++-
 .../tools/llvm-mca/X86/SkylakeClient/PR50725.s     | 19 ++++++++++
 4 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/X86/SkylakeClient/PR50725.s

diff --git a/llvm/include/llvm/MCA/Instruction.h b/llvm/include/llvm/MCA/Instruction.h
index f34f31d..a1a1632 100644
--- a/llvm/include/llvm/MCA/Instruction.h
+++ b/llvm/include/llvm/MCA/Instruction.h
@@ -359,6 +359,9 @@ struct InstrDesc {
   // A bitmask of used processor resource units.
   uint64_t UsedProcResUnits;
 
+  // A bitmask of implicit uses of processor resource units.
+  uint64_t ImplicitlyUsedProcResUnits;
+
   // A bitmask of used processor resource groups.
   uint64_t UsedProcResGroups;
 
diff --git a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
index 30c4f14..3687a24 100644
--- a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
+++ b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
@@ -114,8 +114,8 @@ ResourceManager::ResourceManager(const MCSchedModel &SM)
       Resource2Groups(SM.getNumProcResourceKinds() - 1, 0),
       ProcResID2Mask(SM.getNumProcResourceKinds(), 0),
       ResIndex2ProcResID(SM.getNumProcResourceKinds() - 1, 0),
-      ProcResUnitMask(0), ReservedResourceGroups(0),
-      AvailableBuffers(~0ULL), ReservedBuffers(0) {
+      ProcResUnitMask(0), ReservedResourceGroups(0), AvailableBuffers(~0ULL),
+      ReservedBuffers(0) {
   computeProcResourceMasks(SM, ProcResID2Mask);
 
   // initialize vector ResIndex2ProcResID.
@@ -288,6 +288,15 @@ uint64_t ResourceManager::checkAvailability(const InstrDesc &Desc) const {
       BusyResourceMask |= E.first;
   }
 
+  uint64_t ImplicitUses = Desc.ImplicitlyUsedProcResUnits;
+  while (ImplicitUses) {
+    uint64_t Use = ImplicitUses & -ImplicitUses;
+    ImplicitUses ^= Use;
+    unsigned Index = getResourceStateIndex(Use);
+    if (!Resources[Index]->isReady(/* NumUnits */ 1))
+      BusyResourceMask |= Use;
+  }
+
   BusyResourceMask &= ProcResUnitMask;
   if (BusyResourceMask)
     return BusyResourceMask;
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index fa11beb..1532fd6 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -43,7 +43,7 @@ static void initializeUsedResources(InstrDesc &ID,
 
   // Populate resources consumed.
   using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
-  std::vector<ResourcePlusCycles> Worklist;
+  SmallVector<ResourcePlusCycles, 4> Worklist;
 
   // Track cycles contributed by resources that are in a "Super" relationship.
   // This is required if we want to correctly match the behavior of method
@@ -109,6 +109,11 @@ static void initializeUsedResources(InstrDesc &ID,
 
   uint64_t UsedResourceUnits = 0;
   uint64_t UsedResourceGroups = 0;
+  auto GroupIt = find_if(Worklist, [](const ResourcePlusCycles &Elt) {
+    return countPopulation(Elt.first) > 1;
+  });
+  unsigned FirstGroupIdx = std::distance(Worklist.begin(), GroupIt);
+  uint64_t ImpliedUsesOfResourceUnits = 0;
 
   // Remove cycles contributed by smaller resources.
   for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
@@ -127,6 +132,15 @@ static void initializeUsedResources(InstrDesc &ID,
       // Remove the leading 1 from the resource group mask.
       NormalizedMask ^= PowerOf2Floor(NormalizedMask);
       UsedResourceGroups |= (A.first ^ NormalizedMask);
+
+      uint64_t AvailableMask = NormalizedMask & ~UsedResourceUnits;
+      if ((NormalizedMask != AvailableMask) &&
+          countPopulation(AvailableMask) == 1) {
+        // At simulation time, this resource group use will decay into a simple
+        // use of the resource unit identified by `AvailableMask`.
+        ImpliedUsesOfResourceUnits |= AvailableMask;
+        UsedResourceUnits |= AvailableMask;
+      }
     }
 
     for (unsigned J = I + 1; J < E; ++J) {
@@ -139,6 +153,31 @@ static void initializeUsedResources(InstrDesc &ID,
     }
   }
 
+  // Look for implicit uses of processor resource units. These are resource
+  // units which are indirectly consumed by resource groups, and that must be
+  // always available on instruction issue.
+  while (ImpliedUsesOfResourceUnits) {
+    ID.ImplicitlyUsedProcResUnits |= ImpliedUsesOfResourceUnits;
+    ImpliedUsesOfResourceUnits = 0;
+    for (unsigned I = FirstGroupIdx, E = Worklist.size(); I < E; ++I) {
+      ResourcePlusCycles &A = Worklist[I];
+      if (!A.second.size())
+        continue;
+
+      uint64_t NormalizedMask = A.first;
+      assert(countPopulation(NormalizedMask) > 1);
+      // Remove the leading 1 from the resource group mask.
+      NormalizedMask ^= PowerOf2Floor(NormalizedMask);
+      uint64_t AvailableMask = NormalizedMask & ~UsedResourceUnits;
+      if ((NormalizedMask != AvailableMask) &&
+          countPopulation(AvailableMask) != 1)
+        continue;
+
+      UsedResourceUnits |= AvailableMask;
+      ImpliedUsesOfResourceUnits |= AvailableMask;
+    }
+  }
+
   // A SchedWrite may specify a number of cycles in which a resource group
   // is reserved. For example (on target x86; cpu Haswell):
   //
@@ -198,6 +237,8 @@ static void initializeUsedResources(InstrDesc &ID,
       BufferIDs ^= Current;
     }
     dbgs() << "\t\t Used Units=" << format_hex(ID.UsedProcResUnits, 16) << '\n';
+    dbgs() << "\t\tImplicitly Used Units="
+           << format_hex(ID.ImplicitlyUsedProcResUnits, 16) << '\n';
     dbgs() << "\t\tUsed Groups=" << format_hex(ID.UsedProcResGroups, 16)
            << '\n';
   });
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/PR50725.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/PR50725.s
new file mode 100644
index 0000000..c236c62
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/PR50725.s
@@ -0,0 +1,19 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -all-views=false -summary-view -iterations=1 < %s | FileCheck %s
+
+# Do not crash when running this simulation.
+# It is not safe to issue FXRSTOR if SKLPort1 is not available.
+
+bswap %eax
+bswap %eax
+fxrstor 64(%rsp)
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      68
+# CHECK-NEXT: Total uOps:        92
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    1.35
+# CHECK-NEXT: IPC:               0.04
+# CHECK-NEXT: Block RThroughput: 16.5
-- 
2.7.4