[llvm-exegesis] parallel snippet generator: avoid Read-After-Write pitfail for instrs...
authorRoman Lebedev <lebedev.ri@gmail.com>
Tue, 6 Dec 2022 14:55:35 +0000 (17:55 +0300)
committerRoman Lebedev <lebedev.ri@gmail.com>
Tue, 6 Dec 2022 15:05:22 +0000 (18:05 +0300)
As it is being discussed in https://github.com/llvm/llvm-project/issues/59325,
at least for the instructions with tied variables,
when trying to parallelize the instructions,
register selection is rather bad, and may either
use a register which we have used for def,
or vice versa.

That introduces serialization, and leads to
overly pessimistic inverse throughput measurement.

The new implementation avoids that,

New result:
```
$ ninja llvm-exegesis && ./bin/llvm-exegesis --mode=inverse_throughput --opcode-name=VFMADD132PDr --max-configs-per-opcode=9182
ninja: no work to do.
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4af034.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VFMADD132PDr XMM3 XMM3 XMM4 XMM8'
    - 'VFMADD132PDr XMM5 XMM5 XMM14 XMM7'
    - 'VFMADD132PDr XMM10 XMM10 XMM11 XMM15'
    - 'VFMADD132PDr XMM13 XMM13 XMM15 XMM15'
    - 'VFMADD132PDr XMM12 XMM12 XMM11 XMM1'
    - 'VFMADD132PDr XMM0 XMM0 XMM6 XMM9'
    - 'VFMADD132PDr XMM2 XMM2 XMM15 XMM11'
  config:          ''
  register_initial_values:
    - 'XMM3=0x0'
    - 'XMM4=0x0'
    - 'XMM8=0x0'
    - 'MXCSR=0x0'
    - 'XMM5=0x0'
    - 'XMM14=0x0'
    - 'XMM7=0x0'
    - 'XMM10=0x0'
    - 'XMM11=0x0'
    - 'XMM15=0x0'
    - 'XMM13=0x0'
    - 'XMM12=0x0'
    - 'XMM1=0x0'
    - 'XMM0=0x0'
    - 'XMM6=0x0'
    - 'XMM9=0x0'
    - 'XMM2=0x0'
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 0.6403, per_snippet_value: 4.4821 }
error:           ''
info:            instruction has tied variables, avoiding Read-After-Write issue, picking random def and use registers not aliasing each other, randomizing registers for uses
assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F04244883C4104883EC04C70424801F0000C5F8AE14244883C4044883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F14244883C410C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C3
...
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-f05c2f.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VFMADD132PDr XMM15 XMM15 XMM11 XMM2'
    - 'VFMADD132PDr XMM5 XMM5 XMM11 XMM2'
    - 'VFMADD132PDr XMM14 XMM14 XMM11 XMM2'
    - 'VFMADD132PDr XMM4 XMM4 XMM11 XMM2'
    - 'VFMADD132PDr XMM8 XMM8 XMM11 XMM2'
    - 'VFMADD132PDr XMM3 XMM3 XMM11 XMM2'
    - 'VFMADD132PDr XMM10 XMM10 XMM11 XMM2'
    - 'VFMADD132PDr XMM7 XMM7 XMM11 XMM2'
    - 'VFMADD132PDr XMM13 XMM13 XMM11 XMM2'
    - 'VFMADD132PDr XMM9 XMM9 XMM11 XMM2'
    - 'VFMADD132PDr XMM1 XMM1 XMM11 XMM2'
    - 'VFMADD132PDr XMM6 XMM6 XMM11 XMM2'
    - 'VFMADD132PDr XMM0 XMM0 XMM11 XMM2'
    - 'VFMADD132PDr XMM12 XMM12 XMM11 XMM2'
  config:          ''
  register_initial_values:
    - 'XMM15=0x0'
    - 'XMM11=0x0'
    - 'XMM2=0x0'
    - 'MXCSR=0x0'
    - 'XMM5=0x0'
    - 'XMM14=0x0'
    - 'XMM4=0x0'
    - 'XMM8=0x0'
    - 'XMM3=0x0'
    - 'XMM10=0x0'
    - 'XMM7=0x0'
    - 'XMM13=0x0'
    - 'XMM9=0x0'
    - 'XMM1=0x0'
    - 'XMM6=0x0'
    - 'XMM0=0x0'
    - 'XMM12=0x0'
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 0.5312, per_snippet_value: 7.4368 }
error:           ''
info:            instruction has tied variables, avoiding Read-After-Write issue, picking random def and use registers not aliasing each other, one unique register for each use position
assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F14244883C4104883EC04C70424801F0000C5F8AE14244883C4044883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C410C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C3
...
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c32060.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VFMADD132PDr XMM10 XMM10 XMM6 XMM6'
    - 'VFMADD132PDr XMM8 XMM8 XMM6 XMM6'
    - 'VFMADD132PDr XMM12 XMM12 XMM6 XMM6'
    - 'VFMADD132PDr XMM9 XMM9 XMM6 XMM6'
    - 'VFMADD132PDr XMM7 XMM7 XMM6 XMM6'
    - 'VFMADD132PDr XMM1 XMM1 XMM6 XMM6'
    - 'VFMADD132PDr XMM0 XMM0 XMM6 XMM6'
    - 'VFMADD132PDr XMM5 XMM5 XMM6 XMM6'
    - 'VFMADD132PDr XMM11 XMM11 XMM6 XMM6'
    - 'VFMADD132PDr XMM2 XMM2 XMM6 XMM6'
    - 'VFMADD132PDr XMM15 XMM15 XMM6 XMM6'
    - 'VFMADD132PDr XMM3 XMM3 XMM6 XMM6'
    - 'VFMADD132PDr XMM14 XMM14 XMM6 XMM6'
    - 'VFMADD132PDr XMM4 XMM4 XMM6 XMM6'
    - 'VFMADD132PDr XMM13 XMM13 XMM6 XMM6'
  config:          ''
  register_initial_values:
    - 'XMM10=0x0'
    - 'XMM6=0x0'
    - 'MXCSR=0x0'
    - 'XMM8=0x0'
    - 'XMM12=0x0'
    - 'XMM9=0x0'
    - 'XMM7=0x0'
    - 'XMM1=0x0'
    - 'XMM0=0x0'
    - 'XMM5=0x0'
    - 'XMM11=0x0'
    - 'XMM2=0x0'
    - 'XMM15=0x0'
    - 'XMM3=0x0'
    - 'XMM14=0x0'
    - 'XMM4=0x0'
    - 'XMM13=0x0'
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 0.5311, per_snippet_value: 7.9665 }
error:           ''
info:            instruction has tied variables, avoiding Read-After-Write issue, picking random def and use registers not aliasing each other, reusing the same register for all uses
assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F34244883C4104883EC04C70424801F0000C5F8AE14244883C4044883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F2C244883C410C462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC3
...
```

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D139283

llvm/tools/llvm-exegesis/lib/CodeTemplate.cpp
llvm/tools/llvm-exegesis/lib/CodeTemplate.h
llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp
llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp
llvm/tools/llvm-exegesis/lib/SnippetGenerator.h
llvm/unittests/tools/llvm-exegesis/PowerPC/SnippetGeneratorTest.cpp
llvm/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp

index 9840a08..0b026dc 100644 (file)
 namespace llvm {
 namespace exegesis {
 
+CodeTemplate::CodeTemplate(const CodeTemplate &) = default;
+
 CodeTemplate::CodeTemplate(CodeTemplate &&) = default;
 
 CodeTemplate &CodeTemplate::operator=(CodeTemplate &&) = default;
 
+CodeTemplate &CodeTemplate::operator=(const CodeTemplate &) = default;
+
+CodeTemplate CodeTemplate::clone() const {
+  CodeTemplate CT = *this;
+  return CT;
+}
+
 InstructionTemplate::InstructionTemplate(const Instruction *Instr)
     : Instr(Instr), VariableValues(Instr->Variables.size()) {}
 
index bea1030..9235217 100644 (file)
@@ -119,8 +119,8 @@ struct CodeTemplate {
 
   CodeTemplate(CodeTemplate &&);            // default
   CodeTemplate &operator=(CodeTemplate &&); // default
-  CodeTemplate(const CodeTemplate &) = delete;
-  CodeTemplate &operator=(const CodeTemplate &) = delete;
+
+  CodeTemplate clone() const;
 
   ExecutionMode Execution = ExecutionMode::UNKNOWN;
   // See InstructionBenchmarkKey.::Config.
@@ -132,6 +132,10 @@ struct CodeTemplate {
   // If the template uses the provided scratch memory, the register in which
   // the pointer to this memory is passed in to the function.
   unsigned ScratchSpacePointerInReg = 0;
+
+private:
+  CodeTemplate(const CodeTemplate &);            // default
+  CodeTemplate &operator=(const CodeTemplate &); // default
 };
 
 } // namespace exegesis
index 7728fcb..0d284af 100644 (file)
 namespace llvm {
 namespace exegesis {
 
-static SmallVector<const Variable *, 8>
-getVariablesWithTiedOperands(const Instruction &Instr) {
+static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
   SmallVector<const Variable *, 8> Result;
   for (const auto &Var : Instr.Variables)
     if (Var.hasTiedOperands())
-      Result.push_back(&Var);
-  return Result;
+      return true;
+  return false;
 }
 
 ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
@@ -114,43 +113,181 @@ void ParallelSnippetGenerator::instantiateMemoryOperands(
          "not enough scratch space");
 }
 
-static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
-    const LLVMState &State, const InstructionTemplate &IT,
-    const ArrayRef<const Variable *> TiedVariables,
-    const BitVector &ForbiddenRegisters) {
-  std::vector<InstructionTemplate> Instructions;
-  // Assign registers to variables in a round-robin manner. This is simple but
-  // ensures that the most register-constrained variable does not get starved.
-  std::vector<BitVector> PossibleRegsForVar;
-  for (const Variable *Var : TiedVariables) {
-    assert(Var);
-    const Operand &Op = IT.getInstr().getPrimaryOperand(*Var);
-    assert(Op.isReg());
-    BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits();
-    remove(PossibleRegs, ForbiddenRegisters);
-    PossibleRegsForVar.push_back(std::move(PossibleRegs));
+enum class RegRandomizationStrategy : uint8_t {
+  PickRandomRegs,
+  SingleStaticRegPerOperand,
+  SingleStaticReg,
+
+  FIRST = PickRandomRegs,
+  LAST = SingleStaticReg,
+};
+
+} // namespace exegesis
+
+template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
+  static constexpr bool is_iterable = true;
+};
+
+namespace exegesis {
+
+const char *getDescription(RegRandomizationStrategy S) {
+  switch (S) {
+  case RegRandomizationStrategy::PickRandomRegs:
+    return "randomizing registers";
+  case RegRandomizationStrategy::SingleStaticRegPerOperand:
+    return "one unique register for each position";
+  case RegRandomizationStrategy::SingleStaticReg:
+    return "reusing the same register for all positions";
   }
-  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
-  while (true) {
-    InstructionTemplate TmpIT = IT;
-    // Find a possible register for each variable in turn, marking the
-    // register as taken.
-    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
-      const int NextPossibleReg =
-          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
-      if (NextPossibleReg <= 0) {
-        return Instructions;
-      }
-      TmpIT.getValueFor(*TiedVariables[VarId]) =
-          MCOperand::createReg(NextPossibleReg);
-      // Bump iterator.
-      Iterators[VarId] = NextPossibleReg;
-      // Prevent other variables from using the register.
-      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
-        OtherPossibleRegs.reset(NextPossibleReg);
+  llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
+}
+
+static std::variant<std::nullopt_t, MCOperand, Register>
+generateSingleRegisterForInstrAvoidingDefUseOverlap(
+    const LLVMState &State, const BitVector &ForbiddenRegisters,
+    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
+    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
+    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
+    RegRandomizationStrategy S) {
+  const Instruction &Instr = IT.getInstr();
+  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
+         !IT.getValueFor(Op).isValid());
+  assert((!Op.isUse() || !Op.isTied()) &&
+         "Not expecting to see a tied use reg");
+
+  if (Op.isUse()) {
+    switch (S) {
+    case RegRandomizationStrategy::PickRandomRegs:
+      break;
+    case RegRandomizationStrategy::SingleStaticReg:
+    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
+      if (!Instructions.empty())
+        return Instructions.front().getValueFor(Op);
+      if (S != RegRandomizationStrategy::SingleStaticReg)
+        break;
+      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
+      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
+      if (std::optional<int> CommonBit =
+              getFirstCommonBit(PossibleRegisters, UseAliases))
+        return *CommonBit;
+      break;
+    }
+    }
+  }
+
+  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
+  remove(PossibleRegisters, ForbiddenRegisters);
+
+  if (Op.isDef()) {
+    remove(PossibleRegisters, ImplicitUseAliases);
+    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
+    remove(PossibleRegisters, UseAliases);
+  }
+
+  if (Op.isUse()) {
+    remove(PossibleRegisters, ImplicitDefAliases);
+    // NOTE: in general, using same reg for multiple Use's is fine.
+    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
+      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
+      remove(PossibleRegisters, UseAliases);
+    }
+  }
+
+  bool IsDefWithTiedUse =
+      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
+  if (Op.isUse() || IsDefWithTiedUse) {
+    // Now, important bit: if we have used some register for def,
+    // then we can not use that same register for *any* use,
+    // be it either an untied use, or an use tied to a def.
+    // But def-ing same regs is fine, as long as there are no uses!
+    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
+    remove(PossibleRegisters, DefsAliases);
+  }
+
+  if (!PossibleRegisters.any())
+    return std::nullopt;
+
+  return randomBit(PossibleRegisters);
+}
+
+static std::optional<InstructionTemplate>
+generateSingleSnippetForInstrAvoidingDefUseOverlap(
+    const LLVMState &State, const BitVector &ForbiddenRegisters,
+    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
+    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
+    const ArrayRef<InstructionTemplate> Instructions,
+    RegRandomizationStrategy S) {
+  const Instruction &Instr = IT.getInstr();
+  for (const Operand &Op : Instr.Operands) {
+    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
+        IT.getValueFor(Op).isValid())
+      continue;
+    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");
+
+    std::variant<std::nullopt_t, MCOperand, Register> R =
+        generateSingleRegisterForInstrAvoidingDefUseOverlap(
+            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
+            Uses, Defs, IT, Op, Instructions, S);
+
+    if (std::holds_alternative<std::nullopt_t>(R))
+      return {};
+
+    MCOperand MCOp;
+    if (std::holds_alternative<MCOperand>(R))
+      MCOp = std::get<MCOperand>(R);
+    else {
+      Register RandomReg = std::get<Register>(R);
+      if (Op.isDef())
+        Defs.set(RandomReg);
+      if (Op.isUse())
+        Uses.set(RandomReg);
+      MCOp = MCOperand::createReg(RandomReg);
+    }
+    IT.getValueFor(Op) = MCOp;
+  }
+  return IT;
+}
+
+static std::vector<InstructionTemplate>
+generateSnippetForInstrAvoidingDefUseOverlap(
+    const LLVMState &State, const InstructionTemplate &IT,
+    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
+  // We don't want to accidentally serialize the instruction,
+  // so we must be sure that we don't pick a def that is an implicit use,
+  // or a use that is an implicit def, so record implicit regs now.
+  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
+  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
+  for (const auto &Op : IT.getInstr().Operands) {
+    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
+      assert(Op.isImplicitReg() && "Not an implicit register operand?");
+      if (Op.isUse())
+        ImplicitUses.set(Op.getImplicitReg());
+      else {
+        assert(Op.isDef() && "Not a use and not a def?");
+        ImplicitDefs.set(Op.getImplicitReg());
       }
     }
-    Instructions.push_back(std::move(TmpIT));
+  }
+  const BitVector ImplicitUseAliases =
+      getAliasedBits(State.getRegInfo(), ImplicitUses);
+  const BitVector ImplicitDefAliases =
+      getAliasedBits(State.getRegInfo(), ImplicitDefs);
+
+  BitVector Defs(State.getRegInfo().getNumRegs());
+  BitVector Uses(State.getRegInfo().getNumRegs());
+  std::vector<InstructionTemplate> Instructions;
+
+  while (true) {
+    std::optional<InstructionTemplate> TmpIT =
+        generateSingleSnippetForInstrAvoidingDefUseOverlap(
+            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
+            Uses, Defs, IT, Instructions, S);
+    if (!TmpIT)
+      return Instructions;
+    Instructions.push_back(std::move(*TmpIT));
+    if (!hasVariablesWithTiedOperands(IT.getInstr()))
+      return Instructions;
+    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
   }
 }
 
@@ -177,78 +314,41 @@ ParallelSnippetGenerator::generateCodeTemplates(
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
     return getSingleton(std::move(CT));
   }
-  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
-  if (!TiedVariables.empty()) {
-    CT.Info = "instruction has tied variables, using static renaming.";
-    CT.Instructions = generateSnippetUsingStaticRenaming(
-        State, Variant, TiedVariables, ForbiddenRegisters);
-    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(std::move(CT));
-  }
-  // No tied variables, we pick random values for defs.
-
-  // We don't want to accidentally serialize the instruction,
-  // so we must be sure that we don't pick a def that is an implicit use,
-  // or a use that is an implicit def, so record implicit regs now.
-  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
-  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
-  for (const auto &Op : Instr.Operands) {
-    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
-      assert(Op.isImplicitReg() && "Not an implicit register operand?");
-      if (Op.isUse())
-        ImplicitUses.set(Op.getImplicitReg());
-      else {
-        assert(Op.isDef() && "Not a use and not a def?");
-        ImplicitDefs.set(Op.getImplicitReg());
-      }
-    }
-  }
-  const auto ImplicitUseAliases =
-      getAliasedBits(State.getRegInfo(), ImplicitUses);
-  const auto ImplicitDefAliases =
-      getAliasedBits(State.getRegInfo(), ImplicitDefs);
-  BitVector Defs(State.getRegInfo().getNumRegs());
-  for (const auto &Op : Instr.Operands) {
-    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
-      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
-      // Do not use forbidden registers and regs that are implicitly used.
-      // Note that we don't try to avoid using implicit defs explicitly.
-      remove(PossibleRegisters, ForbiddenRegisters);
-      remove(PossibleRegisters, ImplicitUseAliases);
-      if (!PossibleRegisters.any())
-        return make_error<StringError>(
-            Twine("no available registers:\ncandidates:\n")
-                .concat(debugString(State.getRegInfo(),
-                                    Op.getRegisterAliasing().sourceBits()))
-                .concat("\nforbidden:\n")
-                .concat(debugString(State.getRegInfo(), ForbiddenRegisters))
-                .concat("\nimplicit use:\n")
-                .concat(debugString(State.getRegInfo(), ImplicitUseAliases)),
-            inconvertibleErrorCode());
-      const auto RandomReg = randomBit(PossibleRegisters);
-      Defs.set(RandomReg);
-      Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
-    }
-  }
-  // And pick random use values that are not reserved and don't alias with defs.
-  // Note that we don't try to avoid using implicit uses explicitly.
-  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
-  for (const auto &Op : Instr.Operands) {
-    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
-      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
-      remove(PossibleRegisters, ForbiddenRegisters);
-      remove(PossibleRegisters, DefAliases);
-      remove(PossibleRegisters, ImplicitDefAliases);
-      assert(PossibleRegisters.any() && "No register left to choose from");
-      const auto RandomReg = randomBit(PossibleRegisters);
-      Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
-    }
+  std::vector<CodeTemplate> Result;
+  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
+  // If there are no tied operands, then we don't want to "saturate backedge",
+  // and the template we will produce will have only a single instruction.
+  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
+    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
+           !Op.isTied();
+  });
+  SmallVector<RegRandomizationStrategy, 3> Strategies;
+  if (HasTiedOperands || NumUntiedUseRegs >= 3)
+    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
+  if (HasTiedOperands || NumUntiedUseRegs >= 2)
+    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
+  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
+  for (RegRandomizationStrategy S : Strategies) {
+    CodeTemplate CurrCT = CT.clone();
+    CurrCT.Info =
+        Twine("instruction has ")
+            .concat(HasTiedOperands ? "" : "no ")
+            .concat("tied variables, avoiding "
+                    "Read-After-Write issue, picking random def and use "
+                    "registers not aliasing each other, for uses, ")
+            .concat(getDescription(S))
+            .str();
+    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
+        State, Variant, S, ForbiddenRegisters);
+    if (CurrCT.Instructions.empty())
+      return make_error<StringError>(
+          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
+          inconvertibleErrorCode());
+    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
+                              CurrCT.Instructions);
+    Result.push_back(std::move(CurrCT));
   }
-  CT.Info =
-      "instruction has no tied variables picking Uses different from defs";
-  CT.Instructions.push_back(std::move(Variant));
-  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-  return getSingleton(std::move(CT));
+  return Result;
 }
 
 constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
index b3a7118..15c2c60 100644 (file)
@@ -213,6 +213,15 @@ size_t randomBit(const BitVector &Vector) {
   return *Itr;
 }
 
+std::optional<int> getFirstCommonBit(const BitVector &A, const BitVector &B) {
+  BitVector Intersect = A;
+  Intersect &= B;
+  int idx = Intersect.find_first();
+  if (idx != -1)
+    return idx;
+  return {};
+}
+
 void setRandomAliasing(const AliasingConfigurations &AliasingConfigurations,
                        InstructionTemplate &DefIB, InstructionTemplate &UseIB) {
   assert(!AliasingConfigurations.empty());
index 7a53c03..0c2b617 100644 (file)
@@ -92,6 +92,9 @@ size_t randomIndex(size_t Max);
 // Precondition: Vector must have at least one bit set.
 size_t randomBit(const BitVector &Vector);
 
+// Picks a first bit that is common to these two vectors.
+std::optional<int> getFirstCommonBit(const BitVector &A, const BitVector &B);
+
 // Picks a random configuration, then selects a random def and a random use from
 // it and finally set the selected values in the provided InstructionInstances.
 void setRandomAliasing(const AliasingConfigurations &AliasingConfigurations,
index bca1a5e..007fa5b 100644 (file)
@@ -116,17 +116,17 @@ TEST_F(PPCParallelSnippetGeneratorTest, MemoryUse) {
   const unsigned Opcode = PPC::LDX;
   const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
   ASSERT_THAT(CodeTemplates, SizeIs(1));
-  const auto &CT = CodeTemplates[0];
-  EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables picking "
-                                 "Uses different from defs"));
-  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
-  ASSERT_THAT(CT.Instructions,
-              SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
-  const InstructionTemplate &IT = CT.Instructions[0];
-  EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
-  EXPECT_EQ(IT.getVariableValues()[1].getReg(), PPC::X1);
-  EXPECT_EQ(IT.getVariableValues()[2].getReg(), PPC::X13);
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables"));
+    EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+    ASSERT_THAT(CT.Instructions,
+                SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+    EXPECT_EQ(IT.getVariableValues()[1].getReg(), PPC::X1);
+    EXPECT_EQ(IT.getVariableValues()[2].getReg(), PPC::X13);
+  }
 }
 
 } // namespace
index f4bf2d2..b6d25f0 100644 (file)
@@ -14,6 +14,7 @@
 #include "SerialSnippetGenerator.h"
 #include "TestBase.h"
 #include "X86InstrInfo.h"
+#include "llvm/ADT/SetOperations.h"
 
 #include <unordered_set>
 
@@ -26,8 +27,10 @@ namespace {
 
 using testing::AnyOf;
 using testing::ElementsAre;
+using testing::Ge;
 using testing::Gt;
 using testing::HasSubstr;
+using testing::IsEmpty;
 using testing::Not;
 using testing::SizeIs;
 using testing::UnorderedElementsAre;
@@ -232,7 +235,7 @@ TEST_F(X86ParallelSnippetGeneratorTest, SerialInstruction) {
   ASSERT_THAT(IT.getVariableValues(), SizeIs(0));
 }
 
-TEST_F(X86ParallelSnippetGeneratorTest, StaticRenaming) {
+TEST_F(X86ParallelSnippetGeneratorTest, ReadAfterWrite_CMOV32rr) {
   // CMOV32rr has tied variables, we enumerate the possible values to execute
   // as many in parallel as possible.
 
@@ -248,19 +251,69 @@ TEST_F(X86ParallelSnippetGeneratorTest, StaticRenaming) {
   // - hasAliasingRegisters
   const unsigned Opcode = X86::CMOV32rr;
   const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
-  ASSERT_THAT(CodeTemplates, SizeIs(1));
-  const auto &CT = CodeTemplates[0];
-  EXPECT_THAT(CT.Info, HasSubstr("static renaming"));
-  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
-  constexpr const unsigned kInstructionCount = 15;
-  ASSERT_THAT(CT.Instructions, SizeIs(kInstructionCount));
-  std::unordered_set<unsigned> AllDefRegisters;
-  for (const auto &IT : CT.Instructions) {
-    ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
-    AllDefRegisters.insert(IT.getVariableValues()[0].getReg());
+  ASSERT_THAT(CodeTemplates, SizeIs(3));
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Info, HasSubstr("avoiding Read-After-Write issue"));
+    EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+    ASSERT_GT(CT.Instructions.size(), 1U);
+    std::unordered_set<unsigned> AllDefRegisters;
+    std::unordered_set<unsigned> AllUseRegisters;
+    for (const auto &IT : CT.Instructions) {
+      ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+      AllDefRegisters.insert(IT.getVariableValues()[0].getReg());
+      AllUseRegisters.insert(IT.getVariableValues()[1].getReg());
+    }
+    EXPECT_THAT(AllDefRegisters, SizeIs(CT.Instructions.size()))
+        << "Each instruction writes to a different register";
+    EXPECT_THAT(AllUseRegisters, Not(IsEmpty()))
+        << "In total, some other registers are used";
+    auto AllDefAndUseRegs = AllUseRegisters;
+    llvm::set_intersect(AllDefAndUseRegs, AllDefRegisters); // A := A ^ B
+    EXPECT_THAT(AllDefAndUseRegs, IsEmpty())
+        << "No instruction uses any register defined by any of the "
+           "instructions";
+  }
+}
+
+TEST_F(X86ParallelSnippetGeneratorTest, ReadAfterWrite_VFMADD132PDr) {
+  // VFMADD132PDr has tied variables, we enumerate the possible values
+  // to execute as many in parallel as possible.
+
+  // - VFMADD132PDr
+  // - Op0 Explicit Def RegClass(XMM)
+  // - Op1 Explicit Use RegClass(XMM) TiedToOp0
+  // - Op2 Explicit Use RegClass(XMM)
+  // - Op3 Explicit Use RegClass(XMM)
+  // - Var0 [Op0,Op1]
+  // - Var1 [Op2]
+  // - Var2 [Op3]
+  // - hasTiedRegisters (execution is always serial)
+  // - hasAliasingRegisters
+  const unsigned Opcode = X86::VFMADD132PDr;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(3));
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Info, HasSubstr("avoiding Read-After-Write issue"));
+    EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+    ASSERT_GT(CT.Instructions.size(), 1U);
+    std::unordered_set<unsigned> AllDefRegisters;
+    std::unordered_set<unsigned> AllUseRegisters;
+    for (const auto &IT : CT.Instructions) {
+      ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+      AllDefRegisters.insert(IT.getVariableValues()[0].getReg());
+      AllUseRegisters.insert(IT.getVariableValues()[1].getReg());
+      AllUseRegisters.insert(IT.getVariableValues()[2].getReg());
+    }
+    EXPECT_THAT(AllDefRegisters, SizeIs(CT.Instructions.size()))
+        << "Each instruction writes to a different register";
+    EXPECT_THAT(AllUseRegisters, Not(IsEmpty()))
+        << "In total, some other registers are used";
+    auto AllDefAndUseRegs = AllUseRegisters;
+    llvm::set_intersect(AllDefAndUseRegs, AllDefRegisters); // A := A ^ B
+    EXPECT_THAT(AllDefAndUseRegs, IsEmpty())
+        << "No instruction uses any register defined by any of the "
+           "instructions";
   }
-  EXPECT_THAT(AllDefRegisters, SizeIs(kInstructionCount))
-      << "Each instruction writes to a different register";
 }
 
 TEST_F(X86ParallelSnippetGeneratorTest, NoTiedVariables) {
@@ -280,21 +333,22 @@ TEST_F(X86ParallelSnippetGeneratorTest, NoTiedVariables) {
   // - hasAliasingRegisters
   const unsigned Opcode = X86::CMOV_GR32;
   const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
-  ASSERT_THAT(CodeTemplates, SizeIs(1));
-  const auto &CT = CodeTemplates[0];
-  EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
-  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
-  ASSERT_THAT(CT.Instructions, SizeIs(1));
-  const InstructionTemplate &IT = CT.Instructions[0];
-  EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.getVariableValues(), SizeIs(4));
-  EXPECT_THAT(IT.getVariableValues()[0].getReg(),
-              Not(IT.getVariableValues()[1].getReg()))
-      << "Def is different from first Use";
-  EXPECT_THAT(IT.getVariableValues()[0].getReg(),
-              Not(IT.getVariableValues()[2].getReg()))
-      << "Def is different from second Use";
-  EXPECT_THAT(IT.getVariableValues()[3], IsInvalid());
+  ASSERT_THAT(CodeTemplates, SizeIs(2));
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
+    EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+    ASSERT_THAT(CT.Instructions, SizeIs(1));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.getVariableValues(), SizeIs(4));
+    EXPECT_THAT(IT.getVariableValues()[0].getReg(),
+                Not(IT.getVariableValues()[1].getReg()))
+        << "Def is different from first Use";
+    EXPECT_THAT(IT.getVariableValues()[0].getReg(),
+                Not(IT.getVariableValues()[2].getReg()))
+        << "Def is different from second Use";
+    EXPECT_THAT(IT.getVariableValues()[3], IsInvalid());
+  }
 }
 
 TEST_F(X86ParallelSnippetGeneratorTest, MemoryUse) {
@@ -317,18 +371,19 @@ TEST_F(X86ParallelSnippetGeneratorTest, MemoryUse) {
   const unsigned Opcode = X86::MOV32rm;
   const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
   ASSERT_THAT(CodeTemplates, SizeIs(1));
-  const auto &CT = CodeTemplates[0];
-  EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
-  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
-  ASSERT_THAT(CT.Instructions,
-              SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
-  const InstructionTemplate &IT = CT.Instructions[0];
-  EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.getVariableValues(), SizeIs(6));
-  EXPECT_EQ(IT.getVariableValues()[2].getImm(), 1);
-  EXPECT_EQ(IT.getVariableValues()[3].getReg(), 0u);
-  EXPECT_EQ(IT.getVariableValues()[4].getImm(), 0);
-  EXPECT_EQ(IT.getVariableValues()[5].getReg(), 0u);
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
+    EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+    ASSERT_THAT(CT.Instructions,
+                SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.getVariableValues(), SizeIs(6));
+    EXPECT_EQ(IT.getVariableValues()[2].getImm(), 1);
+    EXPECT_EQ(IT.getVariableValues()[3].getReg(), 0u);
+    EXPECT_EQ(IT.getVariableValues()[4].getImm(), 0);
+    EXPECT_EQ(IT.getVariableValues()[5].getReg(), 0u);
+  }
 }
 
 TEST_F(X86ParallelSnippetGeneratorTest, MOV16ms) {
@@ -362,7 +417,7 @@ TEST_F(X86ParallelSnippetGeneratorTest,
   auto Err = Generator.generateCodeTemplates(&Instr, AllRegisters);
   EXPECT_FALSE((bool)Err);
   EXPECT_THAT(toString(Err.takeError()),
-              testing::HasSubstr("no available registers"));
+              testing::HasSubstr("Failed to produce any snippet"));
 }
 
 class X86FakeSnippetGenerator : public SnippetGenerator {