LLD Support for Basic Block Sections

author Sriraman Tallam <tmsriram@google.com>

Tue, 7 Apr 2020 13:48:18 +0000 (06:48 -0700)

committer Sriraman Tallam <tmsriram@google.com>

Tue, 7 Apr 2020 13:55:57 +0000 (06:55 -0700)
author Sriraman Tallam <tmsriram@google.com>
Tue, 7 Apr 2020 13:48:18 +0000 (06:48 -0700)
committer Sriraman Tallam <tmsriram@google.com>
Tue, 7 Apr 2020 13:55:57 +0000 (06:55 -0700)
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp

index 7cb6f320fdef4874e7abe31b6ffac451a20019e5..5d9ce6f6a3dc0423fb99ce180ab8d23a8de57cf7 100644 (file)
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -7,6 +7,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "InputFiles.h"
+#include "OutputSections.h"
  #include "Symbols.h"
  #include "SyntheticSections.h"
  #include "Target.h"
@@ -37,6 +38,8 @@ public:
                  uint64_t pltEntryAddr) const override;
    void relocate(uint8_t *loc, const Relocation &rel,
                  uint64_t val) const override;
+  void applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                         unsigned size) const override;
  
    RelExpr adjustRelaxExpr(RelType type, const uint8_t *data,
                            RelExpr expr) const override;
@@ -52,9 +55,25 @@ public:
                        uint64_t val) const override;
    bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                          uint8_t stOther) const override;
+  bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                             InputSection *nextIS) const override;
  };
  } // namespace
  
+// Table of x86-64 NOP encodings indexed by (length - 1): entry i holds a NOP
+// sequence that is exactly i + 1 bytes long, covering lengths 1 through 9.
+// The appropriately sized instructions are used to fill the gaps between
+// sections which are executed during fall through.
+static const std::vector<std::vector<uint8_t>> nopInstructions = {
+    {0x90},
+    {0x66, 0x90},
+    {0x0f, 0x1f, 0x00},
+    {0x0f, 0x1f, 0x40, 0x00},
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
+    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
+
  X86_64::X86_64() {
    copyRel = R_X86_64_COPY;
    gotRel = R_X86_64_GLOB_DAT;
@@ -71,6 +90,7 @@ X86_64::X86_64() {
    pltEntrySize = 16;
    ipltEntrySize = 16;
    trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
+  nopInstrs = nopInstructions;
  
    // Align to the large page size (known as a superpage or huge page).
    // FreeBSD automatically promotes large, superpage-aligned allocations.
@@ -79,6 +99,216 @@ X86_64::X86_64() {
  
  int X86_64::getTlsGdRelaxSkip(RelType type) const { return 2; }
  
+// Symbolic kinds for the different X86_64 jmp instructions.  These are
+// identifiers, not raw opcode bytes: getJmpInsnType() maps opcode bytes to
+// these values and X86_64::applyJumpInstrMod() maps them back to opcode bytes.
+// J_UNKNOWN marks an instruction that is not one of the recognized jumps.
+enum JmpInsnOpcode : uint32_t {
+  J_JMP_32,
+  J_JNE_32,
+  J_JE_32,
+  J_JG_32,
+  J_JGE_32,
+  J_JB_32,
+  J_JBE_32,
+  J_JL_32,
+  J_JLE_32,
+  J_JA_32,
+  J_JAE_32,
+  J_UNKNOWN,
+};
+
+// Classifies a jump instruction from its opcode bytes.  SECOND points at the
+// last opcode byte and FIRST at the byte preceding it (FIRST may be null).
+// A 0xe9 in SECOND is a direct jmp regardless of FIRST; otherwise FIRST must
+// be the 0x0f escape byte and SECOND selects the conditional jump.  Anything
+// else yields J_UNKNOWN.
+static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
+                                    const uint8_t *second) {
+  const uint8_t opcode = *second;
+  if (opcode == 0xe9)
+    return J_JMP_32;
+
+  // Every conditional form recognized here requires the 0x0f prefix byte.
+  if (first == nullptr || *first != 0x0f)
+    return J_UNKNOWN;
+
+  switch (opcode) {
+  case 0x82:
+    return J_JB_32;
+  case 0x83:
+    return J_JAE_32;
+  case 0x84:
+    return J_JE_32;
+  case 0x85:
+    return J_JNE_32;
+  case 0x86:
+    return J_JBE_32;
+  case 0x87:
+    return J_JA_32;
+  case 0x8c:
+    return J_JL_32;
+  case 0x8d:
+    return J_JGE_32;
+  case 0x8e:
+    return J_JLE_32;
+  case 0x8f:
+    return J_JG_32;
+  default:
+    return J_UNKNOWN;
+  }
+}
+
+// Searches IS's relocations from last to first for a live (non-R_NONE)
+// relocation located at OFFSET and returns its index.  If there is none, the
+// number of relocations (i.e. an out-of-range index) is returned.
+static unsigned getRelocationWithOffset(const InputSection &is,
+                                        uint64_t offset) {
+  const unsigned numRelocs = is.relocations.size();
+  for (unsigned idx = numRelocs; idx != 0; --idx) {
+    const Relocation &rel = is.relocations[idx - 1];
+    if (rel.expr != R_NONE && rel.offset == offset)
+      return idx - 1;
+  }
+  return numRelocs;
+}
+
+// Tells whether R has one of the relocation types that are emitted for jump
+// instructions (PLT32, PC32 or PC8).
+// TODO: Once special relocations for relaxable jump instructions are available,
+// this should be modified to use those relocations.
+static bool isRelocationForJmpInsn(Relocation &R) {
+  switch (R.type) {
+  case R_X86_64_PLT32:
+  case R_X86_64_PC32:
+  case R_X86_64_PC8:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Return true if Relocation R points to the first instruction in the
+// next section.  Returns false right away when R is not one of the
+// relocation types accepted by isRelocationForJmpInsn().
+// IS is the section containing R, FILE is forwarded to getRelocTargetVA() and
+// NEXTIS is the section laid out after IS; it is dereferenced unconditionally,
+// so callers must pass a non-null pointer.
+// TODO: Delete this once psABI reserves a new relocation type for fall thru
+// jumps.
+static bool isFallThruRelocation(InputSection &is, InputFile *file,
+                                 InputSection *nextIS, Relocation &r) {
+  if (!isRelocationForJmpInsn(r))
+    return false;
+
+  // Output address of the relocated field inside IS.
+  uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
+  uint64_t targetOffset = InputSectionBase::getRelocTargetVA(
+      file, r.type, r.addend, addrLoc, *r.sym, r.expr);
+
+  // If this jmp is a fall thru, the target offset is the beginning of the
+  // next section.
+  uint64_t nextSectionOffset =
+      nextIS->getOutputSection()->addr + nextIS->outSecOff;
+  // NOTE(review): the "+ 4" presumably skips a 4-byte displacement field so
+  // that addrLoc + 4 is the address right after the jump; that would not hold
+  // for an R_X86_64_PC8 relocation -- confirm.
+  return (addrLoc + 4 + targetOffset) == nextSectionOffset;
+}
+
+// Maps a conditional jump kind to the kind testing the opposite condition,
+// e.g. JE <-> JNE.  Kinds without an inverse (the direct jmp and J_UNKNOWN)
+// map to J_UNKNOWN.
+static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
+  // Each row lists two kinds that are inverses of one another.
+  static const JmpInsnOpcode inversePairs[][2] = {
+      {J_JE_32, J_JNE_32},  {J_JG_32, J_JLE_32}, {J_JGE_32, J_JL_32},
+      {J_JB_32, J_JAE_32},  {J_JBE_32, J_JA_32},
+  };
+  for (const auto &pair : inversePairs) {
+    if (opcode == pair[0])
+      return pair[1];
+    if (opcode == pair[1])
+      return pair[0];
+  }
+  return J_UNKNOWN;
+}
+
+// Deletes a direct jump instruction at the end of an input section when it
+// jumps to the following section, as it is not required.  If the section ends
+// with a conditional jump followed by a direct jump, it checks whether the
+// conditional jump can be flipped so that the direct jump can be deleted.
+// For example:
+// .section .text
+// a.BB.foo:
+//    ...
+//    10: jne aa.BB.foo
+//    16: jmp bar
+// aa.BB.foo:
+//    ...
+//
+// can be converted to:
+// a.BB.foo:
+//   ...
+//   10: je bar  #jne flipped to je and the jmp is deleted.
+// aa.BB.foo:
+//   ...
+//
+// IS is the section to shrink, FILE is used to resolve relocation targets and
+// NEXTIS is the section placed right after IS (null if there is none).
+// Returns true if a jump instruction was deleted from IS, false otherwise.
+// On success the trailing bytes are only marked as dropped and the section is
+// flagged to be padded with NOPs; the relocation of the deleted jump is
+// cancelled by setting its expr to R_NONE.
+bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                                   InputSection *nextIS) const {
+  const unsigned sizeOfDirectJmpInsn = 5;
+
+  if (nextIS == nullptr)
+    return false;
+
+  if (is.getSize() < sizeOfDirectJmpInsn)
+    return false;
+
+  // If this jmp insn can be removed, it is the last insn and the
+  // relocation is 4 bytes before the end.
+  unsigned rIndex = getRelocationWithOffset(is, is.getSize() - 4);
+  if (rIndex == is.relocations.size())
+    return false;
+
+  Relocation &r = is.relocations[rIndex];
+
+  // Check if the relocation corresponds to a direct jmp.
+  const uint8_t *secContents = is.data().data();
+  // If it is not a direct jmp instruction, there is nothing to do here.
+  if (*(secContents + r.offset - 1) != 0xe9)
+    return false;
+
+  if (isFallThruRelocation(is, file, nextIS, r)) {
+    // This is a fall thru and can be deleted.
+    r.expr = R_NONE;
+    r.offset = 0;
+    is.drop_back(sizeOfDirectJmpInsn);
+    is.nopFiller = true;
+    return true;
+  }
+
+  // Now, check if flip and delete is possible.
+  const unsigned sizeOfJmpCCInsn = 6;
+  // To flip, there must be at least one JmpCC and one direct jmp.
+  if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
+    return false;
+
+  // The conditional jump, if any, sits right before the direct jmp, so its
+  // relocation is 4 bytes before the start of the direct jmp.
+  unsigned rbIndex =
+      getRelocationWithOffset(is, (is.getSize() - sizeOfDirectJmpInsn - 4));
+  if (rbIndex == is.relocations.size())
+    return false;
+
+  Relocation &rB = is.relocations[rbIndex];
+
+  // jmpInsnB points at the last opcode byte of the candidate JmpCC; the byte
+  // before it is expected to be the 0x0f prefix.
+  const uint8_t *jmpInsnB = secContents + rB.offset - 1;
+  JmpInsnOpcode jmpOpcodeB = getJmpInsnType(jmpInsnB - 1, jmpInsnB);
+  if (jmpOpcodeB == J_UNKNOWN)
+    return false;
+
+  if (!isFallThruRelocation(is, file, nextIS, rB))
+    return false;
+
+  // jmpCC jumps to the fall thru block, the branch can be flipped and the
+  // jmp can be deleted.
+  JmpInsnOpcode jInvert = invertJmpOpcode(jmpOpcodeB);
+  if (jInvert == J_UNKNOWN)
+    return false;
+  // Record the opcode rewrite; it is applied when the section is written.
+  is.jumpInstrMods.push_back({jInvert, (rB.offset - 1), 4});
+  // Move R's values to rB except the offset.
+  rB = {r.expr, r.type, rB.offset, r.addend, r.sym};
+  // Cancel R
+  r.expr = R_NONE;
+  r.offset = 0;
+  is.drop_back(sizeOfDirectJmpInsn);
+  is.nopFiller = true;
+  return true;
+}
+
  RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
                             const uint8_t *loc) const {
    if (type == R_X86_64_GOTTPOFF)
@@ -357,6 +587,94 @@ void X86_64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
          "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD");
  }
  
+// Rewrites the opcode of the jump instruction whose last opcode byte is at
+// LOC so that it encodes the jump kind TYPE.  This is what carries out the
+// JumpInstrMods recorded while relaxing jumps for basic block sections.
+// When SIZE is 4 the long form is written (0xe9 for jmp, 0x0f 0x8x for the
+// conditional jumps, with the 0x0f stored at loc[-1]); for any other SIZE the
+// one-byte short opcode is written (0xeb for jmp, 0x7x for the conditional
+// jumps).
+void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                               unsigned size) const {
+  const bool longForm = size == 4;
+
+  // One-byte opcode of the short form of a conditional jump.  The second
+  // opcode byte of the matching long form is always this value plus 0x10.
+  uint8_t shortOpcode = 0;
+  switch (type) {
+  case J_JMP_32:
+    *loc = longForm ? 0xe9 : 0xeb;
+    return;
+  case J_JB_32:
+    shortOpcode = 0x72;
+    break;
+  case J_JAE_32:
+    shortOpcode = 0x73;
+    break;
+  case J_JE_32:
+    shortOpcode = 0x74;
+    break;
+  case J_JNE_32:
+    shortOpcode = 0x75;
+    break;
+  case J_JBE_32:
+    shortOpcode = 0x76;
+    break;
+  case J_JA_32:
+    shortOpcode = 0x77;
+    break;
+  case J_JL_32:
+    shortOpcode = 0x7c;
+    break;
+  case J_JGE_32:
+    shortOpcode = 0x7d;
+    break;
+  case J_JLE_32:
+    shortOpcode = 0x7e;
+    break;
+  case J_JG_32:
+    shortOpcode = 0x7f;
+    break;
+  case J_UNKNOWN:
+    llvm_unreachable("Unknown Jump Relocation");
+  default:
+    // Values outside the enum are left untouched.
+    return;
+  }
+
+  if (longForm) {
+    loc[-1] = 0x0f;
+    *loc = static_cast<uint8_t>(shortOpcode + 0x10);
+  } else {
+    *loc = shortOpcode;
+  }
+}
+
  void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
    switch (rel.type) {
    case R_X86_64_8:
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h

index 0263708634cbba7207fdcf04c36ffae9a8d0c886..70ef86239aa2f68758186e0e87bbec56d235aacb 100644 (file)
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -114,6 +114,7 @@ struct Configuration {
    llvm::StringRef sysroot;
    llvm::StringRef thinLTOCacheDir;
    llvm::StringRef thinLTOIndexOnlyArg;
+  llvm::StringRef ltoBasicBlockSections;
    std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
    std::pair<llvm::StringRef, llvm::StringRef> thinLTOPrefixReplace;
    std::string rpath;
@@ -165,6 +166,7 @@ struct Configuration {
    bool ltoCSProfileGenerate;
    bool ltoDebugPassManager;
    bool ltoNewPassManager;
+  bool ltoUniqueBBSectionNames;
    bool ltoWholeProgramVisibility;
    bool mergeArmExidx;
    bool mipsN32Abi = false;
@@ -175,6 +177,7 @@ struct Configuration {
    bool nostdlib;
    bool oFormatBinary;
    bool omagic;
+  bool optimizeBBJumps;
    bool optRemarksWithHotness;
    bool picThunk;
    bool pie;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp

index c31cbcc8a54d08dfc5c453cda0cf43f4d7781fac..914cfefbd3292e60b034ea54852ede1fc719e600 100644 (file)
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -878,6 +878,8 @@ static void readConfigs(opt::InputArgList &args) {
    config->cref = args.hasFlag(OPT_cref, OPT_no_cref, false);
    config->defineCommon = args.hasFlag(OPT_define_common, OPT_no_define_common,
                                        !args.hasArg(OPT_relocatable));
+  config->optimizeBBJumps =
+      args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false);
    config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true);
    config->dependentLibraries = args.hasFlag(OPT_dependent_libraries, OPT_no_dependent_libraries, true);
    config->disableVerify = args.hasArg(OPT_disable_verify);
@@ -924,6 +926,11 @@ static void readConfigs(opt::InputArgList &args) {
    config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq);
    config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1);
    config->ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile);
+  config->ltoBasicBlockSections =
+      args.getLastArgValue(OPT_lto_basicblock_sections);
+  config->ltoUniqueBBSectionNames =
+      args.hasFlag(OPT_lto_unique_bb_section_names,
+                   OPT_no_lto_unique_bb_section_names, false);
    config->mapFile = args.getLastArgValue(OPT_Map);
    config->mipsGotSize = args::getInteger(args, OPT_mips_got_size, 0xfff0);
    config->mergeArmExidx =
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp

index e93dec947d9031c3255f0dc60c8729867739bca9..13c3dd486c3324ad4c38ccdecf3dee08e212004e 100644 (file)
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -138,7 +138,7 @@ size_t InputSectionBase::getSize() const {
      return s->getSize();
    if (uncompressedSize >= 0)
      return uncompressedSize;
-  return rawData.size();
+  return rawData.size() - bytesDropped;
  }
  
  void InputSectionBase::uncompress() const {
@@ -659,8 +659,9 @@ static int64_t getTlsTpOffset(const Symbol &s) {
    }
  }
  
-static uint64_t getRelocTargetVA(const InputFile *file, RelType type, int64_t a,
-                                 uint64_t p, const Symbol &sym, RelExpr expr) {
+uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
+                                            int64_t a, uint64_t p,
+                                            const Symbol &sym, RelExpr expr) {
    switch (expr) {
    case R_ABS:
    case R_DTPREL:
@@ -871,6 +872,12 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
      if (expr == R_NONE)
        continue;
  
+    if (expr == R_SIZE) {
+      target->relocateNoSym(bufLoc, type,
+                            SignExtend64<bits>(sym.getSize() + addend));
+      continue;
+    }
+
      if (expr != R_ABS && expr != R_DTPREL && expr != R_RISCV_ADD) {
        std::string msg = getLocation<ELFT>(offset) +
                          ": has non-ABS relocation " + toString(type) +
@@ -942,6 +949,8 @@ void InputSectionBase::relocateAlloc(uint8_t *buf, uint8_t *bufEnd) {
    const unsigned bits = config->wordsize * 8;
  
    for (const Relocation &rel : relocations) {
+    if (rel.expr == R_NONE)
+      continue;
      uint64_t offset = rel.offset;
      if (auto *sec = dyn_cast<InputSection>(this))
        offset += sec->outSecOff;
@@ -1011,6 +1020,18 @@ void InputSectionBase::relocateAlloc(uint8_t *buf, uint8_t *bufEnd) {
        break;
      }
    }
+
+  // Apply jumpInstrMods.  jumpInstrMods are created when the opcode of
+  // a jmp insn must be modified to shrink the jmp insn or to flip the jmp
+  // insn.  This is primarily used to relax and optimize jumps created with
+  // basic block sections.
+  if (auto *sec = dyn_cast<InputSection>(this)) {
+    for (const JumpInstrMod &jumpMod : jumpInstrMods) {
+      uint64_t offset = jumpMod.offset + sec->outSecOff;
+      uint8_t *bufLoc = buf + offset;
+      target->applyJumpInstrMod(bufLoc, jumpMod.original, jumpMod.size);
+    }
+  }
  }
  
  // For each function-defining prologue, find any calls to __morestack,
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h

index fe2c3c516a9651ecf906719170b453aab67510e7..719971b9c72a1d91463fc58160c952c492277d7a 100644 (file)
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -128,6 +128,26 @@ public:
      return cast_or_null<ObjFile<ELFT>>(file);
    }
  
+  // If basic block sections are enabled, many code sections could end up with
+  // one or two jump instructions at the end that could be relaxed to a smaller
+  // instruction. The members below help trimming the trailing jump instruction
+  // and shrinking a section.
+  unsigned bytesDropped = 0;
+
+  // Marks NUM more bytes at the end of the section as dropped.  Only the
+  // counter changes here; rawData itself is shrunk later by trim().
+  void drop_back(uint64_t num) { bytesDropped += num; }
+
+  // Undoes a previous drop_back() for NUM bytes.  Asserts that at least NUM
+  // bytes are currently marked as dropped.
+  void push_back(uint64_t num) {
+    assert(bytesDropped >= num);
+    bytesDropped -= num;
+  }
+
+  // Commits the pending drops: removes bytesDropped bytes from the end of
+  // rawData and resets the counter.  Does nothing if no bytes are pending.
+  void trim() {
+    if (bytesDropped) {
+      rawData = rawData.drop_back(bytesDropped);
+      bytesDropped = 0;
+    }
+  }
+
    ArrayRef<uint8_t> data() const {
      if (uncompressedSize >= 0)
        uncompress();
@@ -183,12 +203,25 @@ public:
    // the mmap'ed output buffer.
    template <class ELFT> void relocate(uint8_t *buf, uint8_t *bufEnd);
    void relocateAlloc(uint8_t *buf, uint8_t *bufEnd);
+  static uint64_t getRelocTargetVA(const InputFile *File, RelType Type,
+                                   int64_t A, uint64_t P, const Symbol &Sym,
+                                   RelExpr Expr);
  
    // The native ELF reloc data type is not very convenient to handle.
    // So we convert ELF reloc records to our own records in Relocations.cpp.
    // This vector contains such "cooked" relocations.
    std::vector<Relocation> relocations;
  
+  // Indicates that this section needs to be padded with a NOP filler if set to
+  // true.
+  bool nopFiller = false;
+
+  // These are modifiers to jump instructions that are necessary when basic
+  // block sections are enabled.  Basic block sections creates opportunities to
+  // relax jump instructions at basic block boundaries after reordering the
+  // basic blocks.
+  std::vector<JumpInstrMod> jumpInstrMods;
+
    // A function compiled with -fsplit-stack calling a function
    // compiled without -fsplit-stack needs its prologue adjusted. Find
    // such functions and adjust their prologues.  This is very similar
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp

index f4e04a82f0c2afa99df76bd754bf2fa36586bc1c..d09f2a74e48cfb474b9a34c28858e0f970ec72ea 100644 (file)
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -76,6 +76,32 @@ static lto::Config createConfig() {
    c.Options.FunctionSections = true;
    c.Options.DataSections = true;
  
+  // Check if basic block sections must be used.
+  // Allowed values for --lto-basicblock-sections are "all", "labels",
+  // "<file name specifying basic block ids>", or none.  This is the equivalent
+  // of -fbasicblock-sections= flag in clang.
+  if (!config->ltoBasicBlockSections.empty()) {
+    if (config->ltoBasicBlockSections == "all") {
+      c.Options.BBSections = BasicBlockSection::All;
+    } else if (config->ltoBasicBlockSections == "labels") {
+      c.Options.BBSections = BasicBlockSection::Labels;
+    } else if (config->ltoBasicBlockSections == "none") {
+      c.Options.BBSections = BasicBlockSection::None;
+    } else {
+      ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+          MemoryBuffer::getFile(config->ltoBasicBlockSections.str());
+      if (!MBOrErr) {
+        error("cannot open " + config->ltoBasicBlockSections + ":" +
+              MBOrErr.getError().message());
+      } else {
+        c.Options.BBSectionsFuncListBuf = std::move(*MBOrErr);
+      }
+      c.Options.BBSections = BasicBlockSection::List;
+    }
+  }
+
+  c.Options.UniqueBBSectionNames = config->ltoUniqueBBSectionNames;
+
    if (auto relocModel = getRelocModelFromCMModel())
      c.RelocModel = *relocModel;
    else if (config->relocatable)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td

index f3afb03cba471f46b141a6f80dfe0b2836a70a3d..83b9fa467033ec5cc4151d98cfd014b8eb4abf00 100644 (file)
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -42,6 +42,10 @@ defm compress_debug_sections:
  
  defm defsym: Eq<"defsym", "Define a symbol alias">, MetaVarName<"<symbol>=<value>">;
  
+defm optimize_bb_jumps: B<"optimize-bb-jumps",
+    "Remove direct jumps at the end to the next basic block",
+    "Do not remove any direct jumps at the end to the next basic block (default)">;
+
  defm split_stack_adjust_size
      : Eq<"split-stack-adjust-size",
           "Specify adjustment to stack size when a split-stack function calls a "
@@ -502,6 +506,11 @@ def opt_remarks_format: Separate<["--"], "opt-remarks-format">,
    HelpText<"The format used for serializing remarks (default: YAML)">;
  defm plugin_opt: Eq<"plugin-opt", "specifies LTO options for compatibility with GNU linkers">;
  def save_temps: F<"save-temps">;
+def lto_basicblock_sections: J<"lto-basicblock-sections=">,
+  HelpText<"Enable basic block sections for LTO">;
+defm lto_unique_bb_section_names: B<"lto-unique-bb-section-names",
+    "Give unique names to every basic block section for LTO",
+    "Do not give unique names to every basic block section for LTO (default)">;
  def shuffle_sections: J<"shuffle-sections=">, MetaVarName<"<seed>">,
    HelpText<"Shuffle input sections using the given seed. If 0, use a random seed">;
  def thinlto_cache_dir: J<"thinlto-cache-dir=">,
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp

index c38d4f0b67508014bd2dc1561f433ab1d3dd8668..59868e38d72fca7d52f874fd61c7e59264a73068 100644 (file)
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -242,6 +242,25 @@ void OutputSection::sort(llvm::function_ref<int(InputSectionBase *s)> order) {
        sortByOrder(isd->sections, order);
  }
  
+// Fill [buf, buf + size) with the target's NOP instructions.  The longest
+// available NOP is repeated as often as it fits and the remaining bytes, if
+// any, are covered by the single NOP of exactly that length.  The caller must
+// make sure target->nopInstrs is set.
+static void nopInstrFill(uint8_t *buf, size_t size) {
+  if (size == 0)
+    return;
+  // Bind by reference; there is no need to copy the whole table on each call.
+  const std::vector<std::vector<uint8_t>> &nopFiller = *target->nopInstrs;
+  const std::vector<uint8_t> &longestNop = nopFiller.back();
+  size_t i = 0;
+  for (size_t num = size / longestNop.size(); num > 0; --num) {
+    memcpy(buf + i, longestNop.data(), longestNop.size());
+    i += longestNop.size();
+  }
+  const size_t remaining = size - i;
+  if (!remaining)
+    return;
+  // Entry k of the table is expected to be the NOP that is k + 1 bytes long.
+  assert(nopFiller[remaining - 1].size() == remaining);
+  memcpy(buf + i, nopFiller[remaining - 1].data(), remaining);
+}
+
  // Fill [Buf, Buf + Size) with Filler.
  // This is used for linker script "=fillexp" command.
  static void fill(uint8_t *buf, size_t size,
@@ -330,7 +349,11 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
          end = buf + size;
        else
          end = buf + sections[i + 1]->outSecOff;
-      fill(start, end - start, filler);
+      if (isec->nopFiller) {
+        assert(target->nopInstrs);
+        nopInstrFill(start, end - start);
+      } else
+        fill(start, end - start, filler);
      }
    });
  
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h

index b3181b67ad0e7afb3ebbd6e3573b921dd7afc27b..ec59c63410d07badca0f6201f8611536642492d0 100644 (file)
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -24,6 +24,7 @@ class SectionBase;
  
  // Represents a relocation type, such as R_X86_64_PC32 or R_ARM_THM_CALL.
  using RelType = uint32_t;
+using JumpModType = uint32_t;
  
  // List of target-independent relocation types. Relocations read
  // from files are converted to these types so that the main code
@@ -108,6 +109,15 @@ struct Relocation {
    Symbol *sym;
  };
  
+// Manipulate jump instructions with these modifiers.  These are used to relax
+// jump instruction opcodes at basic block boundaries and are particularly
+// useful when basic block sections are enabled.
+struct JumpInstrMod {
+  // The jump kind to encode.  Despite the name, this is the kind that gets
+  // written: X86_64::deleteFallThruJmpInsn stores the inverted opcode kind
+  // here and relocateAlloc passes it to applyJumpInstrMod as the type.
+  JumpModType original;
+  // Offset within the input section of the opcode byte to rewrite;
+  // relocateAlloc adds the section's outSecOff to it.
+  uint64_t offset;
+  // Passed to applyJumpInstrMod as its size argument.  X86_64 writes the long
+  // opcode form when this is 4 and the short form otherwise.
+  unsigned size;
+};
+
  // This function writes undefined symbol diagnostics to an internal buffer.
  // Call reportUndefinedSymbols() after calling scanRelocations() to emit
  // the diagnostics.
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h

index f58f216332a50e90f8a4b7d3a56adc50c8e7ec76..a308a41ff4b921367a583ce1f126d01a866d7e2d 100644 (file)
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -88,8 +88,21 @@ public:
      relocate(loc, Relocation{R_NONE, type, 0, 0, nullptr}, val);
    }
  
+  // Rewrites the jump opcode at loc so that it encodes the jump kind `type`.
+  // The default implementation does nothing; a target overrides it if it
+  // records JumpInstrMods.
+  // NOTE(review): the third parameter is named `val` and typed JumpModType
+  // here, while the X86_64 override declares it as `unsigned size` and the
+  // caller passes JumpInstrMod::size -- consider aligning the declarations.
+  virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                                 JumpModType val) const {}
+
    virtual ~TargetInfo();
  
+  // This deletes a jump insn at the end of the section if it is a fall thru to
+  // the next section.  Further, if there is a conditional jump and a direct
+  // jump consecutively, it tries to flip the conditional jump to convert the
+  // direct jump into a fall thru and delete it.  Returns true if a jump
+  // instruction can be deleted.
+  // `is` is the section to inspect, `file` is the file used when resolving
+  // relocation targets and `nextIS` is the section that follows `is` (the
+  // writer passes nullptr for the last section).  The default implementation
+  // deletes nothing and returns false.
+  virtual bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                                     InputSection *nextIS) const {
+    return false;
+  }
+
    unsigned defaultCommonPageSize = 4096;
    unsigned defaultMaxPageSize = 4096;
  
@@ -126,6 +139,10 @@ public:
    // executable OutputSections.
    std::array<uint8_t, 4> trapInstr;
  
+  // Stores the NOP instructions of different sizes for the target and is used
+  // to pad sections that are relaxed.
+  llvm::Optional<std::vector<std::vector<uint8_t>>> nopInstrs;
+
    // If a target needs to rewrite calls to __morestack to instead call
    // __morestack_non_split when a split-stack enabled caller calls a
    // non-split-stack callee this will return true. Otherwise returns false.
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp

index 801ca7a0dc5ef512c9a90e53c7a0c15674f712b4..55df6b7c5e4c17135d9ec78c60c13ac0d2a32e1c 100644 (file)
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -31,6 +31,8 @@
  #include "llvm/Support/xxhash.h"
  #include <climits>
  
+#define DEBUG_TYPE "lld"
+
  using namespace llvm;
  using namespace llvm::ELF;
  using namespace llvm::object;
@@ -57,6 +59,7 @@ private:
    void sortSections();
    void resolveShfLinkOrder();
    void finalizeAddressDependentContent();
+  void optimizeBasicBlockJumps();
    void sortInputSections();
    void finalizeSections();
    void checkExecuteOnly();
@@ -1670,6 +1673,94 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
               Twine(os->alignment) + ")");
  }
  
+// Input sections may have been shrunk (basic block sections) because the jump
+// instructions at their end were relaxed.  For every defined symbol that
+// lives in such a section, adjust its value or size to account for the bytes
+// that were dropped.
+static void fixSymbolsAfterShrinking() {
+  for (InputFile *file : objectFiles) {
+    parallelForEach(file->getSymbols(), [&](Symbol *sym) {
+      auto *def = dyn_cast<Defined>(sym);
+      if (!def)
+        return;
+
+      const SectionBase *sec = def->section;
+      if (!sec)
+        return;
+
+      const InputSectionBase *inputSec = dyn_cast<InputSectionBase>(sec->repl);
+      if (!inputSec || !inputSec->bytesDropped)
+        return;
+
+      const auto dropped = inputSec->bytesDropped;
+      const size_t sizeBefore = inputSec->data().size();
+      const size_t sizeAfter = sizeBefore - dropped;
+
+      // A symbol located past the new end, but within the old contents, is
+      // moved back by the number of dropped bytes.
+      if (def->value > sizeAfter && def->value <= sizeBefore) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Moving symbol " << sym->getName() << " from "
+                   << def->value << " to " << def->value - dropped
+                   << " bytes\n");
+        def->value -= dropped;
+        return;
+      }
+
+      // A symbol whose extent reaches past the new end, while still lying
+      // within the old contents, is shortened by the number of dropped bytes.
+      const auto symEnd = def->value + def->size;
+      if (symEnd > sizeAfter && def->value <= sizeBefore &&
+          symEnd <= sizeBefore) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Shrinking symbol " << sym->getName() << " from "
+                   << def->size << " to " << def->size - dropped
+                   << " bytes\n");
+        def->size -= dropped;
+      }
+    });
+  }
+}
+
+// Relaxation pass for basic block sections: after basic block reordering
+// there are opportunities to delete fall thru jumps and to flip jump
+// instructions.  It is only run when the --optimize-bb-jumps option is used.
+template <class ELFT> void Writer<ELFT>::optimizeBasicBlockJumps() {
+  assert(config->optimizeBBJumps);
+
+  script->assignAddresses();
+  // Visit every output section that holds executable input sections and, for
+  // each of its input sections:
+  //   1. Delete a trailing direct jump that targets the following section,
+  //      as it is not required.
+  //   2. If the section ends in two consecutive jump instructions, check
+  //      whether they can be flipped so that one of them can be deleted.
+  for (OutputSection *os : outputSections) {
+    if ((os->flags & SHF_EXECINSTR) == 0)
+      continue;
+    std::vector<InputSection *> sections = getInputSections(os);
+    // One slot per input section: 1 if a jump was deleted from it, else 0.
+    std::vector<unsigned> result(sections.size());
+    parallelForEachN(0, sections.size(), [&](size_t i) {
+      InputSection &is = *sections[i];
+      InputSection *next = nullptr;
+      if (i + 1 < sections.size())
+        next = sections[i + 1];
+      const bool deleted =
+          target->deleteFallThruJmpInsn(is, is.getFile<ELFT>(), next);
+      result[i] = deleted ? 1 : 0;
+    });
+    size_t numDeleted = std::count(result.begin(), result.end(), 1);
+    if (numDeleted == 0)
+      continue;
+    // Sections shrank, so the addresses have to be computed again.
+    script->assignAddresses();
+    LLVM_DEBUG(llvm::dbgs()
+               << "Removing " << numDeleted << " fall through jumps\n");
+  }
+
+  fixSymbolsAfterShrinking();
+
+  // Finally drop the bytes that were only marked as deleted so far.
+  for (OutputSection *os : outputSections)
+    for (InputSection *is : getInputSections(os))
+      is->trim();
+}
+
  static void finalizeSynthetic(SyntheticSection *sec) {
    if (sec && sec->isNeeded() && sec->getParent())
      sec->finalizeContents();
@@ -1992,6 +2083,12 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
    finalizeSynthetic(in.symTab);
    finalizeSynthetic(in.ppc64LongBranchTarget);
  
+  // Relaxation to delete inter-basic block jumps created by basic block
+  // sections. Run after in.symTab is finalized as optimizeBasicBlockJumps
+  // can relax jump instructions based on symbol offset.
+  if (config->optimizeBBJumps)
+    optimizeBasicBlockJumps();
+
    // Fill other section headers. The dynamic table is finalized
    // at the end because some tags like RELSZ depend on result
    // of finalizing other sections.
diff --git a/lld/test/ELF/bb-sections-and-icf.s b/lld/test/ELF/bb-sections-and-icf.s

new file mode 100644 (file)

index 0000000..bcc9193
--- /dev/null
+++ b/lld/test/ELF/bb-sections-and-icf.s
@@ -0,0 +1,47 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This simple test checks foo is folded into bar with bb sections
+## and the jumps are deleted.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld --optimize-bb-jumps --icf=all %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s
+
+# CHECK:      <foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  je 0x{{[[:xdigit:]]+}} <aa.BB.foo>
+# CHECK-NOT:   jmp
+
+# CHECK:     <a.BB.foo>:
+## Explicitly check that bar is folded and not emitted.
+# CHECK-NOT: <bar>:
+# CHECK-NOT: <a.BB.bar>:
+# CHECK-NOT: <aa.BB.bar>:
+
+.section       .text.bar,"ax",@progbits
+.type  bar,@function
+bar:
+ nopl (%rax)
+ jne   a.BB.bar
+ jmp   aa.BB.bar
+
+.section       .text.a.BB.bar,"ax",@progbits,unique,3
+a.BB.bar:
+ nopl (%rax)
+
+aa.BB.bar:
+ ret
+
+.section       .text.foo,"ax",@progbits
+.type  foo,@function
+foo:
+ nopl (%rax)
+ jne   a.BB.foo
+ jmp   aa.BB.foo
+
+.section       .text.a.BB.foo,"ax",@progbits,unique,2
+a.BB.foo:
+ nopl (%rax)
+
+aa.BB.foo:
+ ret
diff --git a/lld/test/ELF/bb-sections-delete-fallthru.s b/lld/test/ELF/bb-sections-delete-fallthru.s

new file mode 100644 (file)

index 0000000..c8a0e93
--- /dev/null
+++ b/lld/test/ELF/bb-sections-delete-fallthru.s
@@ -0,0 +1,128 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This simple test checks if redundant direct jumps are converted to
+## implicit fallthrus.  The jcc's must be converted to their inverted
+## opcode, for instance jne to je and jmp must be deleted.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld  --optimize-bb-jumps %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s
+
+# CHECK:      <foo>:
+# CHECK-NEXT:  nopl    (%rax)
+# CHECK-NEXT:  jne      0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+
+
+.section       .text,"ax",@progbits
+.type  foo,@function
+foo:
+ nopl (%rax)
+ je    a.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <a.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  je 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+
+.section       .text,"ax",@progbits,unique,3
+a.BB.foo:
+ nopl (%rax)
+ jne   aa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jle 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,4
+aa.BB.foo:
+ nopl (%rax)
+ jg    aaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jl 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,5
+aaa.BB.foo:
+ nopl (%rax)
+ jge   aaaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jae 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,6
+aaaa.BB.foo:
+ nopl (%rax)
+ jb    aaaaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  ja 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,7
+aaaaa.BB.foo:
+ nopl (%rax)
+ jbe   aaaaaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jge 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,8
+aaaaaa.BB.foo:
+ nopl (%rax)
+ jl    aaaaaaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jg 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,9
+aaaaaaa.BB.foo:
+ nopl (%rax)
+ jle   aaaaaaaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaaaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jbe 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,10
+aaaaaaaa.BB.foo:
+ nopl (%rax)
+ ja    aaaaaaaaa.BB.foo
+ jmp   r.BB.foo
+
+# CHECK:      <aaaaaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jb 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section       .text,"ax",@progbits,unique,11
+aaaaaaaaa.BB.foo:
+ nopl (%rax)
+ jae   aaaaaaaaaa.BB.foo
+ jmp   r.BB.foo
+
+.section       .text,"ax",@progbits,unique,20
+aaaaaaaaaa.BB.foo:
+ nopl (%rax)
+
+r.BB.foo:
+ ret
diff --git a/lld/test/ELF/bb-sections-pc32reloc.s b/lld/test/ELF/bb-sections-pc32reloc.s

new file mode 100644 (file)

index 0000000..9631a3c
--- /dev/null
+++ b/lld/test/ELF/bb-sections-pc32reloc.s
@@ -0,0 +1,37 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This simple test checks if redundant direct jumps are converted to
+## implicit fallthrus when PC32 reloc is present.  The jcc's must be converted
+## to their inverted opcode, for instance jne to je and jmp must be deleted.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: llvm-objdump -dr %t.o| FileCheck %s --check-prefix=RELOC
+# RUN: ld.lld  --optimize-bb-jumps %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s
+
+# RELOC:      jmp
+# RELOC-NEXT: R_X86_64_PC32
+
+# CHECK:      <foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jne 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+
+
+.section       .text,"ax",@progbits
+.type  foo,@function
+foo:
+ nopl (%rax)
+ je    a.BB.foo
+# Encode a jmp r.BB.foo insn using a PC32 reloc
+ .byte  0xe9
+ .long  r.BB.foo - . - 4
+
+# CHECK:      <a.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+
+.section       .text,"ax",@progbits,unique,3
+a.BB.foo:
+ nopl (%rax)
+r.BB.foo:
+ ret
author	Sriraman Tallam <tmsriram@google.com>
	Tue, 7 Apr 2020 13:48:18 +0000 (06:48 -0700)
committer	Sriraman Tallam <tmsriram@google.com>
	Tue, 7 Apr 2020 13:55:57 +0000 (06:55 -0700)
lld/ELF/Arch/X86_64.cpp		patch \| blob \| history
lld/ELF/Config.h		patch \| blob \| history
lld/ELF/Driver.cpp		patch \| blob \| history
lld/ELF/InputSection.cpp		patch \| blob \| history
lld/ELF/InputSection.h		patch \| blob \| history
lld/ELF/LTO.cpp		patch \| blob \| history
lld/ELF/Options.td		patch \| blob \| history
lld/ELF/OutputSections.cpp		patch \| blob \| history
lld/ELF/Relocations.h		patch \| blob \| history
lld/ELF/Target.h		patch \| blob \| history
lld/ELF/Writer.cpp		patch \| blob \| history
lld/test/ELF/bb-sections-and-icf.s	[new file with mode: 0644]	patch \| blob
lld/test/ELF/bb-sections-delete-fallthru.s	[new file with mode: 0644]	patch \| blob
lld/test/ELF/bb-sections-pc32reloc.s	[new file with mode: 0644]	patch \| blob