[BOLT] [AArch64] Handle constant islands spanning multiple functions
authorDenis Revunov <revunov.denis@huawei-partners.com>
Tue, 31 May 2022 18:50:59 +0000 (11:50 -0700)
committerRafael Auler <rafaelauler@fb.com>
Tue, 31 May 2022 20:51:35 +0000 (13:51 -0700)
Fix BOLT's constant island mapping when a constant island marked by $d
spans multiple functions. Currently, because BOLT only marks the
constant island in the first function where $d is located, if the next
function contains data at its start, BOLT will miss the data and try
to disassemble it. This patch adds code to explicitly go through all
symbols between $d and $x markers and mark their respective offsets as
data, which stops BOLT from trying to disassemble data. It also adds
MarkerType enum and refactors related functions.

Reviewed By: yota9, rafauler

Differential Revision: https://reviews.llvm.org/D126177

bolt/include/bolt/Core/BinaryContext.h
bolt/include/bolt/Core/BinaryFunction.h
bolt/lib/Core/BinaryContext.cpp
bolt/lib/Core/BinaryFunction.cpp
bolt/lib/Rewrite/RewriteInstance.cpp
bolt/test/AArch64/Inputs/unmarked-data.yaml [new file with mode: 0644]
bolt/test/AArch64/unmarked-data.test [new file with mode: 0644]

index ccd1833..a82a174 100644 (file)
@@ -82,6 +82,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const SegmentInfo &SegInfo) {
   return OS;
 }
 
+// AArch64-specific symbol markers used to delimit code/data in .text.
+enum class MarkerSymType : char {
+  NONE = 0,
+  CODE,
+  DATA,
+};
+
 enum class MemoryContentsType : char {
   UNKNOWN = 0,             /// Unknown contents.
   POSSIBLE_JUMP_TABLE,     /// Possibly a non-PIC jump table.
@@ -662,6 +669,11 @@ public:
            TheTriple->getArch() == llvm::Triple::x86_64;
   }
 
+  // AArch64-specific functions to check if symbol is used to delimit
+  // code/data in .text. Code is marked by $x, data by $d.
+  MarkerSymType getMarkerType(const SymbolRef &Symbol) const;
+  bool isMarker(const SymbolRef &Symbol) const;
+
   /// Iterate over all BinaryData.
   iterator_range<binary_data_const_iterator> getBinaryData() const {
     return make_range(BinaryDataMap.begin(), BinaryDataMap.end());
index 2c3ed9d..b5ba136 100644 (file)
@@ -1948,11 +1948,6 @@ public:
     return ColdLSDASymbol;
   }
 
-  /// True if the symbol is a mapping symbol used in AArch64 to delimit
-  /// data inside code section.
-  bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const;
-  bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const;
-
   void setOutputDataAddress(uint64_t Address) { OutputDataOffset = Address; }
 
   uint64_t getOutputDataAddress() const { return OutputDataOffset; }
index e9ca812..e0a2af5 100644 (file)
@@ -1639,6 +1639,35 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
   }
 }
 
+MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const {
+  // For aarch64, the ABI defines mapping symbols so we identify data in the
+  // code section (see IHI0056B). $x identifies a symbol starting code or the
+  // end of a data chunk inside code, $d indentifies start of data.
+  if (!isAArch64() || ELFSymbolRef(Symbol).getSize())
+    return MarkerSymType::NONE;
+
+  Expected<StringRef> NameOrError = Symbol.getName();
+  Expected<object::SymbolRef::Type> TypeOrError = Symbol.getType();
+
+  if (!TypeOrError || !NameOrError)
+    return MarkerSymType::NONE;
+
+  if (*TypeOrError != SymbolRef::ST_Unknown)
+    return MarkerSymType::NONE;
+
+  if (*NameOrError == "$x" || NameOrError->startswith("$x."))
+    return MarkerSymType::CODE;
+
+  if (*NameOrError == "$d" || NameOrError->startswith("$d."))
+    return MarkerSymType::DATA;
+
+  return MarkerSymType::NONE;
+}
+
+bool BinaryContext::isMarker(const SymbolRef &Symbol) const {
+  return getMarkerType(Symbol) != MarkerSymType::NONE;
+}
+
 void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
                                      uint64_t Offset,
                                      const BinaryFunction *Function,
index 157d5d9..cd822f6 100644 (file)
@@ -3926,33 +3926,6 @@ void BinaryFunction::deleteConservativeEdges() {
   }
 }
 
-bool BinaryFunction::isDataMarker(const SymbolRef &Symbol,
-                                  uint64_t SymbolSize) const {
-  // For aarch64, the ABI defines mapping symbols so we identify data in the
-  // code section (see IHI0056B). $d identifies a symbol starting data contents.
-  if (BC.isAArch64() && Symbol.getType() &&
-      cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
-      Symbol.getName() &&
-      (cantFail(Symbol.getName()) == "$d" ||
-       cantFail(Symbol.getName()).startswith("$d.")))
-    return true;
-  return false;
-}
-
-bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol,
-                                  uint64_t SymbolSize) const {
-  // For aarch64, the ABI defines mapping symbols so we identify data in the
-  // code section (see IHI0056B). $x identifies a symbol starting code or the
-  // end of a data chunk inside code.
-  if (BC.isAArch64() && Symbol.getType() &&
-      cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 &&
-      Symbol.getName() &&
-      (cantFail(Symbol.getName()) == "$x" ||
-       cantFail(Symbol.getName()).startswith("$x.")))
-    return true;
-  return false;
-}
-
 bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
                                           uint64_t SymbolSize) const {
   // If this symbol is in a different section from the one where the
@@ -3963,7 +3936,7 @@ bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
 
   // Some symbols are tolerated inside function bodies, others are not.
   // The real function boundaries may not be known at this point.
-  if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize))
+  if (BC.isMarker(Symbol))
     return true;
 
   // It's okay to have a zero-sized symbol in the middle of non-zero-sized
index 2d3f987..d5dc499 100644 (file)
@@ -880,47 +880,88 @@ void RewriteInstance::discoverFileObjects() {
   std::vector<SymbolRef> SortedFileSymbols;
   std::copy_if(InputFile->symbol_begin(), InputFile->symbol_end(),
                std::back_inserter(SortedFileSymbols), isSymbolInMemory);
+  auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) {
+    // Marker symbols have the highest precedence, while
+    // SECTIONs have the lowest.
+    auto AddressA = cantFail(A.getAddress());
+    auto AddressB = cantFail(B.getAddress());
+    if (AddressA != AddressB)
+      return AddressA < AddressB;
+
+    bool AMarker = BC->isMarker(A);
+    bool BMarker = BC->isMarker(B);
+    if (AMarker || BMarker) {
+      return AMarker && !BMarker;
+    }
 
-  std::stable_sort(
-      SortedFileSymbols.begin(), SortedFileSymbols.end(),
-      [](const SymbolRef &A, const SymbolRef &B) {
-        // FUNC symbols have the highest precedence, while SECTIONs
-        // have the lowest.
-        uint64_t AddressA = cantFail(A.getAddress());
-        uint64_t AddressB = cantFail(B.getAddress());
-        if (AddressA != AddressB)
-          return AddressA < AddressB;
-
-        SymbolRef::Type AType = cantFail(A.getType());
-        SymbolRef::Type BType = cantFail(B.getType());
-        if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function)
-          return true;
-        if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug)
-          return true;
+    auto AType = cantFail(A.getType());
+    auto BType = cantFail(B.getType());
+    if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function)
+      return true;
+    if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug)
+      return true;
 
-        return false;
-      });
+    return false;
+  };
+
+  std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(),
+                   CompareSymbols);
+
+  auto LastSymbol = SortedFileSymbols.end() - 1;
 
   // For aarch64, the ABI defines mapping symbols so we identify data in the
   // code section (see IHI0056B). $d identifies data contents.
-  auto LastSymbol = SortedFileSymbols.end() - 1;
+  // Compilers usually merge multiple data objects in a single $d-$x interval,
+  // but we need every data object to be marked with $d. Because of that we
+  // create a vector of MarkerSyms with all locations of data objects.
+
+  struct MarkerSym {
+    uint64_t Address;
+    MarkerSymType Type;
+  };
+
+  std::vector<MarkerSym> SortedMarkerSymbols;
+  auto addExtraDataMarkerPerSymbol =
+      [this](const std::vector<SymbolRef> &SortedFileSymbols,
+             std::vector<MarkerSym> &SortedMarkerSymbols) {
+        bool IsData = false;
+        uint64_t LastAddr = 0;
+        for (auto Sym = SortedFileSymbols.begin();
+             Sym < SortedFileSymbols.end(); ++Sym) {
+          uint64_t Address = cantFail(Sym->getAddress());
+          if (LastAddr == Address) // don't repeat markers
+            continue;
+
+          MarkerSymType MarkerType = BC->getMarkerType(*Sym);
+          if (MarkerType != MarkerSymType::NONE) {
+            SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType});
+            LastAddr = Address;
+            IsData = MarkerType == MarkerSymType::DATA;
+            continue;
+          }
+
+          if (IsData) {
+            SortedMarkerSymbols.push_back(
+                MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA});
+            LastAddr = Address;
+          }
+        }
+      };
+
   if (BC->isAArch64()) {
+    addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols);
     LastSymbol = std::stable_partition(
         SortedFileSymbols.begin(), SortedFileSymbols.end(),
-        [](const SymbolRef &Symbol) {
-          StringRef Name = cantFail(Symbol.getName());
-          return !(cantFail(Symbol.getType()) == SymbolRef::ST_Unknown &&
-                   (Name == "$d" || Name.startswith("$d.") || Name == "$x" ||
-                    Name.startswith("$x.")));
-        });
+        [this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); });
     --LastSymbol;
   }
 
   BinaryFunction *PreviousFunction = nullptr;
   unsigned AnonymousId = 0;
 
-  const auto MarkersBegin = std::next(LastSymbol);
-  for (auto ISym = SortedFileSymbols.begin(); ISym != MarkersBegin; ++ISym) {
+  const auto SortedSymbolsEnd = std::next(LastSymbol);
+  for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd;
+       ++ISym) {
     const SymbolRef &Symbol = *ISym;
     // Keep undefined symbols for pretty printing?
     if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined)
@@ -1213,25 +1254,24 @@ void RewriteInstance::discoverFileObjects() {
   adjustFunctionBoundaries();
 
   // Annotate functions with code/data markers in AArch64
-  for (auto ISym = MarkersBegin; ISym != SortedFileSymbols.end(); ++ISym) {
-    const SymbolRef &Symbol = *ISym;
-    uint64_t Address =
-        cantFail(Symbol.getAddress(), "cannot get symbol address");
-    uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize();
-    BinaryFunction *BF =
-        BC->getBinaryFunctionContainingAddress(Address, true, true);
+  for (auto ISym = SortedMarkerSymbols.begin();
+       ISym != SortedMarkerSymbols.end(); ++ISym) {
+
+    auto *BF =
+        BC->getBinaryFunctionContainingAddress(ISym->Address, true, true);
+
     if (!BF) {
       // Stray marker
       continue;
     }
-    const uint64_t EntryOffset = Address - BF->getAddress();
-    if (BF->isCodeMarker(Symbol, SymbolSize)) {
+    const auto EntryOffset = ISym->Address - BF->getAddress();
+    if (ISym->Type == MarkerSymType::CODE) {
       BF->markCodeAtOffset(EntryOffset);
       continue;
     }
-    if (BF->isDataMarker(Symbol, SymbolSize)) {
+    if (ISym->Type == MarkerSymType::DATA) {
       BF->markDataAtOffset(EntryOffset);
-      BC->AddressToConstantIslandMap[Address] = BF;
+      BC->AddressToConstantIslandMap[ISym->Address] = BF;
       continue;
     }
     llvm_unreachable("Unknown marker");
diff --git a/bolt/test/AArch64/Inputs/unmarked-data.yaml b/bolt/test/AArch64/Inputs/unmarked-data.yaml
new file mode 100644 (file)
index 0000000..a91b62e
--- /dev/null
@@ -0,0 +1,90 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_AARCH64
+  Entry:           0x210134
+ProgramHeaders:
+  - Type:            PT_PHDR
+    Flags:           [ PF_R ]
+    VAddr:           0x200040
+    Align:           0x8
+    FileSize:        0x0000e0
+    MemSize:         0x0000e0
+    Offset:          0x000040
+  - Type:            PT_LOAD
+    Flags:           [ PF_R ]
+    VAddr:           0x200000
+    Align:           0x10000
+    FileSize:        0x000120
+    MemSize:         0x000120
+    Offset:          0x000000
+  - Type:            PT_LOAD
+    Flags:           [ PF_X, PF_R ]
+    FirstSec:        .text
+    LastSec:         .text
+    VAddr:           0x210120
+    Align:           0x10000
+  - Type:            PT_GNU_STACK
+    Flags:           [ PF_W, PF_R ]
+    Align:           0x0
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x210120
+    AddressAlign:    0x4
+    Content:         030F0B0700000000030F0B0700000000C0035FD6FFFFFF97000080D2A80B8052010000D4
+  - Name:            .rela.text
+    Type:            SHT_RELA
+    Flags:           [ SHF_INFO_LINK ]
+    Link:            .symtab
+    AddressAlign:    0x8
+    Info:            .text
+    Relocations:
+      - Offset:          0x210134
+        Symbol:          dummy
+        Type:            R_AARCH64_CALL26
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x1
+    EntSize:         0x1
+    Content:         4C696E6B65723A204C4C442031352E302E3000
+Symbols:
+  - Name:            val
+    Index:           SHN_ABS
+    Value:           0x70B0F03
+  - Name:            first
+    Section:         .text
+    Value:           0x210120
+    Size:            0x8
+  - Name:            '$d.0'
+    Section:         .text
+    Value:           0x210120
+  - Name:            second
+    Section:         .text
+    Value:           0x210128
+    Size:            0x8
+  - Name:            '$x.1'
+    Section:         .text
+    Value:           0x210130
+  - Name:            .text
+    Type:            STT_SECTION
+    Section:         .text
+    Value:           0x210120
+  - Name:            .comment
+    Type:            STT_SECTION
+    Section:         .comment
+  - Name:            dummy
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x210130
+  - Name:            _start
+    Type:            STT_FUNC
+    Section:         .text
+    Binding:         STB_GLOBAL
+    Value:           0x210134
+...
diff --git a/bolt/test/AArch64/unmarked-data.test b/bolt/test/AArch64/unmarked-data.test
new file mode 100644 (file)
index 0000000..42de589
--- /dev/null
@@ -0,0 +1,34 @@
+// This test checks that multiple data objects in text of which only first is marked get disassembled properly
+
+// RUN: yaml2obj %S/Inputs/unmarked-data.yaml -o %t.exe
+// RUN: llvm-bolt %t.exe -o %t.bolt -lite=0 -use-old-text=0 2>&1 | FileCheck %s
+// CHECK-NOT: BOLT-WARNING
+// RUN: llvm-objdump -j .text -d --disassemble-symbols=first,second %t.bolt | FileCheck %s -check-prefix=CHECK-SYMBOL
+// CHECK-SYMBOL: <first>:
+// CHECK-SYMBOL: <second>:
+
+// YAML is based in the following assembly:
+
+  .equ val, 0x070b0f03  // we use constant that is not a valid instruction so that it can't be silently dissassembled
+  .text
+
+first:
+  .xword val
+  .size first, .-first
+
+second:
+  .xword val
+  .size second, .-second
+
+  .globl dummy
+  .type dummy, %function
+dummy: // dummy function to force relocations
+    ret
+
+  .globl _start
+  .type _start, %function
+_start:
+    bl      dummy
+    mov     x0, #0
+    mov     w8, #93
+    svc     #0