[lld-macho] Emit STABS symbols for debugging, and drop debug sections
authorJez Ng <jezng@fb.com>
Tue, 1 Dec 2020 22:45:01 +0000 (14:45 -0800)
committerJez Ng <jezng@fb.com>
Tue, 1 Dec 2020 23:05:20 +0000 (15:05 -0800)
Debug sections contain a large amount of data. In order not to bloat the size
of the final binary, we remove them and instead emit STABS symbols for
`dsymutil` and the debugger to locate their contents in the object files.

With this diff, `dsymutil` is able to locate the debug info. However, we need
a few more features before `lldb` is able to work well with our binaries --
e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols,
emitting `LC_UUID`, and more. Those will be handled in follow-up diffs.

Note also that the STABS we emit differ slightly from what ld64 does. First, we
emit the path to the source file as one `N_SO` symbol instead of two. (`ld64`
emits one `N_SO` for the dirname and one for the basename.) Second, we do not
emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions,
because the `N_FUN` STABS already serve that purpose. @clayborg recommended
these changes based on his knowledge of what the debugging tools look for.

Additionally, this current implementation doesn't accurately reflect the size
of function symbols. It uses the size of their containing sections as a proxy,
but that is only accurate if `.subsections_via_symbols` is set, and if there
isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two
options to solve this:

1. We can split up subsections by symbol even if `.subsections_via_symbols`
   is not set, but include constraints to ensure those subsections retain
   their order in the final output. This is `ld64`'s approach.
2. We could just add a `size` field to our `Symbol` class. This seems simpler,
   and I'm more inclined toward it, but I'm not sure if there are use cases
   that it doesn't handle well. As such I'm punting on the decision for now.

Reviewed By: clayborg

Differential Revision: https://reviews.llvm.org/D89257

lld/MachO/CMakeLists.txt
lld/MachO/Dwarf.cpp [new file with mode: 0644]
lld/MachO/Dwarf.h [new file with mode: 0644]
lld/MachO/InputFiles.cpp
lld/MachO/InputFiles.h
lld/MachO/InputSection.h
lld/MachO/OutputSegment.h
lld/MachO/SyntheticSections.cpp
lld/MachO/SyntheticSections.h
lld/MachO/Writer.cpp
lld/test/MachO/stabs.s [new file with mode: 0644]

index 6ddc88f..6a8b5d3 100644 (file)
@@ -9,6 +9,7 @@ add_lld_library(lldMachO2
   UnwindInfoSection.cpp
   Driver.cpp
   DriverUtils.cpp
+  Dwarf.cpp
   ExportTrie.cpp
   InputFiles.cpp
   InputSection.cpp
diff --git a/lld/MachO/Dwarf.cpp b/lld/MachO/Dwarf.cpp
new file mode 100644 (file)
index 0000000..121f54f
--- /dev/null
@@ -0,0 +1,49 @@
+//===- DWARF.cpp ----------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Dwarf.h"
+#include "InputFiles.h"
+#include "InputSection.h"
+#include "OutputSegment.h"
+
+#include <memory>
+
+using namespace lld;
+using namespace lld::macho;
+using namespace llvm;
+
+std::unique_ptr<DwarfObject> DwarfObject::create(ObjFile *obj) {
+  auto dObj = std::make_unique<DwarfObject>();
+  bool hasDwarfInfo = false;
+  for (SubsectionMap subsecMap : obj->subsections) {
+    for (auto it : subsecMap) {
+      InputSection *isec = it.second;
+      if (!(isDebugSection(isec->flags) &&
+            isec->segname == segment_names::dwarf))
+        continue;
+
+      if (isec->name == "__debug_info") {
+        dObj->infoSection.Data = toStringRef(isec->data);
+        hasDwarfInfo = true;
+        continue;
+      }
+
+      if (StringRef *s = StringSwitch<StringRef *>(isec->name)
+                             .Case("__debug_abbrev", &dObj->abbrevSection)
+                             .Case("__debug_str", &dObj->strSection)
+                             .Default(nullptr)) {
+        *s = toStringRef(isec->data);
+        hasDwarfInfo = true;
+      }
+    }
+  }
+
+  if (hasDwarfInfo)
+    return dObj;
+  return nullptr;
+}
diff --git a/lld/MachO/Dwarf.h b/lld/MachO/Dwarf.h
new file mode 100644 (file)
index 0000000..119f277
--- /dev/null
@@ -0,0 +1,53 @@
+//===- DWARF.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------===//
+
+#ifndef LLD_MACHO_DWARF_H
+#define LLD_MACHO_DWARF_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFObject.h"
+
+namespace lld {
+namespace macho {
+
+class ObjFile;
+
+// Implements the interface between LLVM's DWARF-parsing utilities and LLD's
+// InputSection structures.
+class DwarfObject final : public llvm::DWARFObject {
+public:
+  // Always reports little-endian.
+  bool isLittleEndian() const override { return true; }
+
+  // Relocation lookup is not implemented yet: every query returns None.
+  llvm::Optional<llvm::RelocAddrEntry> find(const llvm::DWARFSection &sec,
+                                            uint64_t pos) const override {
+    // TODO: implement this
+    return llvm::None;
+  }
+
+  // Invokes `f` exactly once, with the single info section held by this
+  // object.
+  void forEachInfoSections(
+      llvm::function_ref<void(const llvm::DWARFSection &)> f) const override {
+    f(infoSection);
+  }
+
+  // Accessors for the section contents captured by create(). They return
+  // empty StringRefs if create() did not find the corresponding section.
+  llvm::StringRef getAbbrevSection() const override { return abbrevSection; }
+  llvm::StringRef getStrSection() const override { return strSection; }
+
+  // Returns an instance of DwarfObject if the given object file has the
+  // relevant DWARF debug sections.
+  static std::unique_ptr<DwarfObject> create(ObjFile *);
+
+private:
+  // Contents of __debug_info (create() fills in its Data member).
+  llvm::DWARFSection infoSection;
+  // Contents of __debug_abbrev.
+  llvm::StringRef abbrevSection;
+  // Contents of __debug_str.
+  llvm::StringRef strSection;
+};
+
+} // namespace macho
+} // namespace lld
+
+#endif
index 2f65951..921b699 100644 (file)
@@ -44,6 +44,7 @@
 #include "InputFiles.h"
 #include "Config.h"
 #include "Driver.h"
+#include "Dwarf.h"
 #include "ExportTrie.h"
 #include "InputSection.h"
 #include "MachOStructs.h"
@@ -54,6 +55,7 @@
 #include "Symbols.h"
 #include "Target.h"
 
+#include "lld/Common/DWARF.h"
 #include "lld/Common/ErrorHandler.h"
 #include "lld/Common/Memory.h"
 #include "lld/Common/Reproduce.h"
@@ -387,6 +389,28 @@ ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) {
   // parsed all the symbols.
   for (size_t i = 0, n = subsections.size(); i < n; ++i)
     parseRelocations(sectionHeaders[i], subsections[i]);
+
+  parseDebugInfo();
+}
+
+void ObjFile::parseDebugInfo() {
+  std::unique_ptr<DwarfObject> dObj = DwarfObject::create(this);
+  if (!dObj)
+    return;
+
+  auto *ctx = make<DWARFContext>(
+      std::move(dObj), "",
+      [&](Error err) { warn(getName() + ": " + toString(std::move(err))); },
+      [&](Error warning) {
+        warn(getName() + ": " + toString(std::move(warning)));
+      });
+
+  // TODO: Since object files can contain a lot of DWARF info, we should verify
+  // that we are parsing just the info we need
+  const DWARFContext::compile_unit_range &units = ctx->compile_units();
+  auto it = units.begin();
+  compileUnit = it->get();
+  assert(std::next(it) == units.end());
 }
 
 // The path can point to either a dylib or a .tbd file.
index a1405aa..4356350 100644 (file)
@@ -15,6 +15,7 @@
 #include "lld/Common/Memory.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/TextAPI/MachO/InterfaceFile.h"
@@ -91,6 +92,11 @@ class ObjFile : public InputFile {
 public:
   explicit ObjFile(MemoryBufferRef mb);
   static bool classof(const InputFile *f) { return f->kind() == ObjKind; }
+
+  llvm::DWARFUnit *compileUnit = nullptr;
+
+private:
+  void parseDebugInfo();
 };
 
 // command-line -sectcreate file
index 1449c87..4ef8d84 100644 (file)
@@ -35,15 +35,20 @@ struct Reloc {
   llvm::PointerUnion<Symbol *, InputSection *> referent;
 };
 
-inline bool isZeroFill(uint8_t flags) {
+inline bool isZeroFill(uint32_t flags) {
   return llvm::MachO::isVirtualSection(flags & llvm::MachO::SECTION_TYPE);
 }
 
-inline bool isThreadLocalVariables(uint8_t flags) {
+inline bool isThreadLocalVariables(uint32_t flags) {
   return (flags & llvm::MachO::SECTION_TYPE) ==
          llvm::MachO::S_THREAD_LOCAL_VARIABLES;
 }
 
+// Returns true when S_ATTR_DEBUG is the only bit set in `flags` within the
+// SECTION_ATTRIBUTES_USR mask.
+inline bool isDebugSection(uint32_t flags) {
+  const uint32_t userAttrs = flags & llvm::MachO::SECTION_ATTRIBUTES_USR;
+  return userAttrs == llvm::MachO::S_ATTR_DEBUG;
+}
+
 class InputSection {
 public:
   virtual ~InputSection() = default;
index 6234237..63b62d5 100644 (file)
@@ -23,6 +23,7 @@ constexpr const char data[] = "__DATA";
 constexpr const char linkEdit[] = "__LINKEDIT";
 constexpr const char dataConst[] = "__DATA_CONST";
 constexpr const char ld[] = "__LD"; // output only with -r
+constexpr const char dwarf[] = "__DWARF";
 
 } // namespace segment_names
 
index 5d603d6..feaa4b5 100644 (file)
@@ -20,7 +20,9 @@
 #include "lld/Common/ErrorHandler.h"
 #include "lld/Common/Memory.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/Path.h"
 
 using namespace llvm;
 using namespace llvm::support;
@@ -574,17 +576,100 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection)
       stringTableSection(stringTableSection) {}
 
 uint64_t SymtabSection::getRawSize() const {
-  return symbols.size() * sizeof(structs::nlist_64);
+  return getNumSymbols() * sizeof(structs::nlist_64);
+}
+
+// Queues the N_SO stab that opens a compile unit's group of stabs. Its string
+// is the unit's compilation directory joined with the unit DIE's short name.
+void SymtabSection::emitBeginSourceStab(DWARFUnit *compileUnit) {
+  SmallString<261> dir(compileUnit->getCompilationDir());
+  // Deliberately not `path::append`: an empty `dir` must still produce an
+  // absolute path, whereas `append` would yield a relative one.
+  StringRef sep = sys::path::get_separator();
+  if (!dir.endswith(sep))
+    dir += sep;
+
+  StabsEntry entry(MachO::N_SO);
+  entry.strx = stringTableSection.addString(
+      saver.save(dir + compileUnit->getUnitDIE().getShortName()));
+  stabs.push_back(entry);
+}
+
+// Queues the N_SO stab that closes the currently open group: it carries no
+// string (strx keeps its default of 0) and has its section field set to 1.
+void SymtabSection::emitEndSourceStab() {
+  stabs.emplace_back(MachO::N_SO);
+  stabs.back().sect = 1;
+}
+
+// Queues an N_OSO stab whose string is the absolute path of `file`. Aborts the
+// link if the path cannot be made absolute.
+void SymtabSection::emitObjectFileStab(ObjFile *file) {
+  SmallString<261> absPath(file->getName());
+  if (std::error_code ec = sys::fs::make_absolute(absPath))
+    fatal("failed to get absolute path for " + file->getName());
+
+  StabsEntry entry(MachO::N_OSO);
+  entry.sect = target->cpuSubtype;
+  entry.desc = 1;
+  entry.strx = stringTableSection.addString(saver.save(absPath.str()));
+  stabs.push_back(entry);
+}
+
+// Queues the pair of N_FUN stabs that describe one function: the first
+// carries the symbol's name and address, the second (nameless) one carries
+// its size.
+void SymtabSection::emitFunStabs(Defined *defined) {
+  StabsEntry nameStab(MachO::N_FUN);
+  nameStab.sect = 1;
+  nameStab.strx = stringTableSection.addString(defined->getName());
+  nameStab.value = defined->getVA();
+  stabs.push_back(nameStab);
+
+  StabsEntry sizeStab(MachO::N_FUN);
+  // FIXME this should be the size of the symbol. Using the section size in
+  // lieu is only correct if .subsections_via_symbols is set.
+  sizeStab.value = defined->isec->getSize();
+  stabs.push_back(sizeStab);
 }
 
 void SymtabSection::finalizeContents() {
-  // TODO support other symbol types
+  InputFile *lastFile = nullptr;
   for (Symbol *sym : symtab->getSymbols()) {
+    // TODO support other symbol types
     if (isa<Defined>(sym) || sym->isInGot() || sym->isInStubs()) {
       sym->symtabIndex = symbols.size();
       symbols.push_back({sym, stringTableSection.addString(sym->getName())});
     }
+
+    // Emit STABS symbols so that dsymutil and/or the debugger can map address
+    // regions in the final binary to the source and object files from which
+    // they originated.
+    if (auto *defined = dyn_cast<Defined>(sym)) {
+      if (defined->isAbsolute())
+        continue;
+
+      InputSection *isec = defined->isec;
+      // XXX is it right to assume that all symbols in __text are function
+      // symbols?
+      if (isec->name == "__text") {
+        ObjFile *file = dyn_cast<ObjFile>(isec->file);
+        assert(file);
+        if (!file->compileUnit)
+          continue;
+
+        if (lastFile == nullptr || lastFile != file) {
+          if (lastFile != nullptr)
+            emitEndSourceStab();
+          lastFile = file;
+
+          emitBeginSourceStab(file->compileUnit);
+          emitObjectFileStab(file);
+        }
+        emitFunStabs(defined);
+      }
+      // TODO emit stabs for non-function symbols too
+    }
   }
+
+  if (!stabs.empty())
+    emitEndSourceStab();
 }
 
 void SymtabSection::writeTo(uint8_t *buf) const {
@@ -602,12 +687,23 @@ void SymtabSection::writeTo(uint8_t *buf) const {
         nList->n_type = MachO::N_EXT | MachO::N_SECT;
         nList->n_sect = defined->isec->parent->index;
         // For the N_SECT symbol type, n_value is the address of the symbol
-        nList->n_value = defined->value + defined->isec->getVA();
+        nList->n_value = defined->getVA();
       }
       nList->n_desc |= defined->isWeakDef() ? MachO::N_WEAK_DEF : 0;
     }
     ++nList;
   }
+
+  // Emit the stabs entries after the "real" symbols. We cannot emit them
+  // before as that would render Symbol::symtabIndex inaccurate.
+  for (const StabsEntry &entry : stabs) {
+    nList->n_strx = entry.strx;
+    nList->n_type = entry.type;
+    nList->n_sect = entry.sect;
+    nList->n_desc = entry.desc;
+    nList->n_value = entry.value;
+    ++nList;
+  }
 }
 
 IndirectSymtabSection::IndirectSymtabSection()
@@ -656,7 +752,7 @@ StringTableSection::StringTableSection()
 
 uint32_t StringTableSection::addString(StringRef str) {
   uint32_t strx = size;
-  strings.push_back(str);
+  strings.push_back(str); // TODO: consider deduplicating strings
   size += str.size() + 1; // account for null terminator
   return strx;
 }
index 1736b6a..4a8820a 100644 (file)
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
+class DWARFUnit;
+} // namespace llvm
+
 namespace lld {
 namespace macho {
 
@@ -48,6 +52,7 @@ constexpr const char ehFrame[] = "__eh_frame";
 class Defined;
 class DylibSymbol;
 class LoadCommand;
+class ObjFile;
 
 class SyntheticSection : public OutputSection {
 public:
@@ -405,16 +410,32 @@ struct SymtabEntry {
   size_t strx;
 };
 
+// One STABS symbol queued for emission. SymtabSection::writeTo copies each
+// field into the same-named n_* field of an output symbol table entry.
+struct StabsEntry {
+  // Stab kind (e.g. N_SO, N_OSO, N_FUN); written to n_type.
+  uint8_t type;
+  // Offset of the stab's string as returned by StringTableSection::addString;
+  // written to n_strx.
+  uint32_t strx = 0;
+  // Written to n_sect.
+  uint8_t sect = 0;
+  // Written to n_desc.
+  uint16_t desc = 0;
+  // Written to n_value.
+  uint64_t value = 0;
+
+  explicit StabsEntry(uint8_t type) : type(type) {}
+};
+
 class SymtabSection : public LinkEditSection {
 public:
   SymtabSection(StringTableSection &);
   void finalizeContents();
-  size_t getNumSymbols() const { return symbols.size(); }
+  size_t getNumSymbols() const { return stabs.size() + symbols.size(); }
   uint64_t getRawSize() const override;
   void writeTo(uint8_t *buf) const override;
 
 private:
+  void emitBeginSourceStab(llvm::DWARFUnit *compileUnit);
+  void emitEndSourceStab();
+  void emitObjectFileStab(ObjFile *);
+  void emitFunStabs(Defined *);
+
   StringTableSection &stringTableSection;
+  std::vector<StabsEntry> stabs;
   std::vector<SymtabEntry> symbols;
 };
 
index fa42c1c..e9d88af 100644 (file)
@@ -578,6 +578,10 @@ void Writer::createOutputSections() {
   MapVector<std::pair<StringRef, StringRef>, MergedOutputSection *>
       mergedOutputSections;
   for (InputSection *isec : inputSections) {
+    // Instead of emitting DWARF sections, we emit STABS symbols to the object
+    // files that contain them.
+    if (isDebugSection(isec->flags) && isec->segname == segment_names::dwarf)
+      continue;
     MergedOutputSection *&osec =
         mergedOutputSections[{isec->segname, isec->name}];
     if (osec == nullptr)
@@ -591,8 +595,9 @@ void Writer::createOutputSections() {
     if (unwindInfoSection && segname == segment_names::ld) {
       assert(osec->name == section_names::compactUnwind);
       unwindInfoSection->setCompactUnwindSection(osec);
-    } else
+    } else {
       getOrCreateOutputSegment(segname)->addOutputSection(osec);
+    }
   }
 
   for (SyntheticSection *ssec : syntheticSections) {
diff --git a/lld/test/MachO/stabs.s b/lld/test/MachO/stabs.s
new file mode 100644 (file)
index 0000000..5e85ccc
--- /dev/null
@@ -0,0 +1,114 @@
+# REQUIRES: x86
+# UNSUPPORTED: system-windows
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo.s -o %t/foo.o
+
+# RUN: %lld -lSystem %t/test.o %t/foo.o -o %t/test
+# RUN: llvm-nm -pa %t/test | FileCheck %s -DDIR=%t
+
+## Check that we emit absolute paths to the object files in our OSO entries
+## even if our inputs are relative paths.
+# RUN: cd %t && %lld -lSystem test.o foo.o -o test
+# RUN: llvm-nm -pa %t/test | FileCheck %s -DDIR=%t
+
+# CHECK-DAG:  [[#%x, MAIN:]]   T _main
+# CHECK-DAG:  [[#%x, FOO: ]]   T _foo
+# CHECK:      0000000000000000 - 00 0000    SO /tmp/test.cpp
+# CHECK-NEXT: 0000000000000000 - 03 0001   OSO [[DIR]]/test.o
+# CHECK-NEXT: [[#MAIN]]        - 01 0000   FUN _main
+# CHECK-NEXT: 0000000000000001 - 00 0000   FUN
+# CHECK-NEXT: 0000000000000000 - 01 0000    SO
+# CHECK-NEXT: 0000000000000000 - 00 0000    SO /foo.cpp
+# CHECK-NEXT: 0000000000000000 - 03 0001   OSO [[DIR]]/foo.o
+# CHECK-NEXT: [[#FOO]]         - 01 0000   FUN _foo
+# CHECK-NEXT: 0000000000000001 - 00 0000   FUN
+# CHECK-NEXT: 0000000000000000 - 01 0000    SO
+
+#--- test.s
+.text
+.globl  _main
+_main:
+Lfunc_begin0:
+  retq
+Lfunc_end0:
+
+.section  __DWARF,__debug_str,regular,debug
+  .asciz  "test.cpp"             ## string offset=0
+  .asciz  "/tmp"                 ## string offset=9
+.section  __DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+  .byte  1                       ## Abbreviation Code
+  .byte  17                      ## DW_TAG_compile_unit
+  .byte  1                       ## DW_CHILDREN_yes
+  .byte  3                       ## DW_AT_name
+  .byte  14                      ## DW_FORM_strp
+  .byte  27                      ## DW_AT_comp_dir
+  .byte  14                      ## DW_FORM_strp
+  .byte  17                      ## DW_AT_low_pc
+  .byte  1                       ## DW_FORM_addr
+  .byte  18                      ## DW_AT_high_pc
+  .byte  6                       ## DW_FORM_data4
+  .byte  0                       ## EOM(1)
+.section  __DWARF,__debug_info,regular,debug
+.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit
+  .long  Lset0
+Ldebug_info_start0:
+  .short  4                       ## DWARF version number
+.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section
+  .long  Lset1
+  .byte  8                       ## Address Size (in bytes)
+  .byte  1                       ## Abbrev [1] 0xb:0x48 DW_TAG_compile_unit
+  .long  0                       ## DW_AT_name
+  .long  9                       ## DW_AT_comp_dir
+  .quad  Lfunc_begin0            ## DW_AT_low_pc
+.set Lset3, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
+  .long  Lset3
+  .byte  0                       ## End Of Children Mark
+Ldebug_info_end0:
+.subsections_via_symbols
+.section  __DWARF,__debug_line,regular,debug
+
+#--- foo.s
+.text
+.globl  _foo
+_foo:
+Lfunc_begin0:
+  retq
+Lfunc_end0:
+
+.section  __DWARF,__debug_str,regular,debug
+  .asciz  "foo.cpp"              ## string offset=0
+  .asciz  ""                     ## string offset=8
+.section  __DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+  .byte  1                       ## Abbreviation Code
+  .byte  17                      ## DW_TAG_compile_unit
+  .byte  1                       ## DW_CHILDREN_yes
+  .byte  3                       ## DW_AT_name
+  .byte  14                      ## DW_FORM_strp
+  .byte  27                      ## DW_AT_comp_dir
+  .byte  14                      ## DW_FORM_strp
+  .byte  17                      ## DW_AT_low_pc
+  .byte  1                       ## DW_FORM_addr
+  .byte  18                      ## DW_AT_high_pc
+  .byte  6                       ## DW_FORM_data4
+  .byte  0                       ## EOM(1)
+.section  __DWARF,__debug_info,regular,debug
+.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit
+  .long  Lset0
+Ldebug_info_start0:
+  .short  4                       ## DWARF version number
+.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section
+  .long  Lset1
+  .byte  8                       ## Address Size (in bytes)
+  .byte  1                       ## Abbrev [1] 0xb:0x48 DW_TAG_compile_unit
+  .long  0                       ## DW_AT_name
+  .long  8                       ## DW_AT_comp_dir
+  .quad  Lfunc_begin0            ## DW_AT_low_pc
+.set Lset3, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
+  .long  Lset3
+  .byte  0                       ## End Of Children Mark
+Ldebug_info_end0:
+.subsections_via_symbols
+.section  __DWARF,__debug_line,regular,debug