[lld-macho] Have dead-stripping work with literal sections

author Jez Ng <jezng@fb.com>
Fri, 11 Jun 2021 23:49:54 +0000 (19:49 -0400)
committer Jez Ng <jezng@fb.com>
Fri, 11 Jun 2021 23:50:09 +0000 (19:50 -0400)
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp

index 42f4a98..995637d 100644 (file)
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -108,7 +108,7 @@ void CStringInputSection::splitIntoPieces() {
    }
  }
  
-const StringPiece &CStringInputSection::getStringPiece(uint64_t off) const {
+StringPiece &CStringInputSection::getStringPiece(uint64_t off) {
    if (off >= data.size())
      fatal(toString(this) + ": offset is outside the section");
  
@@ -117,6 +117,10 @@ const StringPiece &CStringInputSection::getStringPiece(uint64_t off) const {
    return it[-1];
  }
  
+const StringPiece &CStringInputSection::getStringPiece(uint64_t off) const {
+  return const_cast<CStringInputSection *>(this)->getStringPiece(off);
+}
+
  uint64_t CStringInputSection::getFileOffset(uint64_t off) const {
    return parent->fileOff + getOffset(off);
  }
@@ -132,7 +136,23 @@ WordLiteralInputSection::WordLiteralInputSection(StringRef segname,
                                                   InputFile *file,
                                                   ArrayRef<uint8_t> data,
                                                   uint32_t align, uint32_t flags)
-    : InputSection(WordLiteralKind, segname, name, file, data, align, flags) {}
+    : InputSection(WordLiteralKind, segname, name, file, data, align, flags) {
+  switch (sectionType(flags)) {
+  case S_4BYTE_LITERALS:
+    power2LiteralSize = 2;
+    break;
+  case S_8BYTE_LITERALS:
+    power2LiteralSize = 3;
+    break;
+  case S_16BYTE_LITERALS:
+    power2LiteralSize = 4;
+    break;
+  default:
+    llvm_unreachable("invalid literal section type");
+  }
+
+  live.resize(data.size() >> power2LiteralSize, !config->deadStrip);
+}
  
  uint64_t WordLiteralInputSection::getFileOffset(uint64_t off) const {
    return parent->fileOff + getOffset(off);
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h

index 44ee1f9..b01092e 100644 (file)
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -14,6 +14,7 @@
  
  #include "lld/Common/LLVM.h"
  #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/CachedHashString.h"
  #include "llvm/BinaryFormat/MachO.h"
  
@@ -43,6 +44,7 @@ public:
    uint64_t getVA(uint64_t off) const;
    // Whether the data at \p off in this InputSection is live.
    virtual bool isLive(uint64_t off) const = 0;
+  virtual void markLive(uint64_t off) = 0;
  
    void writeTo(uint8_t *buf);
  
@@ -91,6 +93,7 @@ public:
    uint64_t getVA() const { return InputSection::getVA(0); }
    // ConcatInputSections are entirely live or dead, so the offset is irrelevant.
    bool isLive(uint64_t off) const override { return live; }
+  void markLive(uint64_t off) override { live = true; }
    bool isCoalescedWeak() const { return wasCoalesced && numRefs == 0; }
    bool shouldOmitFromOutput() const { return !live || isCoalescedWeak(); }
  
@@ -112,17 +115,21 @@ public:
  };
  
  // We allocate a lot of these and binary search on them, so they should be as
-// compact as possible. Hence the use of 32 rather than 64 bits for the hash.
+// compact as possible. Hence the use of 31 rather than 64 bits for the hash.
  struct StringPiece {
    // Offset from the start of the containing input section.
    uint32_t inSecOff;
-  uint32_t hash;
+  uint32_t live : 1;
+  uint32_t hash : 31;
    // Offset from the start of the containing output section.
    uint64_t outSecOff = 0;
  
-  StringPiece(uint64_t off, uint32_t hash) : inSecOff(off), hash(hash) {}
+  StringPiece(uint64_t off, uint32_t hash)
+      : inSecOff(off), live(!config->deadStrip), hash(hash) {}
  };
  
+static_assert(sizeof(StringPiece) == 16, "StringPiece is too big!");
+
  // CStringInputSections are composed of multiple null-terminated string
  // literals, which we represent using StringPieces. These literals can be
  // deduplicated and tail-merged, so translating offsets between the input and
@@ -141,9 +148,10 @@ public:
                       flags) {}
    uint64_t getFileOffset(uint64_t off) const override;
    uint64_t getOffset(uint64_t off) const override;
-  // FIXME implement this
-  bool isLive(uint64_t off) const override { return true; }
+  bool isLive(uint64_t off) const override { return getStringPiece(off).live; }
+  void markLive(uint64_t off) override { getStringPiece(off).live = true; }
    // Find the StringPiece that contains this offset.
+  StringPiece &getStringPiece(uint64_t off);
    const StringPiece &getStringPiece(uint64_t off) const;
    // Split at each null byte.
    void splitIntoPieces();
@@ -172,12 +180,19 @@ public:
                            uint32_t flags);
    uint64_t getFileOffset(uint64_t off) const override;
    uint64_t getOffset(uint64_t off) const override;
-  // FIXME implement this
-  bool isLive(uint64_t off) const override { return true; }
+  bool isLive(uint64_t off) const override {
+    return live[off >> power2LiteralSize];
+  }
+  void markLive(uint64_t off) override { live[off >> power2LiteralSize] = 1; }
  
    static bool classof(const InputSection *isec) {
      return isec->kind() == WordLiteralKind;
    }
+
+private:
+  unsigned power2LiteralSize;
+  // The liveness of data[off] is tracked by live[off >> power2LiteralSize].
+  llvm::BitVector live;
  };
  
  inline uint8_t sectionType(uint32_t flags) {
diff --git a/lld/MachO/MarkLive.cpp b/lld/MachO/MarkLive.cpp

index 73bd215..a63f57d 100644 (file)
--- a/lld/MachO/MarkLive.cpp
+++ b/lld/MachO/MarkLive.cpp
@@ -34,12 +34,12 @@ void markLive() {
    // store ConcatInputSections in our worklist.
    SmallVector<ConcatInputSection *, 256> worklist;
  
-  auto enqueue = [&](InputSection *isec) {
+  auto enqueue = [&](InputSection *isec, uint64_t off) {
+    if (isec->isLive(off))
+      return;
+    isec->markLive(off);
      if (auto s = dyn_cast<ConcatInputSection>(isec)) {
        assert(!s->isCoalescedWeak());
-      if (s->live)
-        return;
-      s->live = true;
        worklist.push_back(s);
      }
    };
@@ -48,7 +48,7 @@ void markLive() {
      s->used = true;
      if (auto *d = dyn_cast<Defined>(s))
        if (d->isec)
-        enqueue(d->isec);
+        enqueue(d->isec, d->value);
    };
  
    // Add GC roots.
@@ -104,14 +104,16 @@ void markLive() {
    for (InputSection *isec : inputSections) {
      // Sections marked no_dead_strip
      if (isec->flags & S_ATTR_NO_DEAD_STRIP) {
-      enqueue(isec);
+      assert(isa<ConcatInputSection>(isec));
+      enqueue(isec, 0);
        continue;
      }
  
      // mod_init_funcs, mod_term_funcs sections
      if (sectionType(isec->flags) == S_MOD_INIT_FUNC_POINTERS ||
          sectionType(isec->flags) == S_MOD_TERM_FUNC_POINTERS) {
-      enqueue(isec);
+      assert(isa<ConcatInputSection>(isec));
+      enqueue(isec, 0);
        continue;
      }
  
@@ -138,7 +140,7 @@ void markLive() {
          if (auto *s = r.referent.dyn_cast<Symbol *>())
            addSym(s);
          else
-          enqueue(r.referent.get<InputSection *>());
+          enqueue(r.referent.get<InputSection *>(), r.addend);
        }
        continue;
      }
@@ -155,7 +157,7 @@ void markLive() {
          if (auto *s = r.referent.dyn_cast<Symbol *>())
            addSym(s);
          else
-          enqueue(r.referent.get<InputSection *>());
+          enqueue(r.referent.get<InputSection *>(), r.addend);
        }
      }
  
@@ -177,7 +179,7 @@ void markLive() {
          else
            referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
          if (referentLive)
-          enqueue(isec);
+          enqueue(isec, 0);
        }
      }
  
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp

index 9183f18..235389c 100644 (file)
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1121,7 +1121,8 @@ void CStringSection::finalize() {
    // contents.
    for (const CStringInputSection *isec : inputs)
      for (size_t i = 0, e = isec->pieces.size(); i != e; ++i)
-      builder.add(isec->getCachedHashStringRef(i));
+      if (isec->pieces[i].live)
+        builder.add(isec->getCachedHashStringRef(i));
  
    // Fix the string table content. After this, the contents will never change.
    builder.finalizeInOrder();
@@ -1131,6 +1132,8 @@ void CStringSection::finalize() {
    // to a corresponding SectionPiece for easy access.
    for (CStringInputSection *isec : inputs) {
      for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) {
+      if (!isec->pieces[i].live)
+        continue;
        isec->pieces[i].outSecOff =
            builder.getOffset(isec->getCachedHashStringRef(i));
        isec->isFinal = true;
@@ -1155,22 +1158,28 @@ void WordLiteralSection::addInput(WordLiteralInputSection *isec) {
    const uint8_t *buf = isec->data.data();
    switch (sectionType(isec->flags)) {
    case S_4BYTE_LITERALS: {
-    for (size_t i = 0, e = isec->data.size() / 4; i < e; ++i) {
-      uint32_t value = *reinterpret_cast<const uint32_t *>(buf + i * 4);
+    for (size_t off = 0, e = isec->data.size(); off < e; off += 4) {
+      if (!isec->isLive(off))
+        continue;
+      uint32_t value = *reinterpret_cast<const uint32_t *>(buf + off);
        literal4Map.emplace(value, literal4Map.size());
      }
      break;
    }
    case S_8BYTE_LITERALS: {
-    for (size_t i = 0, e = isec->data.size() / 8; i < e; ++i) {
-      uint64_t value = *reinterpret_cast<const uint64_t *>(buf + i * 8);
+    for (size_t off = 0, e = isec->data.size(); off < e; off += 8) {
+      if (!isec->isLive(off))
+        continue;
+      uint64_t value = *reinterpret_cast<const uint64_t *>(buf + off);
        literal8Map.emplace(value, literal8Map.size());
      }
      break;
    }
    case S_16BYTE_LITERALS: {
-    for (size_t i = 0, e = isec->data.size() / 16; i < e; ++i) {
-      UInt128 value = *reinterpret_cast<const UInt128 *>(buf + i * 16);
+    for (size_t off = 0, e = isec->data.size(); off < e; off += 16) {
+      if (!isec->isLive(off))
+        continue;
+      UInt128 value = *reinterpret_cast<const UInt128 *>(buf + off);
        literal16Map.emplace(value, literal16Map.size());
      }
      break;
diff --git a/lld/test/MachO/dead-strip.s b/lld/test/MachO/dead-strip.s

index 6b0acbb..e64c95e 100644 (file)
--- a/lld/test/MachO/dead-strip.s
+++ b/lld/test/MachO/dead-strip.s
@@ -253,6 +253,20 @@
  # EXECSTABS:     N_FUN {{.*}} '_main'
  # EXECSTABS-NOT: N_FUN {{.*}} '_unref'
  
+# RUN: llvm-mc -g -filetype=obj -triple=x86_64-apple-macos \
+# RUN:     %t/literals.s -o %t/literals.o
+# RUN: %lld -dylib -dead_strip --deduplicate-literals %t/literals.o -o %t/literals
+# RUN: llvm-objdump --macho --section="__TEXT,__cstring" --section="__DATA,str_ptrs" \
+# RUN:   --section="__TEXT,__literals" %t/literals | FileCheck %s --check-prefix=LIT
+
+# LIT:      Contents of (__TEXT,__cstring) section
+# LIT-NEXT: foobar
+# LIT-NEXT: Contents of (__DATA,str_ptrs) section
+# LIT-NEXT: __TEXT:__cstring:bar
+# LIT-NEXT: __TEXT:__cstring:bar
+# LIT-NEXT: Contents of (__TEXT,__literals) section
+# LIT-NEXT: ef be ad de {{$}}
+
  #--- basics.s
  .comm _ref_com, 1
  .comm _unref_com, 1
@@ -736,3 +750,39 @@ _main:
    retq
  
  .subsections_via_symbols
+
+#--- literals.s
+.cstring
+_unref_foo:
+  .ascii "foo"
+_bar:
+Lbar:
+  .asciz "bar"
+_unref_baz:
+  .asciz "baz"
+
+.literal4
+.p2align 2
+L._foo4:
+  .long 0xdeadbeef
+L._bar4:
+  .long 0xdeadbeef
+L._unref:
+  .long 0xfeedface
+
+.section __DATA,str_ptrs,literal_pointers
+.globl _data
+_data:
+  .quad _bar
+  .quad Lbar
+
+## The output binary has these integer literals put into a section that isn't
+## marked with a S_*BYTE_LITERALS flag, so we don't mark word_ptrs with the
+## S_LITERAL_POINTERS flag in order not to confuse llvm-objdump.
+.section __DATA,word_ptrs
+.globl _more_data
+_more_data:
+  .quad L._foo4
+  .quad L._bar4
+
+.subsections_via_symbols
author	Jez Ng <jezng@fb.com>
	Fri, 11 Jun 2021 23:49:54 +0000 (19:49 -0400)
committer	Jez Ng <jezng@fb.com>
	Fri, 11 Jun 2021 23:50:09 +0000 (19:50 -0400)
lld/MachO/InputSection.cpp		patch | blob | history
lld/MachO/InputSection.h		patch | blob | history
lld/MachO/MarkLive.cpp		patch | blob | history
lld/MachO/SyntheticSections.cpp		patch | blob | history
lld/test/MachO/dead-strip.s		patch | blob | history