From 5d88f2dd947825750d487f91b3b3735d61f7aa8e Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Fri, 11 Jun 2021 19:49:50 -0400 Subject: [PATCH] [lld-macho] Deduplicate fixed-width literals Conceptually, the implementation is pretty straightforward: we put each literal value into a hashtable, and then write out the keys of that hashtable at the end. In contrast with ELF, the Mach-O format does not support variable-length literals that aren't strings. Its literals are either 4, 8, or 16 bytes in length. LLD-ELF dedups its literals via sorting + uniq'ing, but since we don't need to worry about overly-long values, we should be able to do a faster job by just hashing. That said, the implementation right now is far from optimal, because we add to those hashtables serially. To parallelize this, we'll need a basic concurrent hashtable (only needs to support concurrent writes w/o interleave reads), which shouldn't be to hard to implement, but I'd like to punt on it for now. Numbers for linking chromium_framework on my 3.2 GHz 16-Core Intel Xeon W: N Min Max Median Avg Stddev x 20 4.27 4.39 4.315 4.3225 0.033225703 + 20 4.36 4.82 4.44 4.4845 0.13152846 Difference at 95.0% confidence 0.162 +/- 0.0613971 3.74783% +/- 1.42041% (Student's t, pooled s = 0.0959262) This corresponds to binary size savings of 2MB out of 335MB, or 0.6%. It's not a great tradeoff as-is, but as mentioned our implementation can be signficantly optimized, and literal dedup will unlock more opportunities for ICF to identify identical structures that reference the same literals. Reviewed By: #lld-macho, gkm Differential Revision: https://reviews.llvm.org/D103113 --- lld/MachO/InputFiles.cpp | 17 +++++-- lld/MachO/InputSection.cpp | 19 +++++++ lld/MachO/InputSection.h | 19 +++++++ lld/MachO/SyntheticSections.cpp | 58 +++++++++++++++++++++ lld/MachO/SyntheticSections.h | 55 ++++++++++++++++++++ lld/MachO/Writer.cpp | 23 ++++++--- lld/test/MachO/literal-dedup.s | 110 ++++++++++++++++++++++++++++++++++++++++ lld/test/MachO/mattrs.ll | 4 +- 8 files changed, 291 insertions(+), 14 deletions(-) create mode 100644 lld/test/MachO/literal-dedup.s diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index cbbaf04..4d93878 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -265,16 +265,25 @@ void ObjFile::parseSections(ArrayRef
sections) { auto *buf = reinterpret_cast(mb.getBufferStart()); for (const Section &sec : sections) { - if (config->dedupLiterals && sectionType(sec.flags) == S_CSTRING_LITERALS) { + if (config->dedupLiterals && + (sectionType(sec.flags) == S_CSTRING_LITERALS || + isWordLiteralSection(sec.flags))) { if (sec.nreloc) fatal(toString(this) + " contains relocations in " + sec.segname + "," + sec.sectname + ", so LLD cannot deduplicate literals. Try re-running without " "--deduplicate-literals."); - auto *isec = make(); - parseSection(this, buf, sec, isec); - isec->splitIntoPieces(); // FIXME: parallelize this? + InputSection *isec; + if (sectionType(sec.flags) == S_CSTRING_LITERALS) { + isec = make(); + parseSection(this, buf, sec, isec); + // FIXME: parallelize this? + cast(isec)->splitIntoPieces(); + } else { + isec = make(); + parseSection(this, buf, sec, isec); + } subsections.push_back({{0, isec}}); } else { auto *isec = make(); diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index 0b08e4f..29e5045 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -127,6 +127,25 @@ uint64_t CStringInputSection::getOffset(uint64_t off) const { return piece.outSecOff + addend; } +uint64_t WordLiteralInputSection::getFileOffset(uint64_t off) const { + return parent->fileOff + getOffset(off); +} + +uint64_t WordLiteralInputSection::getOffset(uint64_t off) const { + auto *osec = cast(parent); + const uint8_t *buf = data.data(); + switch (sectionType(flags)) { + case S_4BYTE_LITERALS: + return osec->getLiteral4Offset(buf + off); + case S_8BYTE_LITERALS: + return osec->getLiteral8Offset(buf + off); + case S_16BYTE_LITERALS: + return osec->getLiteral16Offset(buf + off); + default: + llvm_unreachable("invalid literal section type"); + } +} + bool macho::isCodeSection(const InputSection *isec) { uint32_t type = sectionType(isec->flags); if (type != S_REGULAR && type != S_COALESCED) diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h index d8f283c..539c60a 100644 --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -28,6 +28,7 @@ public: enum Kind { ConcatKind, CStringLiteralKind, + WordLiteralKind, }; Kind kind() const { return sectionKind; } @@ -146,6 +147,17 @@ public: std::vector pieces; }; +class WordLiteralInputSection : public InputSection { +public: + WordLiteralInputSection() : InputSection(WordLiteralKind) {} + uint64_t getFileOffset(uint64_t off) const override; + uint64_t getOffset(uint64_t off) const override; + + static bool classof(const InputSection *isec) { + return isec->kind() == WordLiteralKind; + } +}; + inline uint8_t sectionType(uint32_t flags) { return flags & llvm::MachO::SECTION_TYPE; } @@ -169,6 +181,12 @@ inline bool isDebugSection(uint32_t flags) { llvm::MachO::S_ATTR_DEBUG; } +inline bool isWordLiteralSection(uint32_t flags) { + return sectionType(flags) == llvm::MachO::S_4BYTE_LITERALS || + sectionType(flags) == llvm::MachO::S_8BYTE_LITERALS || + sectionType(flags) == llvm::MachO::S_16BYTE_LITERALS; +} + bool isCodeSection(const InputSection *); extern std::vector inputSections; @@ -197,6 +215,7 @@ constexpr const char indirectSymbolTable[] = "__ind_sym_tab"; constexpr const char const_[] = "__const"; constexpr const char lazySymbolPtr[] = "__la_symbol_ptr"; constexpr const char lazyBinding[] = "__lazy_binding"; +constexpr const char literals[] = "__literals"; constexpr const char moduleInitFunc[] = "__mod_init_func"; constexpr const char moduleTermFunc[] = "__mod_term_func"; constexpr const char nonLazySymbolPtr[] = "__nl_symbol_ptr"; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index cf5d07a..570a2da 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1141,6 +1141,64 @@ void CStringSection::finalize() { } } +// This section is actually emitted as __TEXT,__const by ld64, but clang may +// emit input sections of that name, and LLD doesn't currently support mixing +// synthetic and concat-type OutputSections. To work around this, I've given +// our merged-literals section a different name. +WordLiteralSection::WordLiteralSection() + : SyntheticSection(segment_names::text, section_names::literals) { + align = 16; +} + +void WordLiteralSection::addInput(WordLiteralInputSection *isec) { + isec->parent = this; + // We do all processing of the InputSection here, so it will be effectively + // finalized. + isec->isFinal = true; + const uint8_t *buf = isec->data.data(); + switch (sectionType(isec->flags)) { + case S_4BYTE_LITERALS: { + for (size_t i = 0, e = isec->data.size() / 4; i < e; ++i) { + uint32_t value = *reinterpret_cast(buf + i * 4); + literal4Map.emplace(value, literal4Map.size()); + } + break; + } + case S_8BYTE_LITERALS: { + for (size_t i = 0, e = isec->data.size() / 8; i < e; ++i) { + uint64_t value = *reinterpret_cast(buf + i * 8); + literal8Map.emplace(value, literal8Map.size()); + } + break; + } + case S_16BYTE_LITERALS: { + for (size_t i = 0, e = isec->data.size() / 16; i < e; ++i) { + UInt128 value = *reinterpret_cast(buf + i * 16); + literal16Map.emplace(value, literal16Map.size()); + } + break; + } + default: + llvm_unreachable("invalid literal section type"); + } +} + +void WordLiteralSection::writeTo(uint8_t *buf) const { + // Note that we don't attempt to do any endianness conversion in addInput(), + // so we don't do it here either -- just write out the original value, + // byte-for-byte. + for (const auto &p : literal16Map) + memcpy(buf + p.second * 16, &p.first, 16); + buf += literal16Map.size() * 16; + + for (const auto &p : literal8Map) + memcpy(buf + p.second * 8, &p.first, 8); + buf += literal8Map.size() * 8; + + for (const auto &p : literal4Map) + memcpy(buf + p.second * 4, &p.first, 4); +} + void macho::createSyntheticSymbols() { auto addHeaderSymbol = [](const char *name) { symtab->addSynthetic(name, in.header->isec, /*value=*/0, diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index 6577238..ca78ce8 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -22,6 +22,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include + namespace llvm { class DWARFUnit; } // namespace llvm @@ -297,6 +299,7 @@ public: // have a corresponding entry in the LazyPointerSection. bool addEntry(Symbol *); uint64_t getVA(uint32_t stubsIndex) const { + assert(isFinal || target->usesThunks()); // ConcatOutputSection::finalize() can seek the address of a // stub before its address is assigned. Before __stubs is // finalized, return a contrived out-of-range address. @@ -531,9 +534,61 @@ private: llvm::StringTableBuilder builder; }; +/* + * This section contains deduplicated literal values. The 16-byte values are + * laid out first, followed by the 8- and then the 4-byte ones. + */ +class WordLiteralSection : public SyntheticSection { +public: + using UInt128 = std::pair; + // I don't think the standard guarantees the size of a pair, so let's make + // sure it's exact -- that way we can construct it via `mmap`. + static_assert(sizeof(UInt128) == 16, ""); + + WordLiteralSection(); + void addInput(WordLiteralInputSection *); + void writeTo(uint8_t *buf) const override; + + uint64_t getSize() const override { + return literal16Map.size() * 16 + literal8Map.size() * 8 + + literal4Map.size() * 4; + } + + bool isNeeded() const override { + return !literal16Map.empty() || !literal4Map.empty() || + !literal8Map.empty(); + } + + uint64_t getLiteral16Offset(const uint8_t *buf) const { + return literal16Map.at(*reinterpret_cast(buf)) * 16; + } + + uint64_t getLiteral8Offset(const uint8_t *buf) const { + return literal16Map.size() * 16 + + literal8Map.at(*reinterpret_cast(buf)) * 8; + } + + uint64_t getLiteral4Offset(const uint8_t *buf) const { + return literal16Map.size() * 16 + literal8Map.size() * 8 + + literal4Map.at(*reinterpret_cast(buf)) * 4; + } + +private: + template struct Hasher { + llvm::hash_code operator()(T v) const { return llvm::hash_value(v); } + }; + // We're using unordered_map instead of DenseMap here because we need to + // support all possible integer values -- there are no suitable tombstone + // values for DenseMap. + std::unordered_map> literal16Map; + std::unordered_map literal8Map; + std::unordered_map literal4Map; +}; + struct InStruct { MachHeaderSection *header = nullptr; CStringSection *cStringSection = nullptr; + WordLiteralSection *wordLiteralSection = nullptr; RebaseSection *rebase = nullptr; BindingSection *binding = nullptr; WeakBindingSection *weakBinding = nullptr; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 5ba50a0..1a1c214 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -863,19 +863,24 @@ template void Writer::createOutputSections() { InputSection *isec = p.value(); if (isec->shouldOmitFromOutput()) continue; + OutputSection *osec; if (auto *concatIsec = dyn_cast(isec)) { NamePair names = maybeRenameSection({isec->segname, isec->name}); - ConcatOutputSection *&osec = concatOutputSections[names]; - if (osec == nullptr) { - osec = make(names.second); - osec->inputOrder = p.index(); - } - osec->addInput(concatIsec); + ConcatOutputSection *&concatOsec = concatOutputSections[names]; + if (concatOsec == nullptr) + concatOsec = make(names.second); + concatOsec->addInput(concatIsec); + osec = concatOsec; } else if (auto *cStringIsec = dyn_cast(isec)) { - if (in.cStringSection->inputs.empty()) - in.cStringSection->inputOrder = p.index(); in.cStringSection->addInput(cStringIsec); + osec = in.cStringSection; + } else if (auto *litIsec = dyn_cast(isec)) { + in.wordLiteralSection->addInput(litIsec); + osec = in.wordLiteralSection; + } else { + llvm_unreachable("unhandled InputSection type"); } + osec->inputOrder = std::min(osec->inputOrder, static_cast(p.index())); } // Once all the inputs are added, we can finalize the output section @@ -1050,6 +1055,8 @@ template void macho::writeResult() { Writer().run(); } void macho::createSyntheticSections() { in.header = make(); in.cStringSection = config->dedupLiterals ? make() : nullptr; + in.wordLiteralSection = + config->dedupLiterals ? make() : nullptr; in.rebase = make(); in.binding = make(); in.weakBinding = make(); diff --git a/lld/test/MachO/literal-dedup.s b/lld/test/MachO/literal-dedup.s new file mode 100644 index 0000000..df74e70 --- /dev/null +++ b/lld/test/MachO/literal-dedup.s @@ -0,0 +1,110 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/qux.s -o %t/qux.o +# RUN: %lld -dylib --deduplicate-literals %t/test.o %t/qux.o -o %t/test +# RUN: llvm-objdump --macho --section="__TEXT,__literals" --section="__DATA,ptrs" --syms %t/test | FileCheck %s +# RUN: llvm-readobj --section-headers %t/test | FileCheck %s --check-prefix=HEADER + +# CHECK: Contents of (__TEXT,__literals) section +# CHECK-NEXT: [[#%.16x,DEADBEEF16:]] ef be ad de ef be ad de ef be ad de ef be ad de +# CHECK-NEXT: [[#%.16x,FEEDFACE16:]] ce fa ed fe ce fa ed fe ce fa ed fe ce fa ed fe +# CHECK-NEXT: [[#%.16x,DEADBEEF8:]] ef be ad de ef be ad de ce fa ed fe ce fa ed fe +# CHECK-NEXT: [[#%.16x,DEADBEEF4:]] ef be ad de ce fa ed fe +# CHECK-NEXT: Contents of (__DATA,ptrs) section +# CHECK-NEXT: 0000000000001000 0x[[#%x,DEADBEEF16]] +# CHECK-NEXT: 0000000000001008 0x[[#%x,DEADBEEF16]] +# CHECK-NEXT: 0000000000001010 0x[[#%x,FEEDFACE16]] +# CHECK-NEXT: 0000000000001018 0x[[#%x,DEADBEEF16]] +# CHECK-NEXT: 0000000000001020 0x[[#%x,DEADBEEF8]] +# CHECK-NEXT: 0000000000001028 0x[[#%x,DEADBEEF8]] +# CHECK-NEXT: 0000000000001030 0x[[#%x,DEADBEEF8 + 8]] +# CHECK-NEXT: 0000000000001038 0x[[#%x,DEADBEEF8]] +# CHECK-NEXT: 0000000000001040 0x[[#%x,DEADBEEF4]] +# CHECK-NEXT: 0000000000001048 0x[[#%x,DEADBEEF4]] +# CHECK-NEXT: 0000000000001050 0x[[#%x,DEADBEEF4 + 4]] +# CHECK-NEXT: 0000000000001058 0x[[#%x,DEADBEEF4]] + +## Make sure the symbol addresses are correct too. +# CHECK: SYMBOL TABLE: +# CHECK-DAG: [[#DEADBEEF16]] g O __TEXT,__literals _qux16 +# CHECK-DAG: [[#DEADBEEF8]] g O __TEXT,__literals _qux8 +# CHECK-DAG: [[#DEADBEEF4]] g O __TEXT,__literals _qux4 + +## Make sure we set the right alignment and flags. +# HEADER: Name: __literals +# HEADER-NEXT: Segment: __TEXT +# HEADER-NEXT: Address: +# HEADER-NEXT: Size: +# HEADER-NEXT: Offset: +# HEADER-NEXT: Alignment: 4 +# HEADER-NEXT: RelocationOffset: +# HEADER-NEXT: RelocationCount: 0 +# HEADER-NEXT: Type: Regular +# HEADER-NEXT: Attributes [ (0x0) +# HEADER-NEXT: ] +# HEADER-NEXT: Reserved1: 0x0 +# HEADER-NEXT: Reserved2: 0x0 +# HEADER-NEXT: Reserved3: 0x0 + +#--- test.s +.literal4 +.p2align 2 +L._foo4: + .long 0xdeadbeef +L._bar4: + .long 0xdeadbeef +L._baz4: + .long 0xfeedface + +.literal8 +L._foo8: + .quad 0xdeadbeefdeadbeef +L._bar8: + .quad 0xdeadbeefdeadbeef +L._baz8: + .quad 0xfeedfacefeedface + +.literal16 +L._foo16: + .quad 0xdeadbeefdeadbeef + .quad 0xdeadbeefdeadbeef +L._bar16: + .quad 0xdeadbeefdeadbeef + .quad 0xdeadbeefdeadbeef +L._baz16: + .quad 0xfeedfacefeedface + .quad 0xfeedfacefeedface + +.section __DATA,ptrs,literal_pointers +.quad L._foo16 +.quad L._bar16 +.quad L._baz16 +.quad _qux16 + +.quad L._foo8 +.quad L._bar8 +.quad L._baz8 +.quad _qux8 + +.quad L._foo4 +.quad L._bar4 +.quad L._baz4 +.quad _qux4 + +#--- qux.s +.globl _qux4, _qux8, _qux16 + +.literal4 +.p2align 2 +_qux4: + .long 0xdeadbeef + +.literal8 +_qux8: + .quad 0xdeadbeefdeadbeef + +.literal16 +_qux16: + .quad 0xdeadbeefdeadbeef + .quad 0xdeadbeefdeadbeef diff --git a/lld/test/MachO/mattrs.ll b/lld/test/MachO/mattrs.ll index ed49b5d..25ed6f0 100644 --- a/lld/test/MachO/mattrs.ll +++ b/lld/test/MachO/mattrs.ll @@ -11,14 +11,14 @@ ; FMA: <_foo>: ; FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-NEXT: vfmsub213ss 7(%rip), %xmm1, %xmm0 +; FMA-NEXT: vfmsub213ss [[#]](%rip), %xmm1, %xmm0 ; FMA-NEXT: vfnmadd132ss %xmm1, %xmm1, %xmm0 ; FMA-NEXT: retq ; NO-FMA: <_foo>: ; NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; NO-FMA-NEXT: vmovss 16(%rip), %xmm2 +; NO-FMA-NEXT: vmovss [[#]](%rip), %xmm2 ; NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -- 2.7.4