From 7f60ed12effade437028b938ea463f37020f89fd Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Wed, 21 Dec 2022 17:44:45 -0500 Subject: [PATCH] [reland][lld-macho] Private label aliases to weak symbols should not retain section data This reverts commit a650f2ec7a37cf1f495108bbb313e948c232c29c. The crashes it was causing will be fixed by the stacked diff {D140606}. --- lld/MachO/InputFiles.cpp | 30 +++++++++-- lld/MachO/UnwindInfoSection.cpp | 2 +- lld/test/MachO/weak-def-alias-private-label.s | 75 +++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 lld/test/MachO/weak-def-alias-private-label.s diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 93e5ee6..c903c98 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -628,6 +628,12 @@ void ObjFile::parseRelocations(ArrayRef sectionHeaders, } } +// Symbols with `l` or `L` as a prefix are linker-private and never appear in +// the output. +static bool isPrivateLabel(StringRef name) { + return name.startswith("l") || name.startswith("L"); +} + template static macho::Symbol *createDefined(const NList &sym, StringRef name, InputSection *isec, uint64_t value, @@ -697,8 +703,7 @@ static macho::Symbol *createDefined(const NList &sym, StringRef name, } assert(!isWeakDefCanBeHidden && "weak_def_can_be_hidden on already-hidden symbol?"); - bool includeInSymtab = - !name.startswith("l") && !name.startswith("L") && !isEhFrameSection(isec); + bool includeInSymtab = !isPrivateLabel(name) && !isEhFrameSection(isec); return make( name, isec->getFile(), isec, value, size, sym.n_desc & N_WEAK_DEF, /*isExternal=*/false, /*isPrivateExtern=*/false, includeInSymtab, @@ -831,17 +836,25 @@ void ObjFile::parseSymbols(ArrayRef sectionHeaders, } sections[i]->doneSplitting = true; + auto getSymName = [strtab](const NList& sym) -> StringRef { + return StringRef(strtab + sym.n_strx); + }; + // Calculate symbol sizes and create subsections by splitting the sections // along symbol boundaries. // We populate subsections by repeatedly splitting the last (highest // address) subsection. llvm::stable_sort(symbolIndices, [&](uint32_t lhs, uint32_t rhs) { + // Put private-label symbols after other symbols at the same address. + if (nList[lhs].n_value == nList[rhs].n_value) + return !isPrivateLabel(getSymName(nList[lhs])) && + isPrivateLabel(getSymName(nList[rhs])); return nList[lhs].n_value < nList[rhs].n_value; }); for (size_t j = 0; j < symbolIndices.size(); ++j) { const uint32_t symIndex = symbolIndices[j]; const NList &sym = nList[symIndex]; - StringRef name = strtab + sym.n_strx; + StringRef name = getSymName(sym); Subsection &subsec = subsections.back(); InputSection *isec = subsec.isec; @@ -861,8 +874,15 @@ void ObjFile::parseSymbols(ArrayRef sectionHeaders, if (!subsectionsViaSymbols || symbolOffset == 0 || sym.n_desc & N_ALT_ENTRY || !isa(isec)) { isec->hasAltEntry = symbolOffset != 0; - symbols[symIndex] = createDefined(sym, name, isec, symbolOffset, - symbolSize, forceHidden); + // If we have an private-label symbol that's an alias, just reuse the + // aliased symbol. Our sorting step above ensures that any such symbols + // will appear after the non-private-label ones. See + // weak-def-alias-ignored.s for the motivation behind this. + if (symbolOffset == 0 && isPrivateLabel(name) && j != 0) + symbols[symIndex] = symbols[symbolIndices[j - 1]]; + else + symbols[symIndex] = createDefined(sym, name, isec, symbolOffset, + symbolSize, forceHidden); continue; } auto *concatIsec = cast(isec); diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp index 8ad2aeb9..470f335 100644 --- a/lld/MachO/UnwindInfoSection.cpp +++ b/lld/MachO/UnwindInfoSection.cpp @@ -209,7 +209,7 @@ void UnwindInfoSection::addSymbol(const Defined *d) { // If we have multiple symbols at the same address, only one of them can have // an associated unwind entry. if (!p.second && d->unwindEntry) { - assert(!p.first->second->unwindEntry); + assert(p.first->second == d || !p.first->second->unwindEntry); p.first->second = d; } } diff --git a/lld/test/MachO/weak-def-alias-private-label.s b/lld/test/MachO/weak-def-alias-private-label.s new file mode 100644 index 0000000..32067b7 --- /dev/null +++ b/lld/test/MachO/weak-def-alias-private-label.s @@ -0,0 +1,75 @@ +# REQUIRES: x86 +## This test checks that when we coalesce weak definitions, any private-label +## aliases to those weak defs don't cause the coalesced data to be retained. +## This test explicitly creates those private-label symbols, but it was actually +## motivated by MC's aarch64 backend which automatically creates them when +## emitting object files. I've chosen to explicitly create them here since we +## can then reference those symbols for a more complete test. +## +## Not retaining the data matters for more than just size -- we have a use case +## that depends on proper data coalescing to emit a valid file format. +## +## ld64 actually treats all local symbol aliases (not just the private ones) the +## same way. But implementing this is harder -- we would have to create those +## symbols first (so we can emit their names later), but we would have to +## ensure the linker correctly shuffles them around when their aliasees get +## coalesced. Emulating the behavior of weak binds for non-private symbols would +## be even trickier. Let's just deal with private-label symbols for now until we +## find a use case for more general local symbols. +## +## Finally, ld64 does all this regardless of whether .subsections_via_symbols is +## specified. We don't. But again, given how rare the lack of that directive is +## (I've only seen it from hand-written assembly inputs), I don't think we need +## to worry about it. + +# RUN: rm -rf %t; split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/weak-then-private.s -o %t/weak-then-private.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/private-then-weak.s -o %t/private-then-weak.o +# RUN: %lld -dylib %t/weak-then-private.o %t/private-then-weak.o -o %t/test1 +# RUN: %lld -dylib %t/private-then-weak.o %t/weak-then-private.o -o %t/test2 +# RUN: llvm-objdump --macho --syms --section="__DATA,__data" --weak-bind %t/test1 | FileCheck %s +# RUN: llvm-objdump --macho --syms --section="__DATA,__data" --weak-bind %t/test2 | FileCheck %s + +## Check that we only have one copy of 0x123 in the data, not two. +# CHECK: Contents of (__DATA,__data) section +# CHECK-NEXT: 0000000000001000 23 01 00 00 00 00 00 00 00 10 00 00 00 00 00 00 {{$}} +# CHECK-NEXT: 0000000000001010 00 10 00 00 00 00 00 00 {{$}} +# CHECK-EMPTY: +# CHECK-NEXT: SYMBOL TABLE: +# CHECK-NEXT: 0000000000001008 l O __DATA,__data _ref +# CHECK-NEXT: 0000000000001010 l O __DATA,__data _ref +# CHECK-NEXT: 0000000000001000 w O __DATA,__data _weak +# CHECK-NEXT: 0000000000000000 *UND* dyld_stub_binder +# CHECK-EMPTY: +## Even though the references were to the non-weak `l_ignored` aliases, we +## should still emit weak binds as if they were the `_weak` symbol itself. +# CHECK-NEXT: Weak bind table: +# CHECK-NEXT: segment section address type addend symbol +# CHECK-NEXT: __DATA __data 0x00001008 pointer 0 _weak +# CHECK-NEXT: __DATA __data 0x00001010 pointer 0 _weak + +#--- weak-then-private.s +.globl _weak +.weak_definition _weak +.data +_weak: +l_ignored: + .quad 0x123 + +_ref: + .quad l_ignored + +.subsections_via_symbols + +#--- private-then-weak.s +.globl _weak +.weak_definition _weak +.data +l_ignored: +_weak: + .quad 0x123 + +_ref: + .quad l_ignored + +.subsections_via_symbols -- 2.7.4