Handle overflow beyond the 127 common encodings limit

author Greg McGary <gkm@fb.com>

Mon, 7 Dec 2020 06:33:38 +0000 (22:33 -0800)

committer Greg McGary <gkm@fb.com>

Sat, 19 Dec 2020 22:54:37 +0000 (14:54 -0800)
author Greg McGary <gkm@fb.com>
Mon, 7 Dec 2020 06:33:38 +0000 (22:33 -0800)
committer Greg McGary <gkm@fb.com>
Sat, 19 Dec 2020 22:54:37 +0000 (14:54 -0800)
diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp

index ed6cf05..acb4a9d 100644 (file)
--- a/lld/MachO/UnwindInfoSection.cpp
+++ b/lld/MachO/UnwindInfoSection.cpp
@@ -25,6 +25,24 @@ using namespace llvm::MachO;
  using namespace lld;
  using namespace lld::macho;
  
+#define COMMON_ENCODINGS_MAX 127
+#define COMPACT_ENCODINGS_MAX 256
+
+#define SECOND_LEVEL_PAGE_BYTES 4096
+#define SECOND_LEVEL_PAGE_WORDS (SECOND_LEVEL_PAGE_BYTES / sizeof(uint32_t))
+#define REGULAR_SECOND_LEVEL_ENTRIES_MAX                                       \
+  ((SECOND_LEVEL_PAGE_BYTES -                                                  \
+    sizeof(unwind_info_regular_second_level_page_header)) /                    \
+   sizeof(unwind_info_regular_second_level_entry))
+#define COMPRESSED_SECOND_LEVEL_ENTRIES_MAX                                    \
+  ((SECOND_LEVEL_PAGE_BYTES -                                                  \
+    sizeof(unwind_info_compressed_second_level_page_header)) /                 \
+   sizeof(uint32_t))
+
+#define COMPRESSED_ENTRY_FUNC_OFFSET_BITS 24
+#define COMPRESSED_ENTRY_FUNC_OFFSET_MASK                                      \
+  UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET(~0)
+
  // Compact Unwind format is a Mach-O evolution of DWARF Unwind that
  // optimizes space and exception-time lookup.  Most DWARF unwind
  // entries can be replaced with Compact Unwind entries, but the ones
@@ -101,7 +119,7 @@ void UnwindInfoSection::finalize() {
    // Rather than sort & fold the 32-byte entries directly, we create a
    // vector of pointers to entries and sort & fold that instead.
    cuPtrVector.reserve(cuCount);
-  for (const auto &cuEntry : cuVector)
+  for (const CompactUnwindEntry64 &cuEntry : cuVector)
      cuPtrVector.emplace_back(&cuEntry);
    std::sort(cuPtrVector.begin(), cuPtrVector.end(),
              [](const CompactUnwindEntry64 *a, const CompactUnwindEntry64 *b) {
@@ -129,13 +147,11 @@ void UnwindInfoSection::finalize() {
    cuPtrVector.erase(foldWrite, cuPtrVector.end());
  
    // Count frequencies of the folded encodings
-  llvm::DenseMap<compact_unwind_encoding_t, size_t> encodingFrequencies;
+  EncodingMap encodingFrequencies;
    for (auto cuPtrEntry : cuPtrVector)
      encodingFrequencies[cuPtrEntry->encoding]++;
-  if (encodingFrequencies.size() > UNWIND_INFO_COMMON_ENCODINGS_MAX)
-    error("TODO(gkm): handle common encodings table overflow");
  
-  // Make a table of encodings, sorted by descending frequency
+  // Make a vector of encodings, sorted by descending frequency
    for (const auto &frequency : encodingFrequencies)
      commonEncodings.emplace_back(frequency);
    std::sort(commonEncodings.begin(), commonEncodings.end(),
@@ -148,37 +164,67 @@ void UnwindInfoSection::finalize() {
                return a.second > b.second;
              });
  
-  // Split folded encodings into pages, limited by capacity of a page
-  // and the 24-bit range of function offset
-  //
-  // Record the page splits as a vector of iterators on cuPtrVector
-  // such that successive elements form a semi-open interval. E.g.,
-  // page X's bounds are thus: [ pageBounds[X] .. pageBounds[X+1] )
-  //
-  // Note that pageBounds.size() is one greater than the number of
-  // pages, and pageBounds.back() holds the sentinel cuPtrVector.cend()
-  pageBounds.push_back(cuPtrVector.cbegin());
-  // TODO(gkm): cut 1st page entries short to accommodate section headers ???
-  CompactUnwindEntry64 cuEntryKey;
-  for (size_t i = 0;;) {
-    // Limit the search to entries that can fit within a 4 KiB page.
-    const auto pageBegin = pageBounds[0] + i;
-    const auto pageMax =
-        pageBounds[0] +
-        std::min(i + UNWIND_INFO_COMPRESSED_SECOND_LEVEL_ENTRIES_MAX,
-                 cuPtrVector.size());
-    // Exclude entries with functionOffset that would overflow 24 bits
-    cuEntryKey.functionAddress = (*pageBegin)->functionAddress +
-                                 UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET_MASK;
-    const auto pageBreak = std::lower_bound(
-        pageBegin, pageMax, &cuEntryKey,
-        [](const CompactUnwindEntry64 *a, const CompactUnwindEntry64 *b) {
-          return a->functionAddress < b->functionAddress;
-        });
-    pageBounds.push_back(pageBreak);
-    if (pageBreak == cuPtrVector.cend())
-      break;
-    i = pageBreak - cuPtrVector.cbegin();
+  // Truncate the vector to 127 elements.
+  // Common encoding indexes are limited to 0..126, while enconding
+  // indexes 127..255 are local to each second-level page
+  if (commonEncodings.size() > COMMON_ENCODINGS_MAX)
+    commonEncodings.resize(COMMON_ENCODINGS_MAX);
+
+  // Create a map from encoding to common-encoding-table index
+  for (size_t i = 0; i < commonEncodings.size(); i++)
+    commonEncodingIndexes[commonEncodings[i].first] = i;
+
+  // Split folded encodings into pages, where each page is limited by ...
+  // (a) 4 KiB capacity
+  // (b) 24-bit difference between first & final function address
+  // (c) 8-bit compact-encoding-table index,
+  //     for which 0..126 references the global common-encodings table,
+  //     and 127..255 references a local per-second-level-page table.
+  // First we try the compact format and determine how many entries fit.
+  // If more entries fit in the regular format, we use that.
+  for (size_t i = 0; i < cuPtrVector.size();) {
+    secondLevelPages.emplace_back();
+    auto &page = secondLevelPages.back();
+    page.entryIndex = i;
+    uintptr_t functionAddressMax =
+        cuPtrVector[i]->functionAddress + COMPRESSED_ENTRY_FUNC_OFFSET_MASK;
+    size_t n = commonEncodings.size();
+    size_t wordsRemaining =
+        SECOND_LEVEL_PAGE_WORDS -
+        sizeof(unwind_info_compressed_second_level_page_header) /
+            sizeof(uint32_t);
+    while (wordsRemaining >= 1 && i < cuPtrVector.size()) {
+      const auto *cuPtr = cuPtrVector[i];
+      if (cuPtr->functionAddress >= functionAddressMax) {
+        break;
+      } else if (commonEncodingIndexes.count(cuPtr->encoding) ||
+                 page.localEncodingIndexes.count(cuPtr->encoding)) {
+        i++;
+        wordsRemaining--;
+      } else if (wordsRemaining >= 2 && n < COMPACT_ENCODINGS_MAX) {
+        page.localEncodings.emplace_back(cuPtr->encoding);
+        page.localEncodingIndexes[cuPtr->encoding] = n++;
+        i++;
+        wordsRemaining -= 2;
+      } else {
+        break;
+      }
+    }
+    page.entryCount = i - page.entryIndex;
+
+    // If this is not the final page, see if it's possible to fit more
+    // entries by using the regular format. This can happen when there
+    // are many unique encodings, and we we saturated the local
+    // encoding table early.
+    if (i < cuPtrVector.size() &&
+        page.entryCount < REGULAR_SECOND_LEVEL_ENTRIES_MAX) {
+      page.kind = UNWIND_SECOND_LEVEL_REGULAR;
+      page.entryCount = std::min(REGULAR_SECOND_LEVEL_ENTRIES_MAX,
+                                 cuPtrVector.size() - page.entryIndex);
+      i = page.entryIndex + page.entryCount;
+    } else {
+      page.kind = UNWIND_SECOND_LEVEL_COMPRESSED;
+    }
    }
  
    // compute size of __TEXT,__unwind_info section
@@ -186,12 +232,12 @@ void UnwindInfoSection::finalize() {
        sizeof(unwind_info_section_header) +
        commonEncodings.size() * sizeof(uint32_t) +
        personalities.size() * sizeof(uint32_t) +
-      pageBounds.size() * sizeof(unwind_info_section_header_index_entry) +
+      // The extra second-level-page entry is for the sentinel
+      (secondLevelPages.size() + 1) *
+          sizeof(unwind_info_section_header_index_entry) +
        lsdaEntries.size() * sizeof(unwind_info_section_header_lsda_index_entry);
-  unwindInfoSize = level2PagesOffset +
-                   (pageBounds.size() - 1) *
-                       sizeof(unwind_info_compressed_second_level_page_header) +
-                   cuPtrVector.size() * sizeof(uint32_t);
+  unwindInfoSize =
+      level2PagesOffset + secondLevelPages.size() * SECOND_LEVEL_PAGE_BYTES;
  }
  
  // All inputs are relocated and output addresses are known, so write!
@@ -208,7 +254,7 @@ void UnwindInfoSection::writeTo(uint8_t *buf) const {
    uip->personalityArrayCount = personalities.size();
    uip->indexSectionOffset = uip->personalityArraySectionOffset +
                              (uip->personalityArrayCount * sizeof(uint32_t));
-  uip->indexCount = pageBounds.size();
+  uip->indexCount = secondLevelPages.size() + 1;
  
    // Common encodings
    auto *i32p = reinterpret_cast<uint32_t *>(&uip[1]);
@@ -216,7 +262,7 @@ void UnwindInfoSection::writeTo(uint8_t *buf) const {
      *i32p++ = encoding.first;
  
    // Personalities
-  for (const auto &personality : personalities)
+  for (const uint32_t &personality : personalities)
      *i32p++ = personality;
  
    // Level-1 index
@@ -225,16 +271,12 @@ void UnwindInfoSection::writeTo(uint8_t *buf) const {
        uip->indexCount * sizeof(unwind_info_section_header_index_entry);
    uint64_t l2PagesOffset = level2PagesOffset;
    auto *iep = reinterpret_cast<unwind_info_section_header_index_entry *>(i32p);
-  for (size_t i = 0; i < pageBounds.size() - 1; i++) {
-    iep->functionOffset = (*pageBounds[i])->functionAddress;
+  for (const SecondLevelPage &page : secondLevelPages) {
+    iep->functionOffset = cuPtrVector[page.entryIndex]->functionAddress;
      iep->secondLevelPagesSectionOffset = l2PagesOffset;
      iep->lsdaIndexArraySectionOffset = lsdaOffset;
      iep++;
-    // TODO(gkm): pad to 4 KiB page boundary ???
-    size_t entryCount = pageBounds[i + 1] - pageBounds[i];
-    uint64_t pageSize = sizeof(unwind_info_section_header_index_entry) +
-                        entryCount * sizeof(uint32_t);
-    l2PagesOffset += pageSize;
+    l2PagesOffset += SECOND_LEVEL_PAGE_BYTES;
    }
    // Level-1 sentinel
    const CompactUnwindEntry64 &cuEnd = cuVector.back();
@@ -246,41 +288,52 @@ void UnwindInfoSection::writeTo(uint8_t *buf) const {
    // LSDAs
    auto *lep =
        reinterpret_cast<unwind_info_section_header_lsda_index_entry *>(iep);
-  for (const auto &lsda : lsdaEntries) {
+  for (const unwind_info_section_header_lsda_index_entry &lsda : lsdaEntries) {
      lep->functionOffset = lsda.functionOffset;
      lep->lsdaOffset = lsda.lsdaOffset;
    }
  
-  // create map from encoding to common-encoding-table index compact
-  // encoding entries use 7 bits to index the common-encoding table
-  size_t i = 0;
-  llvm::DenseMap<compact_unwind_encoding_t, size_t> commonEncodingIndexes;
-  for (const auto &encoding : commonEncodings)
-    commonEncodingIndexes[encoding.first] = i++;
-
    // Level-2 pages
-  auto *p2p =
-      reinterpret_cast<unwind_info_compressed_second_level_page_header *>(lep);
-  for (size_t i = 0; i < pageBounds.size() - 1; i++) {
-    p2p->kind = UNWIND_SECOND_LEVEL_COMPRESSED;
-    p2p->entryPageOffset =
-        sizeof(unwind_info_compressed_second_level_page_header);
-    p2p->entryCount = pageBounds[i + 1] - pageBounds[i];
-    p2p->encodingsPageOffset =
-        p2p->entryPageOffset + p2p->entryCount * sizeof(uint32_t);
-    p2p->encodingsCount = 0;
-    auto *ep = reinterpret_cast<uint32_t *>(&p2p[1]);
-    auto cuPtrVectorIt = pageBounds[i];
-    uintptr_t functionAddressBase = (*cuPtrVectorIt)->functionAddress;
-    while (cuPtrVectorIt < pageBounds[i + 1]) {
-      const CompactUnwindEntry64 *cuep = *cuPtrVectorIt++;
-      size_t cueIndex = commonEncodingIndexes.lookup(cuep->encoding);
-      *ep++ = ((cueIndex << UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET_BITS) |
-               (cuep->functionAddress - functionAddressBase));
+  auto *pp = reinterpret_cast<uint32_t *>(lep);
+  for (const SecondLevelPage &page : secondLevelPages) {
+    if (page.kind == UNWIND_SECOND_LEVEL_COMPRESSED) {
+      uintptr_t functionAddressBase =
+          cuPtrVector[page.entryIndex]->functionAddress;
+      auto *p2p =
+          reinterpret_cast<unwind_info_compressed_second_level_page_header *>(
+              pp);
+      p2p->kind = page.kind;
+      p2p->entryPageOffset =
+          sizeof(unwind_info_compressed_second_level_page_header);
+      p2p->entryCount = page.entryCount;
+      p2p->encodingsPageOffset =
+          p2p->entryPageOffset + p2p->entryCount * sizeof(uint32_t);
+      p2p->encodingsCount = page.localEncodings.size();
+      auto *ep = reinterpret_cast<uint32_t *>(&p2p[1]);
+      for (size_t i = 0; i < page.entryCount; i++) {
+        const CompactUnwindEntry64 *cuep = cuPtrVector[page.entryIndex + i];
+        auto it = commonEncodingIndexes.find(cuep->encoding);
+        if (it == commonEncodingIndexes.end())
+          it = page.localEncodingIndexes.find(cuep->encoding);
+        *ep++ = (it->second << COMPRESSED_ENTRY_FUNC_OFFSET_BITS) |
+                (cuep->functionAddress - functionAddressBase);
+      }
+      memcpy(ep, page.localEncodings.data(),
+             page.localEncodings.size() * sizeof(uint32_t));
+    } else {
+      auto *p2p =
+          reinterpret_cast<unwind_info_regular_second_level_page_header *>(pp);
+      p2p->kind = page.kind;
+      p2p->entryPageOffset =
+          sizeof(unwind_info_regular_second_level_page_header);
+      p2p->entryCount = page.entryCount;
+      auto *ep = reinterpret_cast<uint32_t *>(&p2p[1]);
+      for (size_t i = 0; i < page.entryCount; i++) {
+        const CompactUnwindEntry64 *cuep = cuPtrVector[page.entryIndex + i];
+        *ep++ = cuep->functionAddress;
+        *ep++ = cuep->encoding;
+      }
      }
-    p2p =
-        reinterpret_cast<unwind_info_compressed_second_level_page_header *>(ep);
+    pp += SECOND_LEVEL_PAGE_WORDS;
    }
-  assert(getSize() ==
-         static_cast<size_t>((reinterpret_cast<uint8_t *>(p2p) - buf)));
  }
diff --git a/lld/MachO/UnwindInfoSection.h b/lld/MachO/UnwindInfoSection.h

index 61e639d..2285cf9 100644 (file)
--- a/lld/MachO/UnwindInfoSection.h
+++ b/lld/MachO/UnwindInfoSection.h
@@ -49,35 +49,30 @@ public:
      compactUnwindSection = cuSection;
    }
  
+  using EncodingMap = llvm::DenseMap<compact_unwind_encoding_t, size_t>;
+
+  struct SecondLevelPage {
+    uint32_t kind;
+    size_t entryIndex;
+    size_t entryCount;
+    size_t byteCount;
+    std::vector<compact_unwind_encoding_t> localEncodings;
+    EncodingMap localEncodingIndexes;
+  };
+
  private:
    std::vector<std::pair<compact_unwind_encoding_t, size_t>> commonEncodings;
+  EncodingMap commonEncodingIndexes;
    std::vector<uint32_t> personalities;
    std::vector<unwind_info_section_header_lsda_index_entry> lsdaEntries;
    std::vector<CompactUnwindEntry64> cuVector;
    std::vector<const CompactUnwindEntry64 *> cuPtrVector;
-  std::vector<std::vector<const CompactUnwindEntry64 *>::const_iterator>
-      pageBounds;
+  std::vector<SecondLevelPage> secondLevelPages;
    MergedOutputSection *compactUnwindSection = nullptr;
    uint64_t level2PagesOffset = 0;
    uint64_t unwindInfoSize = 0;
  };
  
-#define UNWIND_INFO_COMMON_ENCODINGS_MAX 127
-
-#define UNWIND_INFO_SECOND_LEVEL_PAGE_SIZE 4096
-#define UNWIND_INFO_REGULAR_SECOND_LEVEL_ENTRIES_MAX                           \
-  ((UNWIND_INFO_SECOND_LEVEL_PAGE_SIZE -                                       \
-    sizeof(unwind_info_regular_second_level_page_header)) /                    \
-   sizeof(unwind_info_regular_second_level_entry))
-#define UNWIND_INFO_COMPRESSED_SECOND_LEVEL_ENTRIES_MAX                        \
-  ((UNWIND_INFO_SECOND_LEVEL_PAGE_SIZE -                                       \
-    sizeof(unwind_info_compressed_second_level_page_header)) /                 \
-   sizeof(uint32_t))
-
-#define UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET_BITS 24
-#define UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET_MASK                          \
-  UNWIND_INFO_COMPRESSED_ENTRY_FUNC_OFFSET(~0)
-
  } // namespace macho
  } // namespace lld
  
diff --git a/lld/test/MachO/tools/generate-cfi-funcs.py b/lld/test/MachO/tools/generate-cfi-funcs.py

index d9c6ecd..a91eab3 100755 (executable)
--- a/lld/test/MachO/tools/generate-cfi-funcs.py
+++ b/lld/test/MachO/tools/generate-cfi-funcs.py
@@ -24,7 +24,7 @@ def print_function(name):
    have_lsda = (random.random() < lsda_odds)
    frame_size = random.randint(4, 64) * 16
    frame_offset = -random.randint(0, (frame_size/16 - 4)) * 16
-  reg_count = random.randint(0, 4)
+  reg_count = random.randint(0, 5)
    reg_combo = random.randint(0, factorial(reg_count) - 1)
    regs_saved = saved_regs_combined[reg_count][reg_combo]
    global func_size_low, func_size_high
diff --git a/lld/test/MachO/tools/validate-unwind-info.py b/lld/test/MachO/tools/validate-unwind-info.py

index ee4cca4..f3619b4 100755 (executable)
--- a/lld/test/MachO/tools/validate-unwind-info.py
+++ b/lld/test/MachO/tools/validate-unwind-info.py
@@ -45,7 +45,7 @@ def main():
      sys.exit("no program symbols found in input")
  
    program_common_encodings = (
-    re.findall(r"^\s+encoding\[\d+\]: 0x(%s+)$" % hex,
+    re.findall(r"^\s+encoding\[(?:\d|\d\d|1[01]\d|12[0-6])\]: 0x(%s+)$" % hex,
                 objdump_string, re.MULTILINE))
    if not program_common_encodings:
      sys.exit("no common encodings found in input")
@@ -53,7 +53,7 @@ def main():
    program_encodings_map = {program_symbols_map[address]:encoding
      for address, encoding in
      re.findall(r"^\s+\[\d+\]: function offset=0x(%s+), " % hex +
-               r"encoding\[\d+\]=0x(%s+)$" % hex,
+               r"encoding(?:\[\d+\])?=0x(%s+)$" % hex,
                 objdump_string, re.MULTILINE)}
    if not object_encodings_map:
      sys.exit("no program encodings found in input")
@@ -73,8 +73,8 @@ def main():
  
    if program_encodings_map != object_encodings_map:
      if args.debug:
-      pprint("program encodings map:\n" + program_encodings_map)
-      pprint("object encodings map:\n" + object_encodings_map)
+      pprint("program encodings map:\n" + str(program_encodings_map))
+      pprint("object encodings map:\n" + str(object_encodings_map))
      sys.exit("encoding maps differ")
  
    # Count frequency of object-file folded encodings
@@ -86,11 +86,12 @@ def main():
                            sorted(encoding_frequency_map,
                                   key=lambda x: (encoding_frequency_map.get(x), x),
                                   reverse=True)]
+  del encoding_frequencies[127:]
  
    if program_common_encodings != encoding_frequencies:
      if args.debug:
-      pprint("program common encodings:\n" + program_common_encodings)
-      pprint("object encoding frequencies:\n" + encoding_frequencies)
+      pprint("program common encodings:\n" + str(program_common_encodings))
+      pprint("object encoding frequencies:\n" + str(encoding_frequencies))
      sys.exit("encoding frequencies differ")
author	Greg McGary <gkm@fb.com>
	Mon, 7 Dec 2020 06:33:38 +0000 (22:33 -0800)
committer	Greg McGary <gkm@fb.com>
	Sat, 19 Dec 2020 22:54:37 +0000 (14:54 -0800)
lld/MachO/UnwindInfoSection.cpp		patch \| blob \| history
lld/MachO/UnwindInfoSection.h		patch \| blob \| history
lld/test/MachO/tools/generate-cfi-funcs.py		patch \| blob \| history
lld/test/MachO/tools/validate-unwind-info.py		patch \| blob \| history