From f141d1b99b120b6dd9c656837f8721d1846a5fcc Mon Sep 17 00:00:00 2001 From: Sasha Goldshtein Date: Fri, 3 Mar 2017 15:13:31 -0500 Subject: [PATCH] cc: Symbol resolution with multiple executable regions per module The symbol resolution code used to assume for most purposes that there is a single executable region per module. When there were several, there was no crash, but symbols were not resolved correctly. The reason is that the symbol offsets are relative to the first executable region's start address, but bcc would resolve them relative to the region in which they appeared. For example, consider the following regions and spans for a module libfoo.so loaded into some process: 1000-2000 r-xp libfoo.so 2000-3000 rw-p libfoo.so 3000-4000 r-xp libfoo.so 4000-5000 r--p libfoo.so Now, suppose there is a symbol bar() loaded at address 3500. In the binary on disk, bar() is at offset 2500 from the beginning of the module (not from the beginning of the 3000-4000 region!). When we look at the candidate regions, we find 3000-4000, and discover that 3500 lies within it. Then we compute 3500-3000 to find the offset from the beginning of the region, get 500, and look for a symbol that contains the relative address 500. As a result, we might find some random symbol in the region 1000-2000, and report that address 3500 corresponds to that random symbol rather than to bar(). This commit fixes the situation by keeping only a single `Module` instance for each module, even if that module spans multiple executable regions. We remember the start and end addresses of every executable region so we can determine whether an address (like 3500 in the above example) lies within the module. But for the purpose of finding the actual symbol, we need only the offset from the start of the _first_ executable region, and then look up the symbol based on that offset.
This was discovered and fixed while tracing .NET Core processes on Linux, where libcoreclr.so (the main CLR binary) has several executable regions. Resolving symbols from any but the first region would produce totally bogus results. --- src/cc/bcc_syms.cc | 27 +++++++++++++++++++++------ src/cc/syms.h | 13 ++++++++++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc index cfc34d8..65ec960 100644 --- a/src/cc/bcc_syms.cc +++ b/src/cc/bcc_syms.cc @@ -101,7 +101,11 @@ void ProcSyms::refresh() { int ProcSyms::_add_module(const char *modname, uint64_t start, uint64_t end, void *payload) { ProcSyms *ps = static_cast(payload); - ps->modules_.emplace_back(modname, start, end); + auto it = std::find_if(ps->modules_.begin(), ps->modules_.end(), + [=](const ProcSyms::Module &m) { return m.name_ == modname; }); + if (it == ps->modules_.end()) + it = ps->modules_.insert(ps->modules_.end(), modname); + it->ranges_.push_back(ProcSyms::Module::Range(start, end)); return 0; } @@ -116,7 +120,7 @@ bool ProcSyms::resolve_addr(uint64_t addr, struct bcc_symbol *sym) { const char *original_module = nullptr; for (Module &mod : modules_) { - if (addr >= mod.start_ && addr < mod.end_) { + if (mod.contains(addr)) { bool res = mod.find_addr(addr, sym); if (sym->name) { sym->demangle_name = abi::__cxa_demangle(sym->name, nullptr, nullptr, nullptr); @@ -155,8 +159,8 @@ bool ProcSyms::resolve_name(const char *module, const char *name, return false; } -ProcSyms::Module::Module(const char *name, uint64_t start, uint64_t end) - : name_(name), start_(start), end_(end) { +ProcSyms::Module::Module(const char *name) + : name_(name) { is_so_ = bcc_elf_is_shared_obj(name) == 1; } @@ -184,12 +188,20 @@ void ProcSyms::Module::load_sym_table() { std::sort(syms_.begin(), syms_.end()); } +bool ProcSyms::Module::contains(uint64_t addr) const { + for (const auto &range : ranges_) { + if (addr >= range.start && addr < range.end) + return true; + } + return 
false; +} + bool ProcSyms::Module::find_name(const char *symname, uint64_t *addr) { load_sym_table(); for (Symbol &s : syms_) { if (*(s.name) == symname) { - *addr = is_so() ? start_ + s.start : s.start; + *addr = is_so() ? start() + s.start : s.start; return true; } } @@ -197,7 +209,7 @@ bool ProcSyms::Module::find_name(const char *symname, uint64_t *addr) { } bool ProcSyms::Module::find_addr(uint64_t addr, struct bcc_symbol *sym) { - uint64_t offset = is_so() ? (addr - start_) : addr; + uint64_t offset = is_so() ? (addr - start()) : addr; load_sym_table(); @@ -230,6 +242,9 @@ bool ProcSyms::Module::find_addr(uint64_t addr, struct bcc_symbol *sym) { sym->offset = (offset - it->start); return true; } + // But don't step beyond begin()! + if (it == syms_.begin()) + break; } return false; diff --git a/src/cc/syms.h b/src/cc/syms.h index 6fbf843..60dedd5 100644 --- a/src/cc/syms.h +++ b/src/cc/syms.h @@ -79,15 +79,22 @@ class ProcSyms : SymbolCache { }; struct Module { - Module(const char *name, uint64_t start, uint64_t end); + struct Range { + uint64_t start; + uint64_t end; + Range(uint64_t s, uint64_t e) : start(s), end(e) {} + }; + + Module(const char *name); std::string name_; - uint64_t start_; - uint64_t end_; + std::vector ranges_; bool is_so_; std::unordered_set symnames_; std::vector syms_; void load_sym_table(); + bool contains(uint64_t addr) const; + uint64_t start() const { return ranges_.begin()->start; } bool find_addr(uint64_t addr, struct bcc_symbol *sym); bool find_name(const char *symname, uint64_t *addr); bool is_so() const { return is_so_; } -- 2.7.4