From e25ae03eaa966a35c55f185a9b34c1a01115b5f2 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Fri, 25 Mar 2016 17:14:34 +0100 Subject: [PATCH] memleak: Resolve stacks with a StackTrace table Instead of manually walking the (kernel) stack inside the eBPF code, create a `BPF_STACK_TRACE` table and store the stack IDs in the alloc info struct. This greatly simplifies the leak detection code: instead of storing the full stack trace as a string to de-duplicate the allocation point for the different allocations, we can store the `stack_id`. Since we're creating stack IDs with `BPF_F_REUSE_STACKID`, the kernel will take care of deduplication for us. Additionally, the `StackDecoder` class has been specialized into a `UStackDecoder` and `KStackDecoder` (for userland and kernel stacks, respectively). This lets us pass the decode class directly to `stack_traces.walk()` to automate symbol resolution. A new class, `Allocation`, has been created to encapsulate what previously was a 2-element tuple of `(count, size)`. This makes the aggregation logic in `print_outstanding` easier to follow. --- man/man8/memleak.8 | 14 +-- tools/memleak.py | 108 ++++++++--------- tools/old/memleak.py | 321 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 369 insertions(+), 74 deletions(-) create mode 100755 tools/old/memleak.py diff --git a/man/man8/memleak.8 b/man/man8/memleak.8 index 4abeea8..6ff092d 100644 --- a/man/man8/memleak.8 +++ b/man/man8/memleak.8 @@ -3,7 +3,7 @@ memleak \- Print a summary of outstanding allocations and their call stacks to detect memory leaks. Uses Linux eBPF/bcc. .SH SYNOPSIS .B memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND] [-s SAMPLE_RATE] -[-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE] [-Z MAX_SIZE] [INTERVAL] [COUNT] +[-T TOP] [-z MIN_SIZE] [-Z MAX_SIZE] [INTERVAL] [COUNT] .SH DESCRIPTION memleak traces and matches memory allocation and deallocation requests, and collects call stacks for each allocation. 
memleak can then print a summary @@ -15,10 +15,7 @@ When tracing all processes, memleak instruments kmalloc and kfree. memleak may introduce significant overhead when tracing processes that allocate and free many blocks very quickly. See the OVERHEAD section below. -The stack depth is limited to 10 by default (+1 for the current instruction pointer), -but it can be controlled using the \-d switch if deeper stacks are required. - -This currently only works on x86_64. Check for future versions. +This tool only works on Linux 4.6+. Stack traces are obtained using the new `BPF_STACK_TRACE` APIs. .SH REQUIREMENTS CONFIG_BPF and bcc. .SH OPTIONS @@ -45,10 +42,6 @@ Run the specified command and trace its allocations only. This traces malloc and \-s SAMPLE_RATE Record roughly every SAMPLE_RATE-th allocation to reduce overhead. .TP -\-d STACK_DEPTH -Capture STACK_DEPTH frames (or less) when obtaining allocation call stacks. -The default value is 10. -.TP \-t TOP Print only the top TOP stacks (sorted by size). The default value is 10. @@ -106,9 +99,6 @@ placed in a typical period of 10 seconds: # .B perf stat -a -e 'probe:__kmalloc' -- sleep 10 -Another setting that may help reduce overhead is lowering the number of stack -frames captured and parsed by memleak for each allocation, using the \-d switch. - .SH SOURCE This is from bcc. 
.IP diff --git a/tools/memleak.py b/tools/memleak.py index 5662815..bc59b40 100755 --- a/tools/memleak.py +++ b/tools/memleak.py @@ -44,29 +44,33 @@ class Time(object): raise OSError(errno_, os.strerror(errno_)) return t.tv_sec * 1e9 + t.tv_nsec -class StackDecoder(object): +class KStackDecoder(object): + def refresh(self): + pass + + def __call__(self, addr): + return "%s [kernel] (%x)" % (BPF.ksym(addr), addr) + +class UStackDecoder(object): def __init__(self, pid): self.pid = pid - if pid != -1: - self.proc_sym = ProcessSymbols(pid) + self.proc_sym = ProcessSymbols(pid) def refresh(self): - if self.pid != -1: - self.proc_sym.refresh_code_ranges() - - def decode_stack(self, info, is_kernel_trace): - stack = "" - if info.num_frames <= 0: - return "???" - for i in range(0, info.num_frames): - addr = info.callstack[i] - if is_kernel_trace: - stack += " %s [kernel] (%x) ;" % \ - (BPF.ksym(addr), addr) - else: - stack += " %s (%x) ;" % \ - (self.proc_sym.decode_addr(addr), addr) - return stack + self.proc_sym.refresh_code_ranges() + + def __call__(self, addr): + return "%s (%x)" % (self.proc_sym.decode_addr(addr), addr) + +class Allocation(object): + def __init__(self, stack, size): + self.stack = stack + self.count = 1 + self.size = size + + def update(self, size): + self.count += 1 + self.size += size def run_command_get_output(command): p = subprocess.Popen(command.split(), @@ -125,8 +129,6 @@ parser.add_argument("-c", "--command", help="execute and trace the specified command") parser.add_argument("-s", "--sample-rate", default=1, type=int, help="sample every N-th allocation to decrease the overhead") -parser.add_argument("-d", "--stack-depth", default=10, type=int, - help="maximum stack depth to capture") parser.add_argument("-T", "--top", type=int, default=10, help="display only this many top allocating stacks (by size)") parser.add_argument("-z", "--min-size", type=int, @@ -144,7 +146,6 @@ interval = args.interval min_age_ns = 1e6 * args.older sample_every_n = 
args.sample_rate num_prints = args.count -max_stack_size = args.stack_depth + 2 top_stacks = args.top min_size = args.min_size max_size = args.max_size @@ -163,33 +164,12 @@ bpf_source = """ struct alloc_info_t { u64 size; u64 timestamp_ns; - int num_frames; - u64 callstack[MAX_STACK_SIZE]; + int stack_id; }; BPF_HASH(sizes, u64); BPF_HASH(allocs, u64, struct alloc_info_t); - -// Adapted from https://github.com/iovisor/bcc/tools/offcputime.py -static u64 get_frame(u64 *bp) { - if (*bp) { - // The following stack walker is x86_64 specific - u64 ret = 0; - if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8))) - return 0; - if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp)) - *bp = 0; - return ret; - } - return 0; -} -static int grab_stack(struct pt_regs *ctx, struct alloc_info_t *info) -{ - int depth = 0; - u64 bp = ctx->bp; - GRAB_ONE_FRAME - return depth; -} +BPF_STACK_TRACE(stack_traces, 1024) int alloc_enter(struct pt_regs *ctx, size_t size) { @@ -223,12 +203,12 @@ int alloc_exit(struct pt_regs *ctx) sizes.delete(&pid); info.timestamp_ns = bpf_ktime_get_ns(); - info.num_frames = grab_stack(ctx, &info) - 2; + info.stack_id = stack_traces.get_stackid(ctx, STACK_FLAGS); allocs.update(&address, &info); if (SHOULD_PRINT) { - bpf_trace_printk("alloc exited, size = %lu, result = %lx, frames = %d\\n", - info.size, address, info.num_frames); + bpf_trace_printk("alloc exited, size = %lu, result = %lx\\n", + info.size, address); } return 0; } @@ -251,9 +231,6 @@ int free_enter(struct pt_regs *ctx, void *address) """ bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0") bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n)) -bpf_source = bpf_source.replace("GRAB_ONE_FRAME", max_stack_size * - "\tif (!(info->callstack[depth++] = get_frame(&bp))) return depth;\n") -bpf_source = bpf_source.replace("MAX_STACK_SIZE", str(max_stack_size)) size_filter = "" if min_size is not None and max_size is not None: @@ -265,6 +242,11 @@ elif max_size is 
not None: size_filter = "if (size > %d) return 0;" % max_size bpf_source = bpf_source.replace("SIZE_FILTER", size_filter) +stack_flags = "BPF_F_REUSE_STACKID" +if not kernel_trace: + stack_flags += "|BPF_F_USER_STACK" +bpf_source = bpf_source.replace("STACK_FLAGS", stack_flags) + bpf_program = BPF(text=bpf_source) if not kernel_trace: @@ -281,29 +263,31 @@ else: bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit") bpf_program.attach_kprobe(event="kfree", fn_name="free_enter") -decoder = StackDecoder(pid) +decoder = KStackDecoder() if kernel_trace else UStackDecoder(pid) def print_outstanding(): - stacks = {} print("[%s] Top %d stacks with outstanding allocations:" % (datetime.now().strftime("%H:%M:%S"), top_stacks)) - allocs = bpf_program.get_table("allocs") + alloc_info = {} + allocs = bpf_program["allocs"] + stack_traces = bpf_program["stack_traces"] for address, info in sorted(allocs.items(), key=lambda a: a[1].size): if Time.monotonic_time() - min_age_ns < info.timestamp_ns: continue - stack = decoder.decode_stack(info, kernel_trace) - if stack in stacks: - stacks[stack] = (stacks[stack][0] + 1, - stacks[stack][1] + info.size) + if info.stack_id < 0: + continue + if info.stack_id in alloc_info: + alloc_info[info.stack_id].update(info.size) else: - stacks[stack] = (1, info.size) + stack = list(stack_traces.walk(info.stack_id, decoder)) + alloc_info[info.stack_id] = Allocation(stack, info.size) if args.show_allocs: print("\taddr = %x size = %s" % (address.value, info.size)) - to_show = sorted(stacks.items(), key=lambda s: s[1][1])[-top_stacks:] - for stack, (count, size) in to_show: + to_show = sorted(alloc_info.values(), key=lambda a: a.size)[-top_stacks:] + for alloc in to_show: print("\t%d bytes in %d allocations from stack\n\t\t%s" % - (size, count, stack.replace(";", "\n\t\t"))) + (alloc.size, alloc.count, "\n\t\t".join(alloc.stack))) count_so_far = 0 while True: diff --git a/tools/old/memleak.py b/tools/old/memleak.py new file mode 100755 
index 0000000..5662815 --- /dev/null +++ b/tools/old/memleak.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python +# +# memleak Trace and display outstanding allocations to detect +# memory leaks in user-mode processes and the kernel. +# +# USAGE: memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND] +# [-s SAMPLE_RATE] [-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE] +# [-Z MAX_SIZE] +# [interval] [count] +# +# Licensed under the Apache License, Version 2.0 (the "License") +# Copyright (C) 2016 Sasha Goldshtein. + +from bcc import BPF, ProcessSymbols +from time import sleep +from datetime import datetime +import argparse +import subprocess +import ctypes +import os + +class Time(object): + # BPF timestamps come from the monotonic clock. To be able to filter + # and compare them from Python, we need to invoke clock_gettime. + # Adapted from http://stackoverflow.com/a/1205762 + CLOCK_MONOTONIC_RAW = 4 # see + + class timespec(ctypes.Structure): + _fields_ = [ + ('tv_sec', ctypes.c_long), + ('tv_nsec', ctypes.c_long) + ] + + librt = ctypes.CDLL('librt.so.1', use_errno=True) + clock_gettime = librt.clock_gettime + clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)] + + @staticmethod + def monotonic_time(): + t = Time.timespec() + if Time.clock_gettime( + Time.CLOCK_MONOTONIC_RAW, ctypes.pointer(t)) != 0: + errno_ = ctypes.get_errno() + raise OSError(errno_, os.strerror(errno_)) + return t.tv_sec * 1e9 + t.tv_nsec + +class StackDecoder(object): + def __init__(self, pid): + self.pid = pid + if pid != -1: + self.proc_sym = ProcessSymbols(pid) + + def refresh(self): + if self.pid != -1: + self.proc_sym.refresh_code_ranges() + + def decode_stack(self, info, is_kernel_trace): + stack = "" + if info.num_frames <= 0: + return "???" 
+ for i in range(0, info.num_frames): + addr = info.callstack[i] + if is_kernel_trace: + stack += " %s [kernel] (%x) ;" % \ + (BPF.ksym(addr), addr) + else: + stack += " %s (%x) ;" % \ + (self.proc_sym.decode_addr(addr), addr) + return stack + +def run_command_get_output(command): + p = subprocess.Popen(command.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return iter(p.stdout.readline, b'') + +def run_command_get_pid(command): + p = subprocess.Popen(command.split()) + return p.pid + +examples = """ +EXAMPLES: + +./memleak -p $(pidof allocs) + Trace allocations and display a summary of "leaked" (outstanding) + allocations every 5 seconds +./memleak -p $(pidof allocs) -t + Trace allocations and display each individual call to malloc/free +./memleak -ap $(pidof allocs) 10 + Trace allocations and display allocated addresses, sizes, and stacks + every 10 seconds for outstanding allocations +./memleak -c "./allocs" + Run the specified command and trace its allocations +./memleak + Trace allocations in kernel mode and display a summary of outstanding + allocations every 5 seconds +./memleak -o 60000 + Trace allocations in kernel mode and display a summary of outstanding + allocations that are at least one minute (60 seconds) old +./memleak -s 5 + Trace roughly every 5th allocation, to reduce overhead +""" + +description = """ +Trace outstanding memory allocations that weren't freed. +Supports both user-mode allocations made with malloc/free and kernel-mode +allocations made with kmalloc/kfree. 
+""" + +parser = argparse.ArgumentParser(description=description, + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-p", "--pid", type=int, default=-1, + help="the PID to trace; if not specified, trace kernel allocs") +parser.add_argument("-t", "--trace", action="store_true", + help="print trace messages for each alloc/free call") +parser.add_argument("interval", nargs="?", default=5, type=int, + help="interval in seconds to print outstanding allocations") +parser.add_argument("count", nargs="?", type=int, + help="number of times to print the report before exiting") +parser.add_argument("-a", "--show-allocs", default=False, action="store_true", + help="show allocation addresses and sizes as well as call stacks") +parser.add_argument("-o", "--older", default=500, type=int, + help="prune allocations younger than this age in milliseconds") +parser.add_argument("-c", "--command", + help="execute and trace the specified command") +parser.add_argument("-s", "--sample-rate", default=1, type=int, + help="sample every N-th allocation to decrease the overhead") +parser.add_argument("-d", "--stack-depth", default=10, type=int, + help="maximum stack depth to capture") +parser.add_argument("-T", "--top", type=int, default=10, + help="display only this many top allocating stacks (by size)") +parser.add_argument("-z", "--min-size", type=int, + help="capture only allocations larger than this size") +parser.add_argument("-Z", "--max-size", type=int, + help="capture only allocations smaller than this size") + +args = parser.parse_args() + +pid = args.pid +command = args.command +kernel_trace = (pid == -1 and command is None) +trace_all = args.trace +interval = args.interval +min_age_ns = 1e6 * args.older +sample_every_n = args.sample_rate +num_prints = args.count +max_stack_size = args.stack_depth + 2 +top_stacks = args.top +min_size = args.min_size +max_size = args.max_size + +if min_size is not None and max_size is not None and 
min_size > max_size: + print("min_size (-z) can't be greater than max_size (-Z)") + exit(1) + +if command is not None: + print("Executing '%s' and tracing the resulting process." % command) + pid = run_command_get_pid(command) + +bpf_source = """ +#include + +struct alloc_info_t { + u64 size; + u64 timestamp_ns; + int num_frames; + u64 callstack[MAX_STACK_SIZE]; +}; + +BPF_HASH(sizes, u64); +BPF_HASH(allocs, u64, struct alloc_info_t); + +// Adapted from https://github.com/iovisor/bcc/tools/offcputime.py +static u64 get_frame(u64 *bp) { + if (*bp) { + // The following stack walker is x86_64 specific + u64 ret = 0; + if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8))) + return 0; + if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp)) + *bp = 0; + return ret; + } + return 0; +} +static int grab_stack(struct pt_regs *ctx, struct alloc_info_t *info) +{ + int depth = 0; + u64 bp = ctx->bp; + GRAB_ONE_FRAME + return depth; +} + +int alloc_enter(struct pt_regs *ctx, size_t size) +{ + SIZE_FILTER + if (SAMPLE_EVERY_N > 1) { + u64 ts = bpf_ktime_get_ns(); + if (ts % SAMPLE_EVERY_N != 0) + return 0; + } + + u64 pid = bpf_get_current_pid_tgid(); + u64 size64 = size; + sizes.update(&pid, &size64); + + if (SHOULD_PRINT) + bpf_trace_printk("alloc entered, size = %u\\n", size); + return 0; +} + +int alloc_exit(struct pt_regs *ctx) +{ + u64 address = ctx->ax; + u64 pid = bpf_get_current_pid_tgid(); + u64* size64 = sizes.lookup(&pid); + struct alloc_info_t info = {0}; + + if (size64 == 0) + return 0; // missed alloc entry + + info.size = *size64; + sizes.delete(&pid); + + info.timestamp_ns = bpf_ktime_get_ns(); + info.num_frames = grab_stack(ctx, &info) - 2; + allocs.update(&address, &info); + + if (SHOULD_PRINT) { + bpf_trace_printk("alloc exited, size = %lu, result = %lx, frames = %d\\n", + info.size, address, info.num_frames); + } + return 0; +} + +int free_enter(struct pt_regs *ctx, void *address) +{ + u64 addr = (u64)address; + struct alloc_info_t *info = allocs.lookup(&addr); 
+ if (info == 0) + return 0; + + allocs.delete(&addr); + + if (SHOULD_PRINT) { + bpf_trace_printk("free entered, address = %lx, size = %lu\\n", + address, info->size); + } + return 0; +} +""" +bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0") +bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n)) +bpf_source = bpf_source.replace("GRAB_ONE_FRAME", max_stack_size * + "\tif (!(info->callstack[depth++] = get_frame(&bp))) return depth;\n") +bpf_source = bpf_source.replace("MAX_STACK_SIZE", str(max_stack_size)) + +size_filter = "" +if min_size is not None and max_size is not None: + size_filter = "if (size < %d || size > %d) return 0;" % \ + (min_size, max_size) +elif min_size is not None: + size_filter = "if (size < %d) return 0;" % min_size +elif max_size is not None: + size_filter = "if (size > %d) return 0;" % max_size +bpf_source = bpf_source.replace("SIZE_FILTER", size_filter) + +bpf_program = BPF(text=bpf_source) + +if not kernel_trace: + print("Attaching to malloc and free in pid %d, Ctrl+C to quit." 
% pid) + bpf_program.attach_uprobe(name="c", sym="malloc", + fn_name="alloc_enter", pid=pid) + bpf_program.attach_uretprobe(name="c", sym="malloc", + fn_name="alloc_exit", pid=pid) + bpf_program.attach_uprobe(name="c", sym="free", + fn_name="free_enter", pid=pid) +else: + print("Attaching to kmalloc and kfree, Ctrl+C to quit.") + bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter") + bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit") + bpf_program.attach_kprobe(event="kfree", fn_name="free_enter") + +decoder = StackDecoder(pid) + +def print_outstanding(): + stacks = {} + print("[%s] Top %d stacks with outstanding allocations:" % + (datetime.now().strftime("%H:%M:%S"), top_stacks)) + allocs = bpf_program.get_table("allocs") + for address, info in sorted(allocs.items(), key=lambda a: a[1].size): + if Time.monotonic_time() - min_age_ns < info.timestamp_ns: + continue + stack = decoder.decode_stack(info, kernel_trace) + if stack in stacks: + stacks[stack] = (stacks[stack][0] + 1, + stacks[stack][1] + info.size) + else: + stacks[stack] = (1, info.size) + if args.show_allocs: + print("\taddr = %x size = %s" % + (address.value, info.size)) + to_show = sorted(stacks.items(), key=lambda s: s[1][1])[-top_stacks:] + for stack, (count, size) in to_show: + print("\t%d bytes in %d allocations from stack\n\t\t%s" % + (size, count, stack.replace(";", "\n\t\t"))) + +count_so_far = 0 +while True: + if trace_all: + print(bpf_program.trace_fields()) + else: + try: + sleep(interval) + except KeyboardInterrupt: + exit() + decoder.refresh() + print_outstanding() + count_so_far += 1 + if num_prints is not None and count_so_far >= num_prints: + exit() -- 2.7.4