kernel/profile.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  linux/kernel/profile.c
   4  *  Simple profiling. Manages a direct-mapped profile hit count buffer,
   5  *  with configurable resolution, support for restricting the cpus on
   6  *  which profiling is done, and switching between cpu time and
   7  *  schedule() calls via kernel command line parameters passed at boot.
   8  *
   9  *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
  10  *      Red Hat, July 2004
  11  *  Consolidation of architecture support code for profiling,
  12  *      Nadia Yvette Chambers, Oracle, July 2004
  13  *  Amortized hit count accounting via per-cpu open-addressed hashtables
  14  *      to resolve timer interrupt livelocks, Nadia Yvette Chambers,
  15  *      Oracle, 2004
  16  */
  17
  18 #include <linux/export.h>
  19 #include <linux/profile.h>
  20 #include <linux/memblock.h>
  21 #include <linux/notifier.h>
  22 #include <linux/mm.h>
  23 #include <linux/cpumask.h>
  24 #include <linux/cpu.h>
  25 #include <linux/highmem.h>
  26 #include <linux/mutex.h>
  27 #include <linux/slab.h>
  28 #include <linux/vmalloc.h>
  29 #include <linux/sched/stat.h>
  30
  31 #include <asm/sections.h>
  32 #include <asm/irq_regs.h>
  33 #include <asm/ptrace.h>
  34
  35 struct profile_hit {
  36         u32 pc, hits;
  37 };
  38 #define PROFILE_GRPSHIFT        3
  39 #define PROFILE_GRPSZ           (1 << PROFILE_GRPSHIFT)
  40 #define NR_PROFILE_HIT          (PAGE_SIZE/sizeof(struct profile_hit))
  41 #define NR_PROFILE_GRP          (NR_PROFILE_HIT/PROFILE_GRPSZ)
  42
  43 static atomic_t *prof_buffer;
  44 static unsigned long prof_len;
  45 static unsigned short int prof_shift;
  46
  47 int prof_on __read_mostly;
  48 EXPORT_SYMBOL_GPL(prof_on);
  49
  50 static cpumask_var_t prof_cpu_mask;
  51 #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
  52 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
  53 static DEFINE_PER_CPU(int, cpu_profile_flip);
  54 static DEFINE_MUTEX(profile_flip_mutex);
  55 #endif /* CONFIG_SMP */
  56
  57 int profile_setup(char *str)
  58 {
  59         static const char schedstr[] = "schedule";
  60         static const char sleepstr[] = "sleep";
  61         static const char kvmstr[] = "kvm";
  62         int par;
  63
  64         if (!strncmp(str, sleepstr, strlen(sleepstr))) {
  65 #ifdef CONFIG_SCHEDSTATS
  66                 force_schedstat_enabled();
  67                 prof_on = SLEEP_PROFILING;
  68                 if (str[strlen(sleepstr)] == ',')
  69                         str += strlen(sleepstr) + 1;
  70                 if (get_option(&str, &par))
  71                         prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
  72                 pr_info("kernel sleep profiling enabled (shift: %u)\n",
  73                         prof_shift);
  74 #else
  75                 pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
  76 #endif /* CONFIG_SCHEDSTATS */
  77         } else if (!strncmp(str, schedstr, strlen(schedstr))) {
  78                 prof_on = SCHED_PROFILING;
  79                 if (str[strlen(schedstr)] == ',')
  80                         str += strlen(schedstr) + 1;
  81                 if (get_option(&str, &par))
  82                         prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
  83                 pr_info("kernel schedule profiling enabled (shift: %u)\n",
  84                         prof_shift);
  85         } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
  86                 prof_on = KVM_PROFILING;
  87                 if (str[strlen(kvmstr)] == ',')
  88                         str += strlen(kvmstr) + 1;
  89                 if (get_option(&str, &par))
  90                         prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
  91                 pr_info("kernel KVM profiling enabled (shift: %u)\n",
  92                         prof_shift);
  93         } else if (get_option(&str, &par)) {
  94                 prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
  95                 prof_on = CPU_PROFILING;
  96                 pr_info("kernel profiling enabled (shift: %u)\n",
  97                         prof_shift);
  98         }
  99         return 1;
 100 }
 101 __setup("profile=", profile_setup);
 102
 103
 104 int __ref profile_init(void)
 105 {
 106         int buffer_bytes;
 107         if (!prof_on)
 108                 return 0;
 109
 110         /* only text is profiled */
 111         prof_len = (_etext - _stext) >> prof_shift;
 112
 113         if (!prof_len) {
 114                 pr_warn("profiling shift: %u too large\n", prof_shift);
 115                 prof_on = 0;
 116                 return -EINVAL;
 117         }
 118
 119         buffer_bytes = prof_len*sizeof(atomic_t);
 120
 121         if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
 122                 return -ENOMEM;
 123
 124         cpumask_copy(prof_cpu_mask, cpu_possible_mask);
 125
 126         prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
 127         if (prof_buffer)
 128                 return 0;
 129
 130         prof_buffer = alloc_pages_exact(buffer_bytes,
 131                                         GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
 132         if (prof_buffer)
 133                 return 0;
 134
 135         prof_buffer = vzalloc(buffer_bytes);
 136         if (prof_buffer)
 137                 return 0;
 138
 139         free_cpumask_var(prof_cpu_mask);
 140         return -ENOMEM;
 141 }
 142
 143 #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
 144 /*
 145  * Each cpu has a pair of open-addressed hashtables for pending
 146  * profile hits. read_profile() IPI's all cpus to request them
 147  * to flip buffers and flushes their contents to prof_buffer itself.
 148  * Flip requests are serialized by the profile_flip_mutex. The sole
 149  * use of having a second hashtable is for avoiding cacheline
 150  * contention that would otherwise happen during flushes of pending
 151  * profile hits required for the accuracy of reported profile hits
 152  * and so resurrect the interrupt livelock issue.
 153  *
 154  * The open-addressed hashtables are indexed by profile buffer slot
 155  * and hold the number of pending hits to that profile buffer slot on
 156  * a cpu in an entry. When the hashtable overflows, all pending hits
 157  * are accounted to their corresponding profile buffer slots with
 158  * atomic_add() and the hashtable emptied. As numerous pending hits
 159  * may be accounted to a profile buffer slot in a hashtable entry,
 160  * this amortizes a number of atomic profile buffer increments likely
 161  * to be far larger than the number of entries in the hashtable,
 162  * particularly given that the number of distinct profile buffer
 163  * positions to which hits are accounted during short intervals (e.g.
 164  * several seconds) is usually very small. Exclusion from buffer
 165  * flipping is provided by interrupt disablement (note that for
 166  * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 167  * process context).
 168  * The hash function is meant to be lightweight as opposed to strong,
 169  * and was vaguely inspired by ppc64 firmware-supported inverted
 170  * pagetable hash functions, but uses a full hashtable full of finite
 171  * collision chains, not just pairs of them.
 172  *
 173  * -- nyc
 174  */
 175 static void __profile_flip_buffers(void *unused)
 176 {
 177         int cpu = smp_processor_id();
 178
 179         per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
 180 }
 181
 182 static void profile_flip_buffers(void)
 183 {
 184         int i, j, cpu;
 185
 186         mutex_lock(&profile_flip_mutex);
 187         j = per_cpu(cpu_profile_flip, get_cpu());
 188         put_cpu();
 189         on_each_cpu(__profile_flip_buffers, NULL, 1);
 190         for_each_online_cpu(cpu) {
 191                 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
 192                 for (i = 0; i < NR_PROFILE_HIT; ++i) {
 193                         if (!hits[i].hits) {
 194                                 if (hits[i].pc)
 195                                         hits[i].pc = 0;
 196                                 continue;
 197                         }
 198                         atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 199                         hits[i].hits = hits[i].pc = 0;
 200                 }
 201         }
 202         mutex_unlock(&profile_flip_mutex);
 203 }
 204
 205 static void profile_discard_flip_buffers(void)
 206 {
 207         int i, cpu;
 208
 209         mutex_lock(&profile_flip_mutex);
 210         i = per_cpu(cpu_profile_flip, get_cpu());
 211         put_cpu();
 212         on_each_cpu(__profile_flip_buffers, NULL, 1);
 213         for_each_online_cpu(cpu) {
 214                 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
 215                 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
 216         }
 217         mutex_unlock(&profile_flip_mutex);
 218 }
 219
 220 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 221 {
 222         unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
 223         int i, j, cpu;
 224         struct profile_hit *hits;
 225
 226         pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
 227         i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
 228         secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
 229         cpu = get_cpu();
 230         hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
 231         if (!hits) {
 232                 put_cpu();
 233                 return;
 234         }
 235         /*
 236          * We buffer the global profiler buffer into a per-CPU
 237          * queue and thus reduce the number of global (and possibly
 238          * NUMA-alien) accesses. The write-queue is self-coalescing:
 239          */
 240         local_irq_save(flags);
 241         do {
 242                 for (j = 0; j < PROFILE_GRPSZ; ++j) {
 243                         if (hits[i + j].pc == pc) {
 244                                 hits[i + j].hits += nr_hits;
 245                                 goto out;
 246                         } else if (!hits[i + j].hits) {
 247                                 hits[i + j].pc = pc;
 248                                 hits[i + j].hits = nr_hits;
 249                                 goto out;
 250                         }
 251                 }
 252                 i = (i + secondary) & (NR_PROFILE_HIT - 1);
 253         } while (i != primary);
 254
 255         /*
 256          * Add the current hit(s) and flush the write-queue out
 257          * to the global buffer:
 258          */
 259         atomic_add(nr_hits, &prof_buffer[pc]);
 260         for (i = 0; i < NR_PROFILE_HIT; ++i) {
 261                 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 262                 hits[i].pc = hits[i].hits = 0;
 263         }
 264 out:
 265         local_irq_restore(flags);
 266         put_cpu();
 267 }
 268
 269 static int profile_dead_cpu(unsigned int cpu)
 270 {
 271         struct page *page;
 272         int i;
 273
 274         if (cpumask_available(prof_cpu_mask))
 275                 cpumask_clear_cpu(cpu, prof_cpu_mask);
 276
 277         for (i = 0; i < 2; i++) {
 278                 if (per_cpu(cpu_profile_hits, cpu)[i]) {
 279                         page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
 280                         per_cpu(cpu_profile_hits, cpu)[i] = NULL;
 281                         __free_page(page);
 282                 }
 283         }
 284         return 0;
 285 }
 286
 287 static int profile_prepare_cpu(unsigned int cpu)
 288 {
 289         int i, node = cpu_to_mem(cpu);
 290         struct page *page;
 291
 292         per_cpu(cpu_profile_flip, cpu) = 0;
 293
 294         for (i = 0; i < 2; i++) {
 295                 if (per_cpu(cpu_profile_hits, cpu)[i])
 296                         continue;
 297
 298                 page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 299                 if (!page) {
 300                         profile_dead_cpu(cpu);
 301                         return -ENOMEM;
 302                 }
 303                 per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
 304
 305         }
 306         return 0;
 307 }
 308
 309 static int profile_online_cpu(unsigned int cpu)
 310 {
 311         if (cpumask_available(prof_cpu_mask))
 312                 cpumask_set_cpu(cpu, prof_cpu_mask);
 313
 314         return 0;
 315 }
 316
 317 #else /* !CONFIG_SMP */
 318 #define profile_flip_buffers()          do { } while (0)
 319 #define profile_discard_flip_buffers()  do { } while (0)
 320
 321 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 322 {
 323         unsigned long pc;
 324         pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
 325         atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 326 }
 327 #endif /* !CONFIG_SMP */
 328
 329 void profile_hits(int type, void *__pc, unsigned int nr_hits)
 330 {
 331         if (prof_on != type || !prof_buffer)
 332                 return;
 333         do_profile_hits(type, __pc, nr_hits);
 334 }
 335 EXPORT_SYMBOL_GPL(profile_hits);
 336
 337 void profile_tick(int type)
 338 {
 339         struct pt_regs *regs = get_irq_regs();
 340
 341         if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
 342             cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
 343                 profile_hit(type, (void *)profile_pc(regs));
 344 }
 345
 346 #ifdef CONFIG_PROC_FS
 347 #include <linux/proc_fs.h>
 348 #include <linux/seq_file.h>
 349 #include <linux/uaccess.h>
 350
 351 static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
 352 {
 353         seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
 354         return 0;
 355 }
 356
 357 static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
 358 {
 359         return single_open(file, prof_cpu_mask_proc_show, NULL);
 360 }
 361
 362 static ssize_t prof_cpu_mask_proc_write(struct file *file,
 363         const char __user *buffer, size_t count, loff_t *pos)
 364 {
 365         cpumask_var_t new_value;
 366         int err;
 367
 368         if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
 369                 return -ENOMEM;
 370
 371         err = cpumask_parse_user(buffer, count, new_value);
 372         if (!err) {
 373                 cpumask_copy(prof_cpu_mask, new_value);
 374                 err = count;
 375         }
 376         free_cpumask_var(new_value);
 377         return err;
 378 }
 379
 380 static const struct proc_ops prof_cpu_mask_proc_ops = {
 381         .proc_open      = prof_cpu_mask_proc_open,
 382         .proc_read      = seq_read,
 383         .proc_lseek     = seq_lseek,
 384         .proc_release   = single_release,
 385         .proc_write     = prof_cpu_mask_proc_write,
 386 };
 387
 388 void create_prof_cpu_mask(void)
 389 {
 390         /* create /proc/irq/prof_cpu_mask */
 391         proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
 392 }
 393
 394 /*
 395  * This function accesses profiling information. The returned data is
 396  * binary: the sampling step and the actual contents of the profile
 397  * buffer. Use of the program readprofile is recommended in order to
 398  * get meaningful info out of these data.
 399  */
 400 static ssize_t
 401 read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 402 {
 403         unsigned long p = *ppos;
 404         ssize_t read;
 405         char *pnt;
 406         unsigned long sample_step = 1UL << prof_shift;
 407
 408         profile_flip_buffers();
 409         if (p >= (prof_len+1)*sizeof(unsigned int))
 410                 return 0;
 411         if (count > (prof_len+1)*sizeof(unsigned int) - p)
 412                 count = (prof_len+1)*sizeof(unsigned int) - p;
 413         read = 0;
 414
 415         while (p < sizeof(unsigned int) && count > 0) {
 416                 if (put_user(*((char *)(&sample_step)+p), buf))
 417                         return -EFAULT;
 418                 buf++; p++; count--; read++;
 419         }
 420         pnt = (char *)prof_buffer + p - sizeof(atomic_t);
 421         if (copy_to_user(buf, (void *)pnt, count))
 422                 return -EFAULT;
 423         read += count;
 424         *ppos += read;
 425         return read;
 426 }
 427
 428 /* default is to not implement this call */
 429 int __weak setup_profiling_timer(unsigned mult)
 430 {
 431         return -EINVAL;
 432 }
 433
 434 /*
 435  * Writing to /proc/profile resets the counters
 436  *
 437  * Writing a 'profiling multiplier' value into it also re-sets the profiling
 438  * interrupt frequency, on architectures that support this.
 439  */
 440 static ssize_t write_profile(struct file *file, const char __user *buf,
 441                              size_t count, loff_t *ppos)
 442 {
 443 #ifdef CONFIG_SMP
 444         if (count == sizeof(int)) {
 445                 unsigned int multiplier;
 446
 447                 if (copy_from_user(&multiplier, buf, sizeof(int)))
 448                         return -EFAULT;
 449
 450                 if (setup_profiling_timer(multiplier))
 451                         return -EINVAL;
 452         }
 453 #endif
 454         profile_discard_flip_buffers();
 455         memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
 456         return count;
 457 }
 458
 459 static const struct proc_ops profile_proc_ops = {
 460         .proc_read      = read_profile,
 461         .proc_write     = write_profile,
 462         .proc_lseek     = default_llseek,
 463 };
 464
 465 int __ref create_proc_profile(void)
 466 {
 467         struct proc_dir_entry *entry;
 468 #ifdef CONFIG_SMP
 469         enum cpuhp_state online_state;
 470 #endif
 471
 472         int err = 0;
 473
 474         if (!prof_on)
 475                 return 0;
 476 #ifdef CONFIG_SMP
 477         err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
 478                                 profile_prepare_cpu, profile_dead_cpu);
 479         if (err)
 480                 return err;
 481
 482         err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
 483                                 profile_online_cpu, NULL);
 484         if (err < 0)
 485                 goto err_state_prep;
 486         online_state = err;
 487         err = 0;
 488 #endif
 489         entry = proc_create("profile", S_IWUSR | S_IRUGO,
 490                             NULL, &profile_proc_ops);
 491         if (!entry)
 492                 goto err_state_onl;
 493         proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
 494
 495         return err;
 496 err_state_onl:
 497 #ifdef CONFIG_SMP
 498         cpuhp_remove_state(online_state);
 499 err_state_prep:
 500         cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
 501 #endif
 502         return err;
 503 }
 504 subsys_initcall(create_proc_profile);
 505 #endif /* CONFIG_PROC_FS */