// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	Nadia Yvette Chambers, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, Nadia Yvette Chambers,
 *	Oracle, 2004
 */

#include <linux/export.h>
#include <linux/profile.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched/stat.h>

#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

struct profile_hit {
	u32 pc, hits;
};
#define PROFILE_GRPSHIFT 3
#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
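
/*
 * Illustrative sizing, assuming a 4 KiB PAGE_SIZE and an 8-byte
 * struct profile_hit (two u32s): each per-cpu hashtable page then
 * holds NR_PROFILE_HIT = 4096/8 = 512 entries, organized as
 * NR_PROFILE_GRP = 512/8 = 64 groups of PROFILE_GRPSZ = 8 slots
 * that are probed together on each lookup.
 */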

static atomic_t *prof_buffer;
static unsigned long prof_len;
static unsigned short int prof_shift;

int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

static cpumask_var_t prof_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */

int profile_setup(char *str)
{
	static const char schedstr[] = "schedule";
	static const char sleepstr[] = "sleep";
	static const char kvmstr[] = "kvm";
	const char *select = NULL;
	int par;

	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
		force_schedstat_enabled();
		prof_on = SLEEP_PROFILING;
		select = sleepstr;
#else
		pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
		prof_on = SCHED_PROFILING;
		select = schedstr;
	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
		prof_on = KVM_PROFILING;
		select = kvmstr;
	} else if (get_option(&str, &par)) {
		prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		prof_on = CPU_PROFILING;
		pr_info("kernel profiling enabled (shift: %u)\n",
			prof_shift);
	}

	if (select) {
		if (str[strlen(select)] == ',')
			str += strlen(select) + 1;
		if (get_option(&str, &par))
			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		pr_info("kernel %s profiling enabled (shift: %u)\n",
			select, prof_shift);
	}

	return 1;
}
__setup("profile=", profile_setup);
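
/*
 * Example boot command lines accepted by the parser above (values
 * are illustrative):
 *
 *	profile=2		CPU-time profiling, one counter per
 *				2^2 bytes of kernel text
 *	profile=schedule,5	profile schedule() call sites, shift 5
 *	profile=sleep		sleep profiling (CONFIG_SCHEDSTATS only)
 *	profile=kvm		KVM profiling
 */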

int __ref profile_init(void)
{
	int buffer_bytes;

	if (!prof_on)
		return 0;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;

	if (!prof_len) {
		pr_warn("profiling shift: %u too large\n", prof_shift);
		prof_on = 0;
		return -EINVAL;
	}

	buffer_bytes = prof_len*sizeof(atomic_t);

	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(prof_cpu_mask, cpu_possible_mask);

	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = alloc_pages_exact(buffer_bytes,
					GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = vzalloc(buffer_bytes);
	if (prof_buffer)
		return 0;

	free_cpumask_var(prof_cpu_mask);
	return -ENOMEM;
}
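
/*
 * Sizing example (illustrative numbers): with 16 MiB of kernel text
 * and prof_shift == 2, prof_len is 4M counters and the buffer is
 * 16 MiB of atomic_t; allocations that large are why profile_init()
 * falls back from kzalloc() to alloc_pages_exact() to vzalloc().
 */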

#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * use of having a second hashtable is for avoiding cacheline
 * contention that would otherwise happen during flushes of pending
 * profile hits required for the accuracy of reported profile hits
 * and so resurrect the interrupt livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot
 * and hold the number of pending hits to that profile buffer slot on
 * a cpu in an entry. When the hashtable overflows, all pending hits
 * are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable emptied. As numerous pending hits
 * may be accounted to a profile buffer slot in a hashtable entry,
 * this amortizes a number of atomic profile buffer increments likely
 * to be far larger than the number of entries in the hashtable,
 * particularly given that the number of distinct profile buffer
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 * process context).
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable full of finite
 * collision chains, not just pairs of them.
 *
 * -- nyc
 */
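
/*
 * Worked probe example for do_profile_hits() below (illustrative,
 * assuming 4 KiB pages, so NR_PROFILE_HIT == 512, NR_PROFILE_GRP ==
 * 64, PROFILE_GRPSZ == 8): for hashed slot pc == 0x123, the primary
 * group begins at ((0x123 & 63) << 3) == 280 and the secondary
 * stride is ((~(0x123 << 1)) & 63) << 3 == 456, so groups of eight
 * entries are scanned at 280, (280 + 456) & 511 == 224, and so on,
 * until the walk returns to 280 and the table is treated as full.
 */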

static void __profile_flip_buffers(void *unused)
{
	int cpu = smp_processor_id();

	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}

static void profile_flip_buffers(void)
{
	int i, j, cpu;

	mutex_lock(&profile_flip_mutex);
	j = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
		for (i = 0; i < NR_PROFILE_HIT; ++i) {
			if (!hits[i].hits) {
				if (hits[i].pc)
					hits[i].pc = 0;
				continue;
			}
			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
			hits[i].hits = hits[i].pc = 0;
		}
	}
	mutex_unlock(&profile_flip_mutex);
}

static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	mutex_lock(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	mutex_unlock(&profile_flip_mutex);
}

static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}
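
/*
 * Coalescing illustration: if the timer interrupt lands on the same
 * text location 1000 times between reads of /proc/profile, the logic
 * above turns that into 1000 updates of one local hashtable entry and
 * a single atomic_add(1000, ...) at flush time, instead of 1000
 * atomic read-modify-writes on a shared prof_buffer cacheline.
 */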

static int profile_dead_cpu(unsigned int cpu)
{
	struct page *page;
	int i;

	if (cpumask_available(prof_cpu_mask))
		cpumask_clear_cpu(cpu, prof_cpu_mask);

	for (i = 0; i < 2; i++) {
		if (per_cpu(cpu_profile_hits, cpu)[i]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
			per_cpu(cpu_profile_hits, cpu)[i] = NULL;
			__free_page(page);
		}
	}
	return 0;
}

static int profile_prepare_cpu(unsigned int cpu)
{
	int i, node = cpu_to_mem(cpu);
	struct page *page;

	per_cpu(cpu_profile_flip, cpu) = 0;

	for (i = 0; i < 2; i++) {
		if (per_cpu(cpu_profile_hits, cpu)[i])
			continue;

		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
		if (!page) {
			profile_dead_cpu(cpu);
			return -ENOMEM;
		}
		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
	}
	return 0;
}

static int profile_online_cpu(unsigned int cpu)
{
	if (cpumask_available(prof_cpu_mask))
		cpumask_set_cpu(cpu, prof_cpu_mask);

	return 0;
}

#else /* !CONFIG_SMP */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)

static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long pc;

	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */

void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	if (prof_on != type || !prof_buffer)
		return;
	do_profile_hits(type, __pc, nr_hits);
}
EXPORT_SYMBOL_GPL(profile_hits);
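
/*
 * Callers normally use the profile_hit() wrapper from
 * <linux/profile.h>, which is profile_hits(type, ip, 1) for a single
 * sample; e.g. the scheduler records schedule() call sites with
 * profile_hit(SCHED_PROFILING, __builtin_return_address(0)).
 */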

void profile_tick(int type)
{
	struct pt_regs *regs = get_irq_regs();

	if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
		profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>

static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
	return 0;
}

static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, prof_cpu_mask_proc_show, NULL);
}

static ssize_t prof_cpu_mask_proc_write(struct file *file,
	const char __user *buffer, size_t count, loff_t *pos)
{
	cpumask_var_t new_value;
	int err;

	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buffer, count, new_value);
	if (!err) {
		cpumask_copy(prof_cpu_mask, new_value);
		err = count;
	}
	free_cpumask_var(new_value);
	return err;
}

static const struct proc_ops prof_cpu_mask_proc_ops = {
	.proc_open	= prof_cpu_mask_proc_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
	.proc_write	= prof_cpu_mask_proc_write,
};

void create_prof_cpu_mask(void)
{
	/* create /proc/irq/prof_cpu_mask */
	proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
}
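
/*
 * Example (illustrative): restrict the profiling tick to cpus 0-3 by
 * writing a hex cpumask to the file created above:
 *
 *	echo f > /proc/irq/prof_cpu_mask
 */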

/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char *pnt;
	unsigned long sample_step = 1UL << prof_shift;

	profile_flip_buffers();
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	while (p < sizeof(unsigned int) && count > 0) {
		if (put_user(*((char *)(&sample_step)+p), buf))
			return -EFAULT;
		buf++; p++; count--; read++;
	}
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf, (void *)pnt, count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}
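
/*
 * The byte stream produced above is sizeof(unsigned int) bytes of
 * sample step (1 << prof_shift) followed by the raw array of prof_len
 * hit counters; a typical consumer is readprofile(1), e.g.
 * (illustrative invocation):
 *
 *	readprofile -m /boot/System.map | sort -nr | head
 */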

/* default is to not implement this call */
int __weak setup_profiling_timer(unsigned mult)
{
	return -EINVAL;
}

/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	if (count == sizeof(int)) {
		unsigned int multiplier;

		if (copy_from_user(&multiplier, buf, sizeof(int)))
			return -EFAULT;

		if (setup_profiling_timer(multiplier))
			return -EINVAL;
	}
#endif
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}
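
/*
 * Example (illustrative): any write clears the counters, so
 *
 *	echo > /proc/profile
 *
 * resets the profile; a write of exactly sizeof(int) first tries to
 * set the profiling interrupt multiplier via setup_profiling_timer().
 */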

static const struct proc_ops profile_proc_ops = {
	.proc_read	= read_profile,
	.proc_write	= write_profile,
	.proc_lseek	= default_llseek,
};

int __ref create_proc_profile(void)
{
	struct proc_dir_entry *entry;
#ifdef CONFIG_SMP
	enum cpuhp_state online_state;
#endif
	int err = 0;

	if (!prof_on)
		return 0;
#ifdef CONFIG_SMP
	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
				profile_prepare_cpu, profile_dead_cpu);
	if (err)
		return err;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
				profile_online_cpu, NULL);
	if (err < 0)
		goto err_state_prep;
	online_state = err;
	err = 0;
#endif
	entry = proc_create("profile", S_IWUSR | S_IRUGO,
			    NULL, &profile_proc_ops);
	if (!entry)
		goto err_state_onl;
	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));

	return err;
err_state_onl:
#ifdef CONFIG_SMP
	cpuhp_remove_state(online_state);
err_state_prep:
	cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
#endif
	return err;
}
subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */