Merge tag 'trace-v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux...

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 1 Sep 2023 23:34:25 +0000 (16:34 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 1 Sep 2023 23:34:25 +0000 (16:34 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Sep 2023 23:34:25 +0000 (16:34 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Sep 2023 23:34:25 +0000 (16:34 -0700)
diff --combined Documentation/trace/events.rst

index 15f78e7,34108d5..759907c
--- 1/Documentation/trace/events.rst
--- 2/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@@ -219,6 -219,20 +219,20 @@@ the function "security_prepare_creds" a
   The ".function" postfix can only be attached to values of size long, and can only
   be compared with "==" or "!=".
   
+ Cpumask fields or scalar fields that encode a CPU number can be filtered using
+ a user-provided cpumask in cpulist format. The format is as follows::
+ 
+   CPUS{$cpulist}
+ 
+ Operators available to cpumask filtering are:
+ 
+ & (intersection), ==, !=
+ 
+ For example, this will filter events that have their .target_cpu field present
+ in the given cpumask::
+ 
+   target_cpu & CPUS{17-42}
+ 
   5.2 Setting filters
   -------------------
   
@@@ -915,7 -929,7 +929,7 @@@ functions can be used
   
   To create a kprobe event, an empty or partially empty kprobe event
   should first be created using kprobe_event_gen_cmd_start().  The name
- -of the event and the probe location should be specfied along with one
+ +of the event and the probe location should be specified along with one
   or args each representing a probe field should be supplied to this
   function.  Before calling kprobe_event_gen_cmd_start(), the user
   should create and initialize a dynevent_cmd object using
@@@ -995,7 -1009,7 +1009,7 @@@ The basic idea is simple and amounts t
   layer that can be used to generate trace event commands.  The
   generated command strings can then be passed to the command-parsing
   and event creation code that already exists in the trace event
- -subystem for creating the corresponding trace events.
+ +subsystem for creating the corresponding trace events.
   
   In a nutshell, the way it works is that the higher-level interface
   code creates a struct dynevent_cmd object, then uses a couple
@@@ -1068,7 -1082,7 +1082,7 @@@ to add an operator between the pair (he
   appended onto the end of the arg pair (here ';').
   
   There's also a dynevent_str_add() function that can be used to simply
- -add a string as-is, with no spaces, delimeters, or arg check.
+ +add a string as-is, with no spaces, delimiters, or arg check.
   
   Any number of dynevent_*_add() calls can be made to build up the string
   (until its length surpasses cmd->maxlen).  When all the arguments have
diff --combined fs/tracefs/inode.c

index 2feb6c5,c7a10f9..de5b722
--- 1/fs/tracefs/inode.c
--- 2/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@@ -21,13 -21,33 +21,33 @@@
   #include <linux/parser.h>
   #include <linux/magic.h>
   #include <linux/slab.h>
+ #include "internal.h"
   
   #define TRACEFS_DEFAULT_MODE  0700
+ static struct kmem_cache *tracefs_inode_cachep __ro_after_init;
   
   static struct vfsmount *tracefs_mount;
   static int tracefs_mount_count;
   static bool tracefs_registered;
   
+ static struct inode *tracefs_alloc_inode(struct super_block *sb)
+ {
+       struct tracefs_inode *ti;
+ 
+       ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL);
+       if (!ti)
+               return NULL;
+ 
+       ti->flags = 0;
+ 
+       return &ti->vfs_inode;
+ }
+ 
+ static void tracefs_free_inode(struct inode *inode)
+ {
+       kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode));
+ }
+ 
   static ssize_t default_read_file(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
   {
@@@ -127,12 -147,12 +147,12 @@@ static const struct inode_operations tr
         .rmdir          = tracefs_syscall_rmdir,
   };
   
- static struct inode *tracefs_get_inode(struct super_block *sb)
+ struct inode *tracefs_get_inode(struct super_block *sb)
   {
         struct inode *inode = new_inode(sb);
         if (inode) {
                 inode->i_ino = get_next_ino();
- -              inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ +              inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
         }
         return inode;
   }
@@@ -290,6 -310,7 +310,7 @@@ static int tracefs_apply_options(struc
         struct tracefs_fs_info *fsi = sb->s_fs_info;
         struct inode *inode = d_inode(sb->s_root);
         struct tracefs_mount_opts *opts = &fsi->mount_opts;
+       umode_t tmp_mode;
   
         /*
          * On remount, only reset mode/uid/gid if they were provided as mount
@@@ -297,8 -318,9 +318,9 @@@
          */
   
         if (!remount || opts->opts & BIT(Opt_mode)) {
-               inode->i_mode &= ~S_IALLUGO;
-               inode->i_mode |= opts->mode;
+               tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO;
+               tmp_mode |= opts->mode;
+               WRITE_ONCE(inode->i_mode, tmp_mode);
         }
   
         if (!remount || opts->opts & BIT(Opt_uid))
@@@ -346,11 -368,31 +368,31 @@@ static int tracefs_show_options(struct 
   }
   
   static const struct super_operations tracefs_super_operations = {
+       .alloc_inode    = tracefs_alloc_inode,
+       .free_inode     = tracefs_free_inode,
+       .drop_inode     = generic_delete_inode,
         .statfs         = simple_statfs,
         .remount_fs     = tracefs_remount,
         .show_options   = tracefs_show_options,
   };
   
+ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+ {
+       struct tracefs_inode *ti;
+ 
+       if (!dentry || !inode)
+               return;
+ 
+       ti = get_tracefs(inode);
+       if (ti && ti->flags & TRACEFS_EVENT_INODE)
+               eventfs_set_ef_status_free(dentry);
+       iput(inode);
+ }
+ 
+ static const struct dentry_operations tracefs_dentry_operations = {
+       .d_iput = tracefs_dentry_iput,
+ };
+ 
   static int trace_fill_super(struct super_block *sb, void *data, int silent)
   {
         static const struct tree_descr trace_files[] = {{""}};
@@@ -373,6 -415,7 +415,7 @@@
                 goto fail;
   
         sb->s_op = &tracefs_super_operations;
+       sb->s_d_op = &tracefs_dentry_operations;
   
         tracefs_apply_options(sb, false);
   
@@@ -399,7 -442,7 +442,7 @@@ static struct file_system_type trace_fs
   };
   MODULE_ALIAS_FS("tracefs");
   
- static struct dentry *start_creating(const char *name, struct dentry *parent)
+ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
   {
         struct dentry *dentry;
         int error;
@@@ -437,7 -480,7 +480,7 @@@
         return dentry;
   }
   
- static struct dentry *failed_creating(struct dentry *dentry)
+ struct dentry *tracefs_failed_creating(struct dentry *dentry)
   {
         inode_unlock(d_inode(dentry->d_parent));
         dput(dentry);
@@@ -445,13 -488,87 +488,87 @@@
         return NULL;
   }
   
- static struct dentry *end_creating(struct dentry *dentry)
+ struct dentry *tracefs_end_creating(struct dentry *dentry)
   {
         inode_unlock(d_inode(dentry->d_parent));
         return dentry;
   }
   
   /**
+  * eventfs_start_creating - start the process of creating a dentry
+  * @name: Name of the file created for the dentry
+  * @parent: The parent dentry where this dentry will be created
+  *
+  * This is a simple helper function for the dynamically created eventfs
+  * files. When the directory of the eventfs files is accessed, their
+  * dentries are created on the fly. This function is used to start that
+  * process.
+  */
+ struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
+ {
+       struct dentry *dentry;
+       int error;
+ 
+       error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+                             &tracefs_mount_count);
+       if (error)
+               return ERR_PTR(error);
+ 
+       /*
+        * If the parent is not specified, we create it in the root.
+        * We need the root dentry to do this, which is in the super
+        * block. A pointer to that is in the struct vfsmount that we
+        * have around.
+        */
+       if (!parent)
+               parent = tracefs_mount->mnt_root;
+ 
+       if (unlikely(IS_DEADDIR(parent->d_inode)))
+               dentry = ERR_PTR(-ENOENT);
+       else
+               dentry = lookup_one_len(name, parent, strlen(name));
+ 
+       if (!IS_ERR(dentry) && dentry->d_inode) {
+               dput(dentry);
+               dentry = ERR_PTR(-EEXIST);
+       }
+ 
+       if (IS_ERR(dentry))
+               simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ 
+       return dentry;
+ }
+ 
+ /**
+  * eventfs_failed_creating - clean up a failed eventfs dentry creation
+  * @dentry: The dentry to clean up
+  *
+  * If after calling eventfs_start_creating(), a failure is detected, the
+  * resources created by eventfs_start_creating() need to be cleaned up. In
+  * that case, this function should be called to perform that clean up.
+  */
+ struct dentry *eventfs_failed_creating(struct dentry *dentry)
+ {
+       dput(dentry);
+       simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+       return NULL;
+ }
+ 
+ /**
+  * eventfs_end_creating - Finish the process of creating an eventfs dentry
+  * @dentry: The dentry that has successfully been created.
+  *
+  * This function is currently just a place holder to match
+  * eventfs_start_creating(). In case any synchronization needs to be added,
+  * this function will be used to implement that without having to modify
+  * the callers of eventfs_start_creating().
+  */
+ struct dentry *eventfs_end_creating(struct dentry *dentry)
+ {
+       return dentry;
+ }
+ 
+ /**
    * tracefs_create_file - create a file in the tracefs filesystem
    * @name: a pointer to a string containing the name of the file to create.
    * @mode: the permission that the file should have.
@@@ -490,14 -607,14 +607,14 @@@ struct dentry *tracefs_create_file(cons
         if (!(mode & S_IFMT))
                 mode |= S_IFREG;
         BUG_ON(!S_ISREG(mode));
-       dentry = start_creating(name, parent);
+       dentry = tracefs_start_creating(name, parent);
   
         if (IS_ERR(dentry))
                 return NULL;
   
         inode = tracefs_get_inode(dentry->d_sb);
         if (unlikely(!inode))
-               return failed_creating(dentry);
+               return tracefs_failed_creating(dentry);
   
         inode->i_mode = mode;
         inode->i_fop = fops ? fops : &tracefs_file_operations;
@@@ -506,13 -623,13 +623,13 @@@
         inode->i_gid = d_inode(dentry->d_parent)->i_gid;
         d_instantiate(dentry, inode);
         fsnotify_create(d_inode(dentry->d_parent), dentry);
-       return end_creating(dentry);
+       return tracefs_end_creating(dentry);
   }
   
   static struct dentry *__create_dir(const char *name, struct dentry *parent,
                                    const struct inode_operations *ops)
   {
-       struct dentry *dentry = start_creating(name, parent);
+       struct dentry *dentry = tracefs_start_creating(name, parent);
         struct inode *inode;
   
         if (IS_ERR(dentry))
@@@ -520,7 -637,7 +637,7 @@@
   
         inode = tracefs_get_inode(dentry->d_sb);
         if (unlikely(!inode))
-               return failed_creating(dentry);
+               return tracefs_failed_creating(dentry);
   
         /* Do not set bits for OTH */
         inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP;
@@@ -534,7 -651,7 +651,7 @@@
         d_instantiate(dentry, inode);
         inc_nlink(d_inode(dentry->d_parent));
         fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
-       return end_creating(dentry);
+       return tracefs_end_creating(dentry);
   }
   
   /**
@@@ -628,10 -745,26 +745,26 @@@ bool tracefs_initialized(void
         return tracefs_registered;
   }
   
+ static void init_once(void *foo)
+ {
+       struct tracefs_inode *ti = (struct tracefs_inode *) foo;
+ 
+       inode_init_once(&ti->vfs_inode);
+ }
+ 
   static int __init tracefs_init(void)
   {
         int retval;
   
+       tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache",
+                                                sizeof(struct tracefs_inode),
+                                                0, (SLAB_RECLAIM_ACCOUNT|
+                                                    SLAB_MEM_SPREAD|
+                                                    SLAB_ACCOUNT),
+                                                init_once);
+       if (!tracefs_inode_cachep)
+               return -ENOMEM;
+ 
         retval = sysfs_create_mount_point(kernel_kobj, "tracing");
         if (retval)
                 return -EINVAL;
diff --combined include/linux/trace_events.h

index c1a0a19,1600aeb..eb5c3ad
--- 1/include/linux/trace_events.h
--- 2/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@@ -59,17 -59,6 +59,17 @@@ int trace_raw_output_prep(struct trace_
   extern __printf(2, 3)
   void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);
   
+ +/* Used to find the offset and length of dynamic fields in trace events */
+ +struct trace_dynamic_info {
+ +#ifdef CONFIG_CPU_BIG_ENDIAN
+ +      u16     offset;
+ +      u16     len;
+ +#else
+ +      u16     len;
+ +      u16     offset;
+ +#endif
+ +};
+ +
   /*
    * The trace entry - the most basic unit of tracing. This is what
    * is printed in the end as a single line in the trace output, such as:
@@@ -649,6 -638,7 +649,7 @@@ struct trace_event_file 
         struct list_head                list;
         struct trace_event_call         *event_call;
         struct event_filter __rcu       *filter;
+       struct eventfs_file             *ef;
         struct dentry                   *dir;
         struct trace_array              *tr;
         struct trace_subsystem_dir      *system;
@@@ -763,7 -753,6 +764,7 @@@ int bpf_get_perf_event_info(const struc
                             u32 *fd_type, const char **buf,
                             u64 *probe_offset, u64 *probe_addr);
   int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+ +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
   #else
   static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
   {
@@@ -810,11 -799,6 +811,11 @@@ bpf_kprobe_multi_link_attach(const unio
   {
         return -EOPNOTSUPP;
   }
+ +static inline int
+ +bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+ +{
+ +      return -EOPNOTSUPP;
+ +}
   #endif
   
   enum {
@@@ -824,6 -808,7 +825,7 @@@
         FILTER_RDYN_STRING,
         FILTER_PTR_STRING,
         FILTER_TRACE_FN,
+       FILTER_CPUMASK,
         FILTER_COMM,
         FILTER_CPU,
         FILTER_STACKTRACE,
@@@ -884,8 -869,7 +886,8 @@@ extern int  perf_uprobe_init(struct per
   extern void perf_uprobe_destroy(struct perf_event *event);
   extern int bpf_get_uprobe_info(const struct perf_event *event,
                                u32 *fd_type, const char **filename,
- -                             u64 *probe_offset, bool perf_type_tracepoint);
+ +                             u64 *probe_offset, u64 *probe_addr,
+ +                             bool perf_type_tracepoint);
   #endif
   extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                      char *filter_str);
diff --combined kernel/trace/trace.c

index 8e64aaa,bc96567..3e55375
--- 1/kernel/trace/trace.c
--- 2/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@@ -3119,7 -3119,6 +3119,6 @@@ static void __ftrace_trace_stack(struc
         struct ftrace_stack *fstack;
         struct stack_entry *entry;
         int stackidx;
-       void *ptr;
   
         /*
          * Add one, for this function and the call to save_stack_trace()
@@@ -3157,32 -3156,16 +3156,16 @@@
                 nr_entries = stack_trace_save(fstack->calls, size, skip);
         }
   
-       size = nr_entries * sizeof(unsigned long);
         event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
-                                   (sizeof(*entry) - sizeof(entry->caller)) + size,
+                                   struct_size(entry, caller, nr_entries),
                                     trace_ctx);
         if (!event)
                 goto out;
-       ptr = ring_buffer_event_data(event);
-       entry = ptr;
- 
-       /*
-        * For backward compatibility reasons, the entry->caller is an
-        * array of 8 slots to store the stack. This is also exported
-        * to user space. The amount allocated on the ring buffer actually
-        * holds enough for the stack specified by nr_entries. This will
-        * go into the location of entry->caller. Due to string fortifiers
-        * checking the size of the destination of memcpy() it triggers
-        * when it detects that size is greater than 8. To hide this from
-        * the fortifiers, we use "ptr" and pointer arithmetic to assign caller.
-        *
-        * The below is really just:
-        *   memcpy(&entry->caller, fstack->calls, size);
-        */
-       ptr += offsetof(typeof(*entry), caller);
-       memcpy(ptr, fstack->calls, size);
+       entry = ring_buffer_event_data(event);
   
         entry->size = nr_entries;
+       memcpy(&entry->caller, fstack->calls,
+              flex_array_size(entry, caller, nr_entries));
   
         if (!call_filter_check_discard(call, entry, buffer, event))
                 __buffer_unlock_commit(buffer, event);
@@@ -4206,22 -4189,9 +4189,16 @@@ static void *s_start(struct seq_file *m
         loff_t l = 0;
         int cpu;
   
-       /*
-        * copy the tracer to avoid using a global lock all around.
-        * iter->trace is a copy of current_trace, the pointer to the
-        * name may be used instead of a strcmp(), as iter->trace->name
-        * will point to the same string as current_trace->name.
-        */
         mutex_lock(&trace_types_lock);
-       if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
- -      if (unlikely(tr->current_trace != iter->trace))
++      if (unlikely(tr->current_trace != iter->trace)) {
+ +              /* Close iter->trace before switching to the new current tracer */
+ +              if (iter->trace->close)
+ +                      iter->trace->close(iter);
-               *iter->trace = *tr->current_trace;
+               iter->trace = tr->current_trace;
+ +              /* Reopen the new current tracer */
+ +              if (iter->trace->open)
+ +                      iter->trace->open(iter);
+ +      }
         mutex_unlock(&trace_types_lock);
   
   #ifdef CONFIG_TRACER_MAX_TRACE
@@@ -4829,6 -4799,25 +4806,25 @@@ static const struct seq_operations trac
         .show           = s_show,
   };
   
+ /*
+  * Note, as iter itself can be allocated and freed in different
+  * ways, this function is only used to free its content, and not
+  * the iterator itself. The only requirement to all the allocations
+  * is that it must zero all fields (kzalloc), as freeing works with
+  * either allocated content or NULL.
+  */
+ static void free_trace_iter_content(struct trace_iterator *iter)
+ {
+       /* The fmt is either NULL, allocated or points to static_fmt_buf */
+       if (iter->fmt != static_fmt_buf)
+               kfree(iter->fmt);
+ 
+       kfree(iter->temp);
+       kfree(iter->buffer_iter);
+       mutex_destroy(&iter->mutex);
+       free_cpumask_var(iter->started);
+ }
+ 
   static struct trace_iterator *
   __tracing_open(struct inode *inode, struct file *file, bool snapshot)
   {
@@@ -4870,16 -4859,8 +4866,8 @@@
         iter->fmt = NULL;
         iter->fmt_size = 0;
   
-       /*
-        * We make a copy of the current tracer to avoid concurrent
-        * changes on it while we are reading.
-        */
         mutex_lock(&trace_types_lock);
-       iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
-       if (!iter->trace)
-               goto fail;
- 
-       *iter->trace = *tr->current_trace;
+       iter->trace = tr->current_trace;
   
         if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
                 goto fail;
@@@ -4944,9 -4925,7 +4932,7 @@@
   
    fail:
         mutex_unlock(&trace_types_lock);
-       kfree(iter->trace);
-       kfree(iter->temp);
-       kfree(iter->buffer_iter);
+       free_trace_iter_content(iter);
   release:
         seq_release_private(inode, file);
         return ERR_PTR(-ENOMEM);
@@@ -5025,12 -5004,7 +5011,7 @@@ static int tracing_release(struct inod
   
         mutex_unlock(&trace_types_lock);
   
-       mutex_destroy(&iter->mutex);
-       free_cpumask_var(iter->started);
-       kfree(iter->fmt);
-       kfree(iter->temp);
-       kfree(iter->trace);
-       kfree(iter->buffer_iter);
+       free_trace_iter_content(iter);
         seq_release_private(inode, file);
   
         return 0;
@@@ -5284,17 -5258,11 +5265,17 @@@ int tracing_set_cpumask(struct trace_ar
                                 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
                         atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
                         ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
+ +#ifdef CONFIG_TRACER_MAX_TRACE
+ +                      ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+ +#endif
                 }
                 if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
                                 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
                         atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
                         ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
+ +#ifdef CONFIG_TRACER_MAX_TRACE
+ +                      ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+ +#endif
                 }
         }
         arch_spin_unlock(&tr->max_lock);
@@@ -6318,6 -6286,15 +6299,15 @@@ static void set_buffer_entries(struct a
                 per_cpu_ptr(buf->data, cpu)->entries = val;
   }
   
+ static void update_buffer_entries(struct array_buffer *buf, int cpu)
+ {
+       if (cpu == RING_BUFFER_ALL_CPUS) {
+               set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
+       } else {
+               per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu);
+       }
+ }
+ 
   #ifdef CONFIG_TRACER_MAX_TRACE
   /* resize @tr's buffer to the size of @size_tr's entries */
   static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
@@@ -6396,18 -6373,12 +6386,12 @@@ static int __tracing_resize_ring_buffer
                 return ret;
         }
   
-       if (cpu == RING_BUFFER_ALL_CPUS)
-               set_buffer_entries(&tr->max_buffer, size);
-       else
-               per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
+       update_buffer_entries(&tr->max_buffer, cpu);
   
    out:
   #endif /* CONFIG_TRACER_MAX_TRACE */
   
-       if (cpu == RING_BUFFER_ALL_CPUS)
-               set_buffer_entries(&tr->array_buffer, size);
-       else
-               per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size;
+       update_buffer_entries(&tr->array_buffer, cpu);
   
         return ret;
   }
@@@ -6718,36 -6689,10 +6702,36 @@@ tracing_max_lat_write(struct file *filp
   
   #endif
   
+ +static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+ +{
+ +      if (cpu == RING_BUFFER_ALL_CPUS) {
+ +              if (cpumask_empty(tr->pipe_cpumask)) {
+ +                      cpumask_setall(tr->pipe_cpumask);
+ +                      return 0;
+ +              }
+ +      } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ +              cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ +              return 0;
+ +      }
+ +      return -EBUSY;
+ +}
+ +
+ +static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+ +{
+ +      if (cpu == RING_BUFFER_ALL_CPUS) {
+ +              WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ +              cpumask_clear(tr->pipe_cpumask);
+ +      } else {
+ +              WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ +              cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ +      }
+ +}
+ +
   static int tracing_open_pipe(struct inode *inode, struct file *filp)
   {
         struct trace_array *tr = inode->i_private;
         struct trace_iterator *iter;
+ +      int cpu;
         int ret;
   
         ret = tracing_check_open_get_tr(tr);
@@@ -6755,16 -6700,13 +6739,16 @@@
                 return ret;
   
         mutex_lock(&trace_types_lock);
+ +      cpu = tracing_get_cpu(inode);
+ +      ret = open_pipe_on_cpu(tr, cpu);
+ +      if (ret)
+ +              goto fail_pipe_on_cpu;
   
         /* create a buffer to store the information to pass to userspace */
         iter = kzalloc(sizeof(*iter), GFP_KERNEL);
         if (!iter) {
                 ret = -ENOMEM;
- -              __trace_array_put(tr);
- -              goto out;
+ +              goto fail_alloc_iter;
         }
   
         trace_seq_init(&iter->seq);
@@@ -6787,7 -6729,7 +6771,7 @@@
   
         iter->tr = tr;
         iter->array_buffer = &tr->array_buffer;
- -      iter->cpu_file = tracing_get_cpu(inode);
+ +      iter->cpu_file = cpu;
         mutex_init(&iter->mutex);
         filp->private_data = iter;
   
@@@ -6797,15 -6739,12 +6781,15 @@@
         nonseekable_open(inode, filp);
   
         tr->trace_ref++;
- -out:
+ +
         mutex_unlock(&trace_types_lock);
         return ret;
   
   fail:
         kfree(iter);
+ +fail_alloc_iter:
+ +      close_pipe_on_cpu(tr, cpu);
+ +fail_pipe_on_cpu:
         __trace_array_put(tr);
         mutex_unlock(&trace_types_lock);
         return ret;
@@@ -6822,13 -6761,10 +6806,10 @@@ static int tracing_release_pipe(struct 
   
         if (iter->trace->pipe_close)
                 iter->trace->pipe_close(iter);
- -
+ +      close_pipe_on_cpu(tr, iter->cpu_file);
         mutex_unlock(&trace_types_lock);
   
-       free_cpumask_var(iter->started);
-       kfree(iter->fmt);
-       kfree(iter->temp);
-       mutex_destroy(&iter->mutex);
+       free_trace_iter_content(iter);
         kfree(iter);
   
         trace_array_put(tr);
@@@ -9486,9 -9422,6 +9467,9 @@@ static struct trace_array *trace_array_
         if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
                 goto out_free_tr;
   
+ +      if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ +              goto out_free_tr;
+ +
         tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
   
         cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@@ -9530,7 -9463,6 +9511,7 @@@
    out_free_tr:
         ftrace_free_ftrace_ops(tr);
         free_trace_buffers(tr);
+ +      free_cpumask_var(tr->pipe_cpumask);
         free_cpumask_var(tr->tracing_cpumask);
         kfree(tr->name);
         kfree(tr);
@@@ -9633,7 -9565,6 +9614,7 @@@ static int __remove_instance(struct tra
         }
         kfree(tr->topts);
   
+ +      free_cpumask_var(tr->pipe_cpumask);
         free_cpumask_var(tr->tracing_cpumask);
         kfree(tr->name);
         kfree(tr);
@@@ -10431,14 -10362,12 +10412,14 @@@ __init static int tracer_alloc_buffers(
         if (trace_create_savedcmd() < 0)
                 goto out_free_temp_buffer;
   
+ +      if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ +              goto out_free_savedcmd;
+ +
         /* TODO: make the number of buffers hot pluggable with CPUS */
         if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
                 MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- -              goto out_free_savedcmd;
+ +              goto out_free_pipe_cpumask;
         }
- -
         if (global_trace.buffer_disabled)
                 tracing_off();
   
@@@ -10491,8 -10420,6 +10472,8 @@@
   
         return 0;
   
+ +out_free_pipe_cpumask:
+ +      free_cpumask_var(global_trace.pipe_cpumask);
   out_free_savedcmd:
         free_saved_cmdlines_buffer(savedcmd);
   out_free_temp_buffer:
diff --combined kernel/trace/trace.h

index 73eaec1,b6e44a3..5669dd1
--- 1/kernel/trace/trace.h
--- 2/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@@ -77,6 -77,16 +77,16 @@@ enum trace_type 
   #undef __array
   #define __array(type, item, size)     type    item[size];
   
+ /*
+  * For backward compatibility, older user space expects to see the
+  * kernel_stack event with a fixed size caller field. But today the fixed
+  * size is ignored by the kernel, and the real structure is dynamic.
+  * Expose to user space: "unsigned long caller[8];" but the real structure
+  * will be "unsigned long caller[] __counted_by(size)"
+  */
+ #undef __stack_array
+ #define __stack_array(type, item, size, field)                type item[] __counted_by(field);
+ 
   #undef __array_desc
   #define __array_desc(type, container, item, size)
   
@@@ -377,8 -387,6 +387,8 @@@ struct trace_array 
         struct list_head        events;
         struct trace_event_file *trace_marker_file;
         cpumask_var_t           tracing_cpumask; /* only trace on set CPUs */
+ +      /* one per_cpu trace_pipe can be opened by only one user */
+ +      cpumask_var_t           pipe_cpumask;
         int                     ref;
         int                     trace_ref;
   #ifdef CONFIG_FUNCTION_TRACER
@@@ -596,7 -604,6 +606,6 @@@ trace_buffer_iter(struct trace_iterato
   int tracer_init(struct tracer *t, struct trace_array *tr);
   int tracing_is_enabled(void);
   void tracing_reset_online_cpus(struct array_buffer *buf);
- void tracing_reset_current(int cpu);
   void tracing_reset_all_online_cpus(void);
   void tracing_reset_all_online_cpus_unlocked(void);
   int tracing_open_generic(struct inode *inode, struct file *filp);
@@@ -697,7 -704,6 +706,6 @@@ void trace_filter_add_remove_task(struc
   void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
   void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
   int trace_pid_show(struct seq_file *m, void *v);
- void trace_free_pid_list(struct trace_pid_list *pid_list);
   int trace_pid_write(struct trace_pid_list *filtered_pids,
                     struct trace_pid_list **new_pid_list,
                     const char __user *ubuf, size_t cnt);
@@@ -1297,14 -1303,6 +1305,14 @@@ static inline void trace_branch_disable
   /* set ring buffers to default size if not already done so */
   int tracing_update_buffers(void);
   
+ +union trace_synth_field {
+ +      u8                              as_u8;
+ +      u16                             as_u16;
+ +      u32                             as_u32;
+ +      u64                             as_u64;
+ +      struct trace_dynamic_info       as_dynamic;
+ +};
+ +
   struct ftrace_event_field {
         struct list_head        link;
         const char              *name;
@@@ -1334,7 -1332,7 +1342,7 @@@ struct trace_subsystem_dir 
         struct list_head                list;
         struct event_subsystem          *subsystem;
         struct trace_array              *tr;
-       struct dentry                   *entry;
+       struct eventfs_file             *ef;
         int                             ref_count;
         int                             nr_events;
   };
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 1 Sep 2023 23:34:25 +0000 (16:34 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 1 Sep 2023 23:34:25 +0000 (16:34 -0700)
		1	2
Documentation/trace/events.rst	patch \|	diff1 \|	diff2 \|	blob \| history
fs/tracefs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/trace_events.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.h	patch \|	diff1 \|	diff2 \|	blob \| history