/*
 * ring_buffer_iterator.c
 *
 * Ring buffer and channel iterators. Get each event of a channel in order. Uses
 * a prio heap for per-cpu buffers, giving a O(log(NR_CPUS)) algorithmic
 * complexity for the "get next event" operation.
 *
 * Copyright (C) 2010-2012 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; only
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Author:
 *	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include "../../wrapper/ringbuffer/iterator.h"
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/module.h>

/*
 * Safety factor taking into account internal kernel interrupt latency.
 * Assuming 250ms worst-case latency.
 */
#define MAX_SYSTEM_LATENCY	250

/*
 * Maximum delta expected between trace clocks. At most 1 jiffy delta.
 */
#define MAX_CLOCK_DELTA		(jiffies_to_usecs(1) * 1000)

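/*
 * MAX_CLOCK_DELTA is expressed in nanoseconds: jiffies_to_usecs(1) is the
 * duration of one jiffy in microseconds, so multiplying by 1000 converts it
 * to nanoseconds. As an example, with HZ=250 a jiffy is 4000 us, giving a
 * MAX_CLOCK_DELTA of 4000000 ns (4 ms).
 */
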
/**
 * lib_ring_buffer_get_next_record - Get the next record in a buffer.
 * @chan: channel
 * @buf: buffer
 *
 * Returns the size of the event read, -EAGAIN if buffer is empty, -ENODATA if
 * buffer is empty and finalized. The buffer must already be opened for reading.
 */
ssize_t lib_ring_buffer_get_next_record(struct channel *chan,
					struct lib_ring_buffer *buf)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	struct lib_ring_buffer_iter *iter = &buf->iter;
	int ret;

restart:
	switch (iter->state) {
	case ITER_GET_SUBBUF:
		ret = lib_ring_buffer_get_next_subbuf(buf);
		if (ret && !ACCESS_ONCE(buf->finalized)
		    && config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
			/*
			 * Use "pull" scheme for global buffers. The reader
			 * itself flushes the buffer to "pull" data not visible
			 * to readers yet. Flush current subbuffer and re-try.
			 *
			 * Per-CPU buffers rather use a "push" scheme because
			 * the IPI needed to flush all CPU's buffers is too
			 * costly. In the "push" scheme, the reader waits for
			 * the writer periodic deferrable timer to flush the
			 * buffers (keeping track of a quiescent state
			 * timestamp). Therefore, the writer "pushes" data out
			 * of the buffers rather than letting the reader "pull"
			 * data from the buffer.
			 */
			lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE);
			ret = lib_ring_buffer_get_next_subbuf(buf);
		}
		if (ret)
			return ret;
		iter->consumed = buf->cons_snapshot;
		iter->data_size = lib_ring_buffer_get_read_data_size(config, buf);
		iter->read_offset = iter->consumed;
		/* skip the subbuffer header */
		iter->read_offset += config->cb.subbuffer_header_size();
		iter->state = ITER_TEST_RECORD;
		goto restart;
	case ITER_TEST_RECORD:
		if (iter->read_offset - iter->consumed >= iter->data_size) {
			iter->state = ITER_PUT_SUBBUF;
		} else {
			CHAN_WARN_ON(chan, !config->cb.record_get);
			config->cb.record_get(config, chan, buf,
					      iter->read_offset,
					      &iter->header_len,
					      &iter->payload_len,
					      &iter->timestamp);
			iter->read_offset += iter->header_len;
			subbuffer_consume_record(config, &buf->backend);
			iter->state = ITER_NEXT_RECORD;
			return iter->payload_len;
		}
		goto restart;
	case ITER_NEXT_RECORD:
		iter->read_offset += iter->payload_len;
		iter->state = ITER_TEST_RECORD;
		goto restart;
	case ITER_PUT_SUBBUF:
		lib_ring_buffer_put_next_subbuf(buf);
		iter->state = ITER_GET_SUBBUF;
		goto restart;
	default:
		CHAN_WARN_ON(chan, 1);	/* Should not happen */
		return -EPERM;
	}
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_get_next_record);

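/*
 * Example usage of the iterator state machine above (a sketch, not code
 * taken from an actual caller; assumes "buf" is already opened for reading):
 *
 *	ssize_t len;
 *
 *	for (;;) {
 *		len = lib_ring_buffer_get_next_record(chan, buf);
 *		if (len == -ENODATA)
 *			break;		(buffer finalized and empty)
 *		if (len == -EAGAIN)
 *			continue;	(temporarily empty: poll or wait)
 *		(consume "len" payload bytes at buf->iter.read_offset here)
 *	}
 */
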
static int buf_is_higher(void *a, void *b)
{
	struct lib_ring_buffer *bufa = a;
	struct lib_ring_buffer *bufb = b;

	/* Consider lowest timestamps to be at the top of the heap */
	return (bufa->iter.timestamp < bufb->iter.timestamp);
}

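/*
 * Because buf_is_higher() inverts the comparison, lttng_heap_maximum()
 * returns the buffer holding the record with the *lowest* timestamp: the
 * heap effectively behaves as a min-heap keyed on iter.timestamp, which is
 * what lets channel_get_next_record() merge per-cpu streams in time order.
 */
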
static
void lib_ring_buffer_get_empty_buf_records(const struct lib_ring_buffer_config *config,
					   struct channel *chan)
{
	struct lttng_ptr_heap *heap = &chan->iter.heap;
	struct lib_ring_buffer *buf, *tmp;
	ssize_t len;

	list_for_each_entry_safe(buf, tmp, &chan->iter.empty_head,
				 iter.empty_node) {
		len = lib_ring_buffer_get_next_record(chan, buf);

		/*
		 * Deal with -EAGAIN and -ENODATA.
		 * len >= 0 means record contains data.
		 * -EBUSY should never happen, because we support only one
		 * reader.
		 */
		switch (len) {
		case -EAGAIN:
			/* Keep node in empty list */
			break;
		case -ENODATA:
			/*
			 * Buffer is finalized. Don't add to list of empty
			 * buffers, because it has no more data to provide,
			 * ever.
			 */
			list_del(&buf->iter.empty_node);
			break;
		case -EBUSY:
			CHAN_WARN_ON(chan, 1);
			break;
		default:
			/*
			 * Insert buffer into the heap, remove from empty
			 * buffer list.
			 */
			CHAN_WARN_ON(chan, len < 0);
			list_del(&buf->iter.empty_node);
			CHAN_WARN_ON(chan, lttng_heap_insert(heap, buf));
		}
	}
}

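/*
 * Invariant worth spelling out (inferred from the code above and from
 * channel_get_next_record() below): at any time, each per-cpu buffer is
 * either in the heap (it has a current record), on chan->iter.empty_head
 * (it currently has no record), or on neither (it is finalized and empty).
 */
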
static
void lib_ring_buffer_wait_for_qs(const struct lib_ring_buffer_config *config,
				 struct channel *chan)
{
	u64 timestamp_qs;
	unsigned long wait_msecs;

	/*
	 * No need to wait if no empty buffers are present.
	 */
	if (list_empty(&chan->iter.empty_head))
		return;

	timestamp_qs = config->cb.ring_buffer_clock_read(chan);
	/*
	 * We need to consider previously empty buffers.
	 * Do a get next buf record on each of them. Add them to
	 * the heap if they have data. If at least one of them
	 * doesn't have data, we need to wait for
	 * switch_timer_interval + MAX_SYSTEM_LATENCY (so we are sure the
	 * buffers have been switched either by the timer or idle entry) and
	 * check them again, adding them if they have data.
	 */
	lib_ring_buffer_get_empty_buf_records(config, chan);

	/*
	 * No need to wait if no empty buffers are present anymore.
	 */
	if (list_empty(&chan->iter.empty_head))
		goto end;

	/*
	 * We need to wait for the buffer switch timer to run. If the
	 * CPU is idle, idle entry performed the switch.
	 * TODO: we could optimize further by skipping the sleep if all
	 * empty buffers belong to idle or offline cpus.
	 */
	wait_msecs = jiffies_to_msecs(chan->switch_timer_interval);
	wait_msecs += MAX_SYSTEM_LATENCY;
	msleep(wait_msecs);
	lib_ring_buffer_get_empty_buf_records(config, chan);
	/*
	 * Any buffer still in the empty list here cannot possibly
	 * contain an event with a timestamp prior to "timestamp_qs".
	 * The new quiescent state timestamp is the one we grabbed
	 * before waiting for buffer data. It is therefore safe to
	 * ignore empty buffers up to last_qs timestamp for fusion
	 * merge.
	 */
end:
	chan->iter.last_qs = timestamp_qs;
}

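/*
 * Worst-case wait illustration (not a hard bound, just the arithmetic of
 * the code above): with a switch_timer_interval worth 200 ms, the reader
 * sleeps jiffies_to_msecs(switch_timer_interval) + MAX_SYSTEM_LATENCY
 * = 200 + 250 = 450 ms before declaring the still-empty buffers quiescent.
 */
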
/**
 * channel_get_next_record - Get the next record in a channel.
 * @chan: channel
 * @ret_buf: the buffer in which the event is located (output)
 *
 * Returns the size of the new current event, -EAGAIN if all buffers are empty,
 * -ENODATA if all buffers are empty and finalized. The channel must already be
 * opened for reading.
 */
ssize_t channel_get_next_record(struct channel *chan,
				struct lib_ring_buffer **ret_buf)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	struct lib_ring_buffer *buf;
	struct lttng_ptr_heap *heap;
	ssize_t len;

	if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
		*ret_buf = channel_get_ring_buffer(config, chan, 0);
		return lib_ring_buffer_get_next_record(chan, *ret_buf);
	}

	heap = &chan->iter.heap;

	/*
	 * get next record for topmost buffer.
	 */
	buf = lttng_heap_maximum(heap);
	if (buf) {
		len = lib_ring_buffer_get_next_record(chan, buf);
		/*
		 * Deal with -EAGAIN and -ENODATA.
		 * len >= 0 means record contains data.
		 */
		switch (len) {
		case -EAGAIN:
			buf->iter.timestamp = 0;
			list_add(&buf->iter.empty_node, &chan->iter.empty_head);
			/* Remove topmost buffer from the heap */
			CHAN_WARN_ON(chan, lttng_heap_remove(heap) != buf);
			break;
		case -ENODATA:
			/*
			 * Buffer is finalized. Remove buffer from heap and
			 * don't add to list of empty buffers, because it has
			 * no more data to provide, ever.
			 */
			CHAN_WARN_ON(chan, lttng_heap_remove(heap) != buf);
			break;
		case -EBUSY:
			CHAN_WARN_ON(chan, 1);
			break;
		default:
			/*
			 * Reinsert buffer into the heap. Note that heap can be
			 * partially empty, so we need to use
			 * lttng_heap_replace_max().
			 */
			CHAN_WARN_ON(chan, len < 0);
			CHAN_WARN_ON(chan, lttng_heap_replace_max(heap, buf) != buf);
			break;
		}
	}

	buf = lttng_heap_maximum(heap);
	if (!buf || buf->iter.timestamp > chan->iter.last_qs) {
		/*
		 * Deal with buffers previously showing no data.
		 * Add buffers containing data to the heap, update
		 * last_qs.
		 */
		lib_ring_buffer_wait_for_qs(config, chan);
	}

	*ret_buf = buf = lttng_heap_maximum(heap);
	if (buf) {
		/*
		 * If this warning triggers, you probably need to check your
		 * system interrupt latency. Typical causes: too much printk()
		 * output going to a serial console with interrupts off.
		 * Allow for MAX_CLOCK_DELTA ns timestamp delta going backward.
		 * Observed on SMP KVM setups with trace_clock().
		 */
		if (chan->iter.last_timestamp
		    > (buf->iter.timestamp + MAX_CLOCK_DELTA)) {
			printk(KERN_WARNING "ring_buffer: timestamps going "
			       "backward. Last time %llu ns, cpu %d, "
			       "current time %llu ns, cpu %d, "
			       "delta %llu ns.\n",
			       chan->iter.last_timestamp, chan->iter.last_cpu,
			       buf->iter.timestamp, buf->backend.cpu,
			       chan->iter.last_timestamp - buf->iter.timestamp);
			CHAN_WARN_ON(chan, 1);
		}
		chan->iter.last_timestamp = buf->iter.timestamp;
		chan->iter.last_cpu = buf->backend.cpu;
		return buf->iter.payload_len;
	} else {
		/* Heap is empty */
		if (list_empty(&chan->iter.empty_head))
			return -ENODATA;	/* All buffers finalized */
		else
			return -EAGAIN;		/* Temporarily empty */
	}
}
EXPORT_SYMBOL_GPL(channel_get_next_record);

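/*
 * Example usage (a sketch; the in-tree caller is the file_read path below):
 * merging all per-cpu streams of a channel into one time-ordered sequence:
 *
 *	struct lib_ring_buffer *buf;
 *	ssize_t len;
 *
 *	for (;;) {
 *		len = channel_get_next_record(chan, &buf);
 *		if (len == -ENODATA)
 *			break;		(all buffers finalized and empty)
 *		if (len == -EAGAIN)
 *			continue;	(temporarily empty: poll or wait)
 *		(handle "len" payload bytes from "buf" here)
 *	}
 */
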
static
void lib_ring_buffer_iterator_init(struct channel *chan, struct lib_ring_buffer *buf)
{
	if (buf->iter.allocated)
		return;

	buf->iter.allocated = 1;
	if (chan->iter.read_open && !buf->iter.read_open) {
		CHAN_WARN_ON(chan, lib_ring_buffer_open_read(buf) != 0);
		buf->iter.read_open = 1;
	}

	/* Add to list of buffers without any current record */
	if (chan->backend.config.alloc == RING_BUFFER_ALLOC_PER_CPU)
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);
}

#ifdef CONFIG_HOTPLUG_CPU
static
int channel_iterator_cpu_hotplug(struct notifier_block *nb,
				 unsigned long action,
				 void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct channel *chan = container_of(nb, struct channel,
					    hp_iter_notifier);
	struct lib_ring_buffer *buf = per_cpu_ptr(chan->backend.buf, cpu);
	const struct lib_ring_buffer_config *config = &chan->backend.config;

	if (!chan->hp_iter_enable)
		return NOTIFY_DONE;

	CHAN_WARN_ON(chan, config->alloc == RING_BUFFER_ALLOC_GLOBAL);

	switch (action) {
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		lib_ring_buffer_iterator_init(chan, buf);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif

int channel_iterator_init(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	struct lib_ring_buffer *buf;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		int cpu, ret;

		INIT_LIST_HEAD(&chan->iter.empty_head);
		ret = lttng_heap_init(&chan->iter.heap,
				      num_possible_cpus(),
				      GFP_KERNEL, buf_is_higher);
		if (ret)
			return ret;
		/*
		 * In case of non-hotplug cpu, if the ring-buffer is allocated
		 * in early initcall, it will not be notified of secondary cpus.
		 * In that case, we need to allocate for all possible cpus.
		 */
#ifdef CONFIG_HOTPLUG_CPU
		chan->hp_iter_notifier.notifier_call =
			channel_iterator_cpu_hotplug;
		chan->hp_iter_notifier.priority = 10;
		register_cpu_notifier(&chan->hp_iter_notifier);
		get_online_cpus();
		for_each_online_cpu(cpu) {
			buf = per_cpu_ptr(chan->backend.buf, cpu);
			lib_ring_buffer_iterator_init(chan, buf);
		}
		chan->hp_iter_enable = 1;
		put_online_cpus();
#else
		for_each_possible_cpu(cpu) {
			buf = per_cpu_ptr(chan->backend.buf, cpu);
			lib_ring_buffer_iterator_init(chan, buf);
		}
#endif
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		lib_ring_buffer_iterator_init(chan, buf);
	}
	return 0;
}

void channel_iterator_unregister_notifiers(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		chan->hp_iter_enable = 0;
		unregister_cpu_notifier(&chan->hp_iter_notifier);
	}
}

void channel_iterator_free(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		lttng_heap_free(&chan->iter.heap);
}

int lib_ring_buffer_iterator_open(struct lib_ring_buffer *buf)
{
	struct channel *chan = buf->backend.chan;
	const struct lib_ring_buffer_config *config = &chan->backend.config;

	CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR);
	return lib_ring_buffer_open_read(buf);
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_open);

/*
 * Note: Iterators must not be mixed with other types of outputs, because an
 * iterator can leave the buffer in "GET" state, which is not consistent with
 * other types of output (mmap, splice, raw data read).
 */
void lib_ring_buffer_iterator_release(struct lib_ring_buffer *buf)
{
	lib_ring_buffer_release_read(buf);
}
EXPORT_SYMBOL_GPL(lib_ring_buffer_iterator_release);

int channel_iterator_open(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	struct lib_ring_buffer *buf;
	int ret = 0, cpu;

	CHAN_WARN_ON(chan, config->output != RING_BUFFER_ITERATOR);

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		get_online_cpus();
		/* Allow CPU hotplug to keep track of opened reader */
		chan->iter.read_open = 1;
		for_each_channel_cpu(cpu, chan) {
			buf = channel_get_ring_buffer(config, chan, cpu);
			ret = lib_ring_buffer_iterator_open(buf);
			if (ret)
				goto error;
			buf->iter.read_open = 1;
		}
		put_online_cpus();
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		ret = lib_ring_buffer_iterator_open(buf);
	}
	return ret;
error:
	/* Error should always happen on CPU 0, hence no close is required. */
	CHAN_WARN_ON(chan, cpu != 0);
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL_GPL(channel_iterator_open);

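/*
 * Note on ordering in channel_iterator_open(): chan->iter.read_open is set
 * before the per-cpu buffers are opened, so a CPU brought online during or
 * after this call gets its buffer read-opened by
 * lib_ring_buffer_iterator_init() from the hotplug notifier.
 */
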
void channel_iterator_release(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	struct lib_ring_buffer *buf;
	int cpu;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
		get_online_cpus();
		for_each_channel_cpu(cpu, chan) {
			buf = channel_get_ring_buffer(config, chan, cpu);
			if (buf->iter.read_open) {
				lib_ring_buffer_iterator_release(buf);
				buf->iter.read_open = 0;
			}
		}
		chan->iter.read_open = 0;
		put_online_cpus();
	} else {
		buf = channel_get_ring_buffer(config, chan, 0);
		lib_ring_buffer_iterator_release(buf);
	}
}
EXPORT_SYMBOL_GPL(channel_iterator_release);

void lib_ring_buffer_iterator_reset(struct lib_ring_buffer *buf)
{
	struct channel *chan = buf->backend.chan;

	if (buf->iter.state != ITER_GET_SUBBUF)
		lib_ring_buffer_put_next_subbuf(buf);
	buf->iter.state = ITER_GET_SUBBUF;
	/* Remove from heap (if present). */
	if (lttng_heap_cherrypick(&chan->iter.heap, buf))
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);
	buf->iter.timestamp = 0;
	buf->iter.header_len = 0;
	buf->iter.payload_len = 0;
	buf->iter.consumed = 0;
	buf->iter.read_offset = 0;
	buf->iter.data_size = 0;
	/* Don't reset allocated and read_open */
}

void channel_iterator_reset(struct channel *chan)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	struct lib_ring_buffer *buf;
	int cpu;

	/* Empty heap, put into empty_head */
	while ((buf = lttng_heap_remove(&chan->iter.heap)) != NULL)
		list_add(&buf->iter.empty_node, &chan->iter.empty_head);

	for_each_channel_cpu(cpu, chan) {
		buf = channel_get_ring_buffer(config, chan, cpu);
		lib_ring_buffer_iterator_reset(buf);
	}
	/* Don't reset read_open */
	chan->iter.last_qs = 0;
	chan->iter.last_timestamp = 0;
	chan->iter.last_cpu = 0;
	chan->iter.len_left = 0;
}

/*
 * Ring buffer payload extraction read() implementation.
 */
static
ssize_t channel_ring_buffer_file_read(struct file *filp,
				      char __user *user_buf,
				      size_t count, loff_t *ppos,
				      struct channel *chan,
				      struct lib_ring_buffer *buf,
				      int fusionmerge)
{
	const struct lib_ring_buffer_config *config = &chan->backend.config;
	size_t read_count = 0, read_offset;
	ssize_t len;

	might_sleep();
	if (!access_ok(VERIFY_WRITE, user_buf, count))
		return -EFAULT;

	/* Finish copy of previous record */
	if (*ppos != 0 && read_count < count) {
		len = chan->iter.len_left;
		read_offset = *ppos;
		if (config->alloc == RING_BUFFER_ALLOC_PER_CPU
		    && fusionmerge)
			buf = lttng_heap_maximum(&chan->iter.heap);
		CHAN_WARN_ON(chan, !buf);
		goto skip_get_next;
	}

	while (read_count < count) {
		size_t copy_len, space_left;

		if (fusionmerge)
			len = channel_get_next_record(chan, &buf);
		else
			len = lib_ring_buffer_get_next_record(chan, buf);
skip_get_next:
		if (len < 0) {
			/*
			 * Check if buffer is finalized (end of file).
			 */
			if (len == -ENODATA) {
				/* A 0 read_count will tell about end of file */
				goto nodata;
			}
			if (filp->f_flags & O_NONBLOCK) {
				if (!read_count)
					read_count = -EAGAIN;
				goto nodata;
			} else {
				int error;

				/*
				 * No data available at the moment, return what
				 * we got.
				 */
				if (read_count)
					goto nodata;

				/*
				 * Wait for returned len to be >= 0 or -ENODATA.
				 */
				if (fusionmerge)
					error = wait_event_interruptible(
						chan->read_wait,
						((len = channel_get_next_record(chan,
								&buf)), len != -EAGAIN));
				else
					error = wait_event_interruptible(
						buf->read_wait,
						((len = lib_ring_buffer_get_next_record(
								chan, buf)), len != -EAGAIN));
				CHAN_WARN_ON(chan, len == -EBUSY);
				if (error) {
					read_count = error;
					goto nodata;
				}
				CHAN_WARN_ON(chan, len < 0 && len != -ENODATA);
			}
		}
		read_offset = buf->iter.read_offset;
		space_left = count - read_count;
		if (len <= space_left) {
			copy_len = len;
			chan->iter.len_left = 0;
			*ppos = 0;
		} else {
			copy_len = space_left;
			chan->iter.len_left = len - copy_len;
			*ppos = read_offset + copy_len;
		}
		if (__lib_ring_buffer_copy_to_user(&buf->backend, read_offset,
						   &user_buf[read_count],
						   copy_len)) {
			/*
			 * Leave the len_left and ppos values at their current
			 * state, as we currently have a valid event to read.
			 */
			return -EFAULT;
		}
		read_count += copy_len;
	}
	return read_count;

nodata:
	*ppos = 0;
	chan->iter.len_left = 0;
	return read_count;
}

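/*
 * Note on *ppos semantics in channel_ring_buffer_file_read(): unlike a
 * regular file offset, a non-zero *ppos records the resume position inside
 * the record being copied out (read_offset + bytes already copied), and it
 * is reset to 0 once a record has been fully delivered to user-space.
 */
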
/**
 * lib_ring_buffer_file_read - Read buffer record payload.
 * @filp: file structure pointer.
 * @user_buf: user buffer to read data into.
 * @count: number of bytes to read.
 * @ppos: file read position.
 *
 * Returns a negative value on error, or the number of bytes read on success.
 * ppos is used to save the position _within the current record_ between calls
 * to read().
 */
static
ssize_t lib_ring_buffer_file_read(struct file *filp,
				  char __user *user_buf,
				  size_t count,
				  loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct lib_ring_buffer *buf = inode->i_private;
	struct channel *chan = buf->backend.chan;

	return channel_ring_buffer_file_read(filp, user_buf, count, ppos,
					     chan, buf, 0);
}

/**
 * channel_file_read - Read channel record payload.
 * @filp: file structure pointer.
 * @user_buf: user buffer to read data into.
 * @count: number of bytes to read.
 * @ppos: file read position.
 *
 * Returns a negative value on error, or the number of bytes read on success.
 * ppos is used to save the position _within the current record_ between calls
 * to read().
 */
static
ssize_t channel_file_read(struct file *filp,
			  char __user *user_buf,
			  size_t count,
			  loff_t *ppos)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct channel *chan = inode->i_private;
	const struct lib_ring_buffer_config *config = &chan->backend.config;

	if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
		return channel_ring_buffer_file_read(filp, user_buf, count,
						     ppos, chan, NULL, 1);
	else {
		struct lib_ring_buffer *buf =
			channel_get_ring_buffer(config, chan, 0);

		return channel_ring_buffer_file_read(filp, user_buf, count,
						     ppos, chan, buf, 0);
	}
}

static
int lib_ring_buffer_file_open(struct inode *inode, struct file *file)
{
	struct lib_ring_buffer *buf = inode->i_private;
	int ret;

	ret = lib_ring_buffer_iterator_open(buf);
	if (ret)
		return ret;

	file->private_data = buf;
	ret = nonseekable_open(inode, file);
	if (ret)
		goto release_iter;
	return 0;

release_iter:
	lib_ring_buffer_iterator_release(buf);
	return ret;
}

static
int lib_ring_buffer_file_release(struct inode *inode, struct file *file)
{
	struct lib_ring_buffer *buf = inode->i_private;

	lib_ring_buffer_iterator_release(buf);
	return 0;
}

static
int channel_file_open(struct inode *inode, struct file *file)
{
	struct channel *chan = inode->i_private;
	int ret;

	ret = channel_iterator_open(chan);
	if (ret)
		return ret;

	file->private_data = chan;
	ret = nonseekable_open(inode, file);
	if (ret)
		goto release_iter;
	return 0;

release_iter:
	channel_iterator_release(chan);
	return ret;
}

static
int channel_file_release(struct inode *inode, struct file *file)
{
	struct channel *chan = inode->i_private;

	channel_iterator_release(chan);
	return 0;
}

const struct file_operations channel_payload_file_operations = {
	.owner = THIS_MODULE,
	.open = channel_file_open,
	.release = channel_file_release,
	.read = channel_file_read,
	.llseek = vfs_lib_ring_buffer_no_llseek,
};
EXPORT_SYMBOL_GPL(channel_payload_file_operations);

const struct file_operations lib_ring_buffer_payload_file_operations = {
	.owner = THIS_MODULE,
	.open = lib_ring_buffer_file_open,
	.release = lib_ring_buffer_file_release,
	.read = lib_ring_buffer_file_read,
	.llseek = vfs_lib_ring_buffer_no_llseek,
};
EXPORT_SYMBOL_GPL(lib_ring_buffer_payload_file_operations);
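
/*
 * Hookup sketch (illustrative only; the file names and the "parent" dentry
 * are hypothetical, not part of this file): these file_operations expect
 * inode->i_private to point to the channel or buffer, which is what
 * debugfs_create_file() arranges through its "data" argument:
 *
 *	debugfs_create_file("channel-payload", S_IRUSR, parent, chan,
 *			    &channel_payload_file_operations);
 *	debugfs_create_file("buffer-payload", S_IRUSR, parent, buf,
 *			    &lib_ring_buffer_payload_file_operations);
 */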