ring-buffer: Have rb_time_cmpxchg() set the msb counter too
[platform/kernel/linux-starfive.git] kernel/trace/ring_buffer.c
index 78502d4..af08a1a 100644
@@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
        local_set(&bpage->commit, 0);
 }
 
+static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
+{
+       return local_read(&bpage->page->commit);
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
        free_page((unsigned long)bpage->page);
@@ -639,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 
        *cnt = rb_time_cnt(top);
 
-       /* If top and bottom counts don't match, this interrupted a write */
-       if (*cnt != rb_time_cnt(bottom))
+       /* If top and msb counts don't match, this interrupted a write */
+       if (*cnt != rb_time_cnt(msb))
                return false;
 
        /* The shift to msb will lose its cnt bits */
@@ -701,6 +706,9 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
        unsigned long cnt2, top2, bottom2, msb2;
        u64 val;
 
+       /* Any interruptions in this function should cause a failure */
+       cnt = local_read(&t->cnt);
+
        /* The cmpxchg always fails if it interrupted an update */
         if (!__rb_time_read(t, &val, &cnt2))
                 return false;
@@ -708,17 +716,18 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
         if (val != expect)
                 return false;
 
-        cnt = local_read(&t->cnt);
         if ((cnt & 3) != cnt2)
                 return false;
 
         cnt2 = cnt + 1;
 
         rb_time_split(val, &top, &bottom, &msb);
+        msb = rb_time_val_cnt(msb, cnt);
         top = rb_time_val_cnt(top, cnt);
         bottom = rb_time_val_cnt(bottom, cnt);
 
         rb_time_split(set, &top2, &bottom2, &msb2);
+        msb2 = rb_time_val_cnt(msb2, cnt);
         top2 = rb_time_val_cnt(top2, cnt2);
         bottom2 = rb_time_val_cnt(bottom2, cnt2);
 
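A note for readers not familiar with the 32-bit rb_time_t scheme: the 64-bit timestamp is stored as three words (bottom, top, msb), each tagged with a 2-bit update counter, and a read is only trusted when the counters agree. The stand-alone sketch below models that layout under assumed constants (a 30-bit split with the counter in bits 30-31; the names are local to the sketch, not the kernel's). It illustrates why the cmpxchg path above has to stamp the msb word as well: the check changed in __rb_time_read() compares the top counter against the msb counter, which can only catch a torn update if msb actually carries one.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define SHIFT     30                    /* assumed, mirrors RB_TIME_SHIFT */
#define VAL_MASK  ((1UL << SHIFT) - 1)
#define MSB_SHIFT 60                    /* assumed, mirrors RB_TIME_MSB_SHIFT */

/* Tag a 30-bit value word with a 2-bit sequence counter in bits 30-31 */
static unsigned long val_cnt(unsigned long val, unsigned long cnt)
{
        return (val & VAL_MASK) | ((cnt & 3) << SHIFT);
}

static unsigned long word_cnt(unsigned long word)
{
        return (word >> SHIFT) & 3;
}

/* Split a 64-bit timestamp into bottom/top/msb value words */
static void split(uint64_t ts, unsigned long *top, unsigned long *bottom,
                  unsigned long *msb)
{
        *top    = (ts >> SHIFT) & VAL_MASK;
        *bottom = ts & VAL_MASK;
        *msb    = ts >> MSB_SHIFT;
}

/* A read is only valid when every word was stamped with the same counter */
static bool counters_match(unsigned long top, unsigned long bottom,
                           unsigned long msb)
{
        return word_cnt(top) == word_cnt(bottom) &&
               word_cnt(top) == word_cnt(msb);
}

int main(void)
{
        unsigned long top, bottom, msb;

        split(1234567890123ULL, &top, &bottom, &msb);

        /* Writer stamps all three words with counter 2 */
        top    = val_cnt(top, 2);
        bottom = val_cnt(bottom, 2);
        msb    = val_cnt(msb, 2);
        printf("consistent read: %d\n", counters_match(top, bottom, msb));

        /* A word left with a stale counter makes the read fail, as intended */
        msb = val_cnt(msb, 1);
        printf("torn update detected: %d\n", !counters_match(top, bottom, msb));
        return 0;
}
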
@@ -1132,6 +1141,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
        if (full) {
                poll_wait(filp, &work->full_waiters, poll_table);
                work->full_waiters_pending = true;
+               if (!cpu_buffer->shortest_full ||
+                   cpu_buffer->shortest_full > full)
+                       cpu_buffer->shortest_full = full;
        } else {
                poll_wait(filp, &work->waiters, poll_table);
                work->waiters_pending = true;
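
The added lines mirror what the blocking-wait path does: record this poller's requested fill level in shortest_full, so the writer-side wakeup check (which compares the current fill level against shortest_full) knows that a waiter with this threshold exists. A reduced model with hypothetical names:

#include <stdbool.h>

/*
 * Reduced model (hypothetical names): each waiter asks to be woken once the
 * buffer is at least 'full' percent filled. The writer only tracks the
 * smallest requested percentage; if a poller never records its threshold
 * here, the writer has no reason to wake the full_waiters queue for it.
 */
struct wait_book {
        int shortest_full;              /* 0: no full-waiter registered */
        bool full_waiters_pending;
};

static void poll_register_full_waiter(struct wait_book *book, int full)
{
        book->full_waiters_pending = true;
        if (!book->shortest_full || book->shortest_full > full)
                book->shortest_full = full;
}

/* Writer side: wake the full waiters once the smallest threshold is met */
static bool full_waiters_should_wake(struct wait_book *book, int percent_filled)
{
        return book->shortest_full && percent_filled >= book->shortest_full;
}
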
@@ -1779,6 +1791,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
                free_buffer_page(bpage);
        }
 
+       free_page((unsigned long)cpu_buffer->free_page);
+
        kfree(cpu_buffer);
 }
 
@@ -2003,7 +2017,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
                         * Increment overrun to account for the lost events.
                         */
                        local_add(page_entries, &cpu_buffer->overrun);
-                       local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+                       local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
                        local_inc(&cpu_buffer->pages_lost);
                }
 
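This hunk, and the matching ones in rb_handle_head_page() and ring_buffer_read_page() further down, adjust the byte counters by the page's commit count rather than the fixed BUF_PAGE_SIZE (the rb_reset_tail() hunks adjust the padding-byte accounting for the same statistic). A small model of the accounting being corrected (hypothetical struct; the kernel uses local_t counters):

/*
 * Hypothetical model of the per-cpu byte accounting touched by this diff:
 * entries_bytes counts bytes committed by writers, read_bytes counts bytes
 * handed to readers, and the exported statistic is their difference.
 */
struct byte_stats {
        unsigned long entries_bytes;    /* bytes written and committed */
        unsigned long read_bytes;       /* bytes already consumed */
};

/* Roughly what ring_buffer_bytes_cpu() reports: bytes not yet consumed */
static unsigned long bytes_unconsumed(const struct byte_stats *s)
{
        return s->entries_bytes - s->read_bytes;
}

/*
 * A sub-buffer is rarely committed to its last byte. Only the bytes that
 * were actually committed to the page were ever added to entries_bytes,
 * so when the page is lost (or consumed) only those bytes may be removed;
 * subtracting a full BUF_PAGE_SIZE skews the statistic low.
 */
static void page_lost(struct byte_stats *s, unsigned long page_commit)
{
        s->entries_bytes -= page_commit;        /* not BUF_PAGE_SIZE */
}
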
@@ -2198,6 +2212,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
                                err = -ENOMEM;
                                goto out_err;
                        }
+
+                       cond_resched();
                }
 
                cpus_read_lock();
@@ -2365,11 +2381,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
                               cpu_buffer->reader_page->read);
 }
 
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
-{
-       return local_read(&bpage->page->commit);
-}
-
 static struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
@@ -2388,6 +2399,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
         */
        commit = rb_page_commit(iter_head_page);
        smp_rmb();
+
+       /* An event needs to be at least 8 bytes in size */
+       if (iter->head > commit - 8)
+               goto reset;
+
        event = __rb_page_index(iter_head_page, iter->head);
        length = rb_event_length(event);
 
@@ -2397,7 +2413,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
         */
        barrier();
 
-       if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+       if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
                /* Writer corrupted the read? */
                goto reset;
 
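The iterator reads from a page that an active writer may still be appending to, so both the head offset and the decoded event length have to be range-checked before use. A condensed model of the checks in this hunk (8 bytes being the minimum event size the new comment refers to):

#include <stdbool.h>

/*
 * Condensed model of the reader-side validation in rb_iter_head_event():
 * if either check fails the iterator resets and re-reads the page, because
 * a racing writer may have moved the commit mark or handed us a bogus length.
 */
static bool iter_event_looks_sane(unsigned int head, unsigned int commit,
                                  unsigned int length, unsigned int page_size)
{
        /* There must be room for at least a minimum-size (8 byte) event */
        if (head > commit - 8)
                return false;

        /* The decoded length must stay inside the committed region and page */
        if (head + length > commit || length > page_size)
                return false;

        return true;
}
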
@@ -2510,7 +2526,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
                 * the counters.
                 */
                local_add(entries, &cpu_buffer->overrun);
-               local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+               local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
                local_inc(&cpu_buffer->pages_lost);
 
                /*
@@ -2653,9 +2669,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
        event = __rb_page_index(tail_page, tail);
 
-       /* account for padding bytes */
-       local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
-
        /*
         * Save the original length to the meta data.
         * This will be used by the reader to add lost event
@@ -2669,7 +2682,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
         * write counter enough to allow another writer to slip
         * in on this page.
         * We put in a discarded commit instead, to make sure
-        * that this space is not used again.
+        * that this space is not used again, and this space will
+        * not be accounted for in 'entries_bytes'.
         *
         * If we are less than the minimum size, we don't need to
         * worry about it.
@@ -2694,6 +2708,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
        /* time delta must be non zero */
        event->time_delta = 1;
 
+       /* account for padding bytes */
+       local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+
        /* Make sure the padding is visible before the tail_page->write update */
        smp_wmb();
 
@@ -3019,23 +3036,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
                        local_read(&bpage->write) & ~RB_WRITE_MASK;
                unsigned long event_length = rb_event_length(event);
 
+               /*
+                * Force the before_stamp to be different than the write_stamp
+                * to make sure that the next event adds an absolute
+                * value and does not rely on the saved write stamp, which
+                * is now going to be bogus.
+                */
+               rb_time_set(&cpu_buffer->before_stamp, 0);
+
                /* Something came in, can't discard */
                if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
                                       write_stamp, write_stamp - delta))
                        return false;
 
                /*
-                * It's possible that the event time delta is zero
-                * (has the same time stamp as the previous event)
-                * in which case write_stamp and before_stamp could
-                * be the same. In such a case, force before_stamp
-                * to be different than write_stamp. It doesn't
-                * matter what it is, as long as its different.
-                */
-               if (!delta)
-                       rb_time_set(&cpu_buffer->before_stamp, 0);
-
-               /*
                 * If an event were to come in now, it would see that the
                 * write_stamp and the before_stamp are different, and assume
                 * that this event just added itself before updating
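
With the reordering above, before_stamp is poisoned unconditionally and before the write_stamp cmpxchg is even attempted, instead of only afterwards when delta happened to be zero. A rough, illustrative-only model of the ordering (plain variables stand in for the rb_time_t accessors; the helper names are not the kernel's):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t u64;

struct stamps {
        u64 before_stamp;
        u64 write_stamp;
};

/* Stand-in for the interrupt-safe rb_time_cmpxchg() */
static bool try_cmpxchg_u64(u64 *p, u64 expect, u64 set)
{
        if (*p != expect)
                return false;
        *p = set;
        return true;
}

static bool try_to_discard(struct stamps *s, u64 write_stamp, u64 delta)
{
        /* 1. Poison before_stamp first: whatever interrupts from here on
         *    sees before_stamp != write_stamp and will not pair a delta
         *    with the soon-to-be-stale write_stamp. */
        s->before_stamp = 0;

        /* 2. Only then try to roll write_stamp back over the discarded
         *    event; if something slipped in, give up on the discard. */
        return try_cmpxchg_u64(&s->write_stamp, write_stamp,
                               write_stamp - delta);
}
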
@@ -3571,7 +3585,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 * absolute timestamp.
                 * Don't bother if this is the start of a new page (w == 0).
                 */
-               if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+               if (!w) {
+                       /* Use the sub-buffer timestamp */
+                       info->delta = 0;
+               } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
                        info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
                        info->length += RB_LEN_TIME_EXTEND;
                } else {
@@ -3594,26 +3611,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
        /* See if we shot pass the end of this buffer page */
        if (unlikely(write > BUF_PAGE_SIZE)) {
-               /* before and after may now different, fix it up*/
-               b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-               a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-               if (a_ok && b_ok && info->before != info->after)
-                       (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
-                                             info->before, info->after);
-               if (a_ok && b_ok)
-                       check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+               check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
                return rb_move_tail(cpu_buffer, tail, info);
        }
 
        if (likely(tail == w)) {
-               u64 save_before;
-               bool s_ok;
-
                /* Nothing interrupted us between A and C */
  /*D*/         rb_time_set(&cpu_buffer->write_stamp, info->ts);
-               barrier();
- /*E*/         s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
-               RB_WARN_ON(cpu_buffer, !s_ok);
+               /*
+                * If something came in between C and D, the write stamp
+                * may now not be in sync. But that's fine as the before_stamp
+                * will be different and then the next event will just be forced
+                * to use an absolute timestamp.
+                */
                if (likely(!(info->add_timestamp &
                             (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
                        /* This did not interrupt any time update */
@@ -3621,24 +3631,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                else
                        /* Just use full timestamp for interrupting event */
                        info->delta = info->ts;
-               barrier();
                check_buffer(cpu_buffer, info, tail);
-               if (unlikely(info->ts != save_before)) {
-                       /* SLOW PATH - Interrupted between C and E */
-
-                       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-                       RB_WARN_ON(cpu_buffer, !a_ok);
-
-                       /* Write stamp must only go forward */
-                       if (save_before > info->after) {
-                               /*
-                                * We do not care about the result, only that
-                                * it gets updated atomically.
-                                */
-                               (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
-                                                     info->after, save_before);
-                       }
-               }
        } else {
                u64 ts;
                /* SLOW PATH - Interrupted between A and C */
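
The deleted slow path tried to re-synchronize write_stamp when an interrupt landed between the labelled steps; with this change a stale write_stamp is simply tolerated, because a mismatch against before_stamp already forces the next event onto an absolute timestamp. A reduced model of that decision (hypothetical helper, not the kernel's code):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t u64;

/*
 * Reduced model of why the removed repair code is unnecessary: when
 * before_stamp and write_stamp disagree, the event simply carries the full
 * timestamp, so a write_stamp left behind by an interrupting event never
 * has to be patched back up.
 */
static u64 event_delta(u64 ts, u64 before_stamp, u64 write_stamp,
                       bool *absolute)
{
        if (before_stamp != write_stamp) {
                /* Someone interrupted the update: don't trust write_stamp */
                *absolute = true;
                return ts;              /* full timestamp stored in the event */
        }
        *absolute = false;
        return ts - write_stamp;        /* normal delta from the last event */
}
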
@@ -3729,6 +3722,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
        if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
                add_ts_default = RB_ADD_STAMP_ABSOLUTE;
                info.length += RB_LEN_TIME_EXTEND;
+               if (info.length > BUF_MAX_DATA_SIZE)
+                       goto out_fail;
        } else {
                add_ts_default = RB_ADD_STAMP_NONE;
        }
@@ -4208,7 +4203,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
 EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
 
 /**
- * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
  * @buffer: The ring buffer
  * @cpu: The per CPU buffer to read from.
  */
@@ -4716,6 +4711,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
        length = rb_event_length(event);
        cpu_buffer->reader_page->read += length;
+       cpu_buffer->read_bytes += length;
 }
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -5109,7 +5105,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
        if (!iter)
                return NULL;
 
-       iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+       /* Holds the entire event: data and meta data */
+       iter->event = kmalloc(BUF_PAGE_SIZE, flags);
        if (!iter->event) {
                kfree(iter);
                return NULL;
@@ -5809,7 +5806,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
        } else {
                /* update the entry counter */
                cpu_buffer->read += rb_page_entries(reader);
-               cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+               cpu_buffer->read_bytes += rb_page_commit(reader);
 
                /* swap the pages */
                rb_init_page(bpage);