From fcc742eaad7cbcbbb2a96edc8f1d22adbaa804cb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 May 2015 17:13:14 -0400
Subject: ring-buffer: Add event descriptor to simplify passing data

Add rb_event_info descriptor to pass event info to functions a bit
easier than using a bunch of parameters. This will also allow for
changing the code around a bit to find better fast paths.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 91 ++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 39 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6260717c18e3..ba8f25ffcf6f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -399,6 +399,17 @@ struct rb_irq_work {
 	bool				wakeup_full;
 };
 
+/*
+ * Structure to hold event state and handle nested events.
+ */
+struct rb_event_info {
+	u64			ts;
+	u64			delta;
+	unsigned long		length;
+	struct buffer_page	*tail_page;
+	int			add_timestamp;
+};
+
 /*
  * Used for which event context the event is in.
  *  NMI     = 0
@@ -2000,9 +2011,12 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
  */
 static void
 rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
-		struct ring_buffer_event *event, unsigned length,
-		int add_timestamp, u64 delta)
+		struct ring_buffer_event *event,
+		struct rb_event_info *info)
 {
+	unsigned length = info->length;
+	u64 delta = info->delta;
+
 	/* Only a commit updates the timestamp */
 	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
 		delta = 0;
@@ -2011,7 +2025,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If we need to add a timestamp, then we
 	 * add it to the start of the resevered space.
 	 */
-	if (unlikely(add_timestamp)) {
+	if (unlikely(info->add_timestamp)) {
 		event = rb_add_time_stamp(event, delta);
 		length -= RB_LEN_TIME_EXTEND;
 		delta = 0;
@@ -2203,10 +2217,11 @@ static unsigned rb_calculate_event_length(unsigned length)
 
 static inline void
 rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
-	      struct buffer_page *tail_page,
-	      unsigned long tail, unsigned long length)
+	      unsigned long tail, struct rb_event_info *info)
 {
+	struct buffer_page *tail_page = info->tail_page;
 	struct ring_buffer_event *event;
+	unsigned long length = info->length;
 
 	/*
 	 * Only the event that crossed the page boundary
@@ -2276,13 +2291,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
  */
 static noinline struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
-	     unsigned long length, unsigned long tail,
-	     struct buffer_page *tail_page, u64 ts)
+	     unsigned long tail, struct rb_event_info *info)
 {
+	struct buffer_page *tail_page = info->tail_page;
 	struct buffer_page *commit_page = cpu_buffer->commit_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct buffer_page *next_page;
 	int ret;
+	u64 ts;
 
 	next_page = tail_page;
 
@@ -2368,25 +2384,24 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
  out_again:
 
-	rb_reset_tail(cpu_buffer, tail_page, tail, length);
+	rb_reset_tail(cpu_buffer, tail, info);
 
 	/* fail and let the caller try again */
 	return ERR_PTR(-EAGAIN);
 
  out_reset:
 	/* reset write */
-	rb_reset_tail(cpu_buffer, tail_page, tail, length);
+	rb_reset_tail(cpu_buffer, tail, info);
 
 	return NULL;
 }
 
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned long length, u64 ts,
-		  u64 delta, int add_timestamp)
+		  struct rb_event_info *info)
 {
-	struct buffer_page *tail_page;
 	struct ring_buffer_event *event;
+	struct buffer_page *tail_page;
 	unsigned long tail, write;
 
 	/*
@@ -2394,33 +2409,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * hold in the time field of the event, then we append a
 	 * TIME EXTEND event ahead of the data event.
 	 */
-	if (unlikely(add_timestamp))
-		length += RB_LEN_TIME_EXTEND;
+	if (unlikely(info->add_timestamp))
+		info->length += RB_LEN_TIME_EXTEND;
 
-	tail_page = cpu_buffer->tail_page;
-	write = local_add_return(length, &tail_page->write);
+	tail_page = info->tail_page = cpu_buffer->tail_page;
+	write = local_add_return(info->length, &tail_page->write);
 
 	/* set write to only the index of the write */
 	write &= RB_WRITE_MASK;
-	tail = write - length;
+	tail = write - info->length;
 
 	/*
 	 * If this is the first commit on the page, then it has the same
 	 * timestamp as the page itself.
 	 */
 	if (!tail)
-		delta = 0;
+		info->delta = 0;
 
 	/* See if we shot pass the end of this buffer page */
 	if (unlikely(write > BUF_PAGE_SIZE))
-		return rb_move_tail(cpu_buffer, length, tail,
-				    tail_page, ts);
+		return rb_move_tail(cpu_buffer, tail, info);
 
 	/* We reserved something on the buffer */
 
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
-	rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+	rb_update_event(cpu_buffer, event, info);
 
 	local_inc(&tail_page->entries);
 
@@ -2429,10 +2443,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * its timestamp.
 	 */
 	if (!tail)
-		tail_page->page->time_stamp = ts;
+		tail_page->page->time_stamp = info->ts;
 
 	/* account for these added bytes */
-	local_add(length, &cpu_buffer->entries_bytes);
+	local_add(info->length, &cpu_buffer->entries_bytes);
 
 	return event;
 }
@@ -2521,9 +2535,8 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		      unsigned long length)
 {
 	struct ring_buffer_event *event;
-	u64 ts, delta;
+	struct rb_event_info info;
 	int nr_loops = 0;
-	int add_timestamp;
 	u64 diff;
 
 	rb_start_commit(cpu_buffer);
@@ -2543,10 +2556,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	}
 #endif
 
-	length = rb_calculate_event_length(length);
+	info.length = rb_calculate_event_length(length);
  again:
-	add_timestamp = 0;
-	delta = 0;
+	info.add_timestamp = 0;
+	info.delta = 0;
 
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
@@ -2560,35 +2573,35 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		goto out_fail;
 
-	ts = rb_time_stamp(cpu_buffer->buffer);
-	diff = ts - cpu_buffer->write_stamp;
+	info.ts = rb_time_stamp(cpu_buffer->buffer);
+	diff = info.ts - cpu_buffer->write_stamp;
 
 	/* make sure this diff is calculated here */
 	barrier();
 
 	/* Did the write stamp get updated already? */
-	if (likely(ts >= cpu_buffer->write_stamp)) {
-		delta = diff;
-		if (unlikely(test_time_stamp(delta))) {
+	if (likely(info.ts >= cpu_buffer->write_stamp)) {
+		info.delta = diff;
+		if (unlikely(test_time_stamp(info.delta))) {
 			int local_clock_stable = 1;
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 			local_clock_stable = sched_clock_stable();
 #endif
-			WARN_ONCE(delta > (1ULL << 59),
+			WARN_ONCE(info.delta > (1ULL << 59),
 				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-				  (unsigned long long)delta,
-				  (unsigned long long)ts,
+				  (unsigned long long)info.delta,
+				  (unsigned long long)info.ts,
 				  (unsigned long long)cpu_buffer->write_stamp,
 				  local_clock_stable ? "" :
 				  "If you just came from a suspend/resume,\n"
 				  "please switch to the trace global clock:\n"
 				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
-			add_timestamp = 1;
+			info.add_timestamp = 1;
 		}
 	}
 
-	event = __rb_reserve_next(cpu_buffer, length, ts,
-				  delta, add_timestamp);
+	event = __rb_reserve_next(cpu_buffer, &info);
+
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
-- 
cgit v1.2.3


From 9826b2733a4399149072058a11f611357479229d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 May 2015 17:36:45 -0400
Subject: ring-buffer: Move the adding of the extended timestamp out of line

Requiring a extended time stamp is an uncommon occurrence, and it is
best to do it out of line when needed.

Add a noinline function that handles the extended timestamp and
have it called with an unlikely to completely move it out of the
fast path.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ba8f25ffcf6f..a78d4ee4bc58 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2396,6 +2396,29 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+	return true;
+}
+#endif
+
+static noinline void
+rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+		    struct rb_event_info *info)
+{
+	WARN_ONCE(info->delta > (1ULL << 59),
+		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+		  (unsigned long long)info->delta,
+		  (unsigned long long)info->ts,
+		  (unsigned long long)cpu_buffer->write_stamp,
+		  sched_clock_stable() ? "" :
+		  "If you just came from a suspend/resume,\n"
+		  "please switch to the trace global clock:\n"
+		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
+	info->add_timestamp = 1;
+}
+
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  struct rb_event_info *info)
@@ -2582,22 +2605,8 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	/* Did the write stamp get updated already? */
 	if (likely(info.ts >= cpu_buffer->write_stamp)) {
 		info.delta = diff;
-		if (unlikely(test_time_stamp(info.delta))) {
-			int local_clock_stable = 1;
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-			local_clock_stable = sched_clock_stable();
-#endif
-			WARN_ONCE(info.delta > (1ULL << 59),
-				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-				  (unsigned long long)info.delta,
-				  (unsigned long long)info.ts,
-				  (unsigned long long)cpu_buffer->write_stamp,
-				  local_clock_stable ? "" :
-				  "If you just came from a suspend/resume,\n"
-				  "please switch to the trace global clock:\n"
-				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
-			info.add_timestamp = 1;
-		}
+		if (unlikely(test_time_stamp(info.delta)))
+			rb_handle_timestamp(cpu_buffer, &info);
 	}
 
 	event = __rb_reserve_next(cpu_buffer, &info);
-- 
cgit v1.2.3


From a4543a2fa9ef31d6d0f854a4e14f8f82e7996d8d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 May 2015 09:40:18 -0400
Subject: ring-buffer: Get timestamp after event is allocated

Move the capturing of the timestamp to after an event is allocated.
If the event is not a commit (where it is an event that preempted
another event), then no timestamp is needed, because the delta of
nested events is always zero.

If the event starts on a new page, no delta needs to be calculated
as the full timestamp will be added to the page header, and the
event will have a delta of zero.

Now if the event requires a time extend (the delta does not fit
in the 27 bit delta slot in the header), then the event is discarded,
the length is extended to hold the TIME_EXTEND event that allows for
a 59 bit delta, and the commit is tried again.

If the event can't be discarded (another event came in after it),
then the TIME_EXTEND is added directly to the allocated event and
the rest of the event is given padding.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 161 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 114 insertions(+), 47 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a78d4ee4bc58..b5ed553e0a45 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2009,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
  * and with this, we can determine what to place into the
  * data field.
  */
-static void
+static void __always_inline
 rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 		struct ring_buffer_event *event,
 		struct rb_event_info *info)
@@ -2017,10 +2017,6 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned length = info->length;
 	u64 delta = info->delta;
 
-	/* Only a commit updates the timestamp */
-	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
-		delta = 0;
-
 	/*
 	 * If we need to add a timestamp, then we
 	 * add it to the start of the resevered space.
@@ -2286,6 +2282,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	local_sub(length, &tail_page->write);
 }
 
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
+
 /*
  * This is the slow path, force gcc not to inline it.
  */
@@ -2300,6 +2298,16 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	int ret;
 	u64 ts;
 
+	/*
+	 * If the event had a timestamp attached to it, remove it.
+	 * The first event on a page (nested or not) always uses
+	 * the full timestamp of the new page.
+	 */
+	if (info->add_timestamp) {
+		info->add_timestamp = 0;
+		info->length -= RB_LEN_TIME_EXTEND;
+	}
+
 	next_page = tail_page;
 
 	rb_inc_page(cpu_buffer, &next_page);
@@ -2386,6 +2394,11 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 	rb_reset_tail(cpu_buffer, tail, info);
 
+	/* Commit what we have for now to update timestamps */
+	rb_end_commit(cpu_buffer);
+	/* rb_end_commit() decs committing */
+	local_inc(&cpu_buffer->committing);
+
 	/* fail and let the caller try again */
 	return ERR_PTR(-EAGAIN);
 
@@ -2403,10 +2416,23 @@ static inline bool sched_clock_stable(void)
 }
 #endif
 
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event);
+static inline void rb_event_discard(struct ring_buffer_event *event);
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event);
+
 static noinline void
 rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+		    struct ring_buffer_event *event,
 		    struct rb_event_info *info)
 {
+	struct ring_buffer_event *padding;
+	int length;
+	int size;
+
 	WARN_ONCE(info->delta > (1ULL << 59),
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 		  (unsigned long long)info->delta,
@@ -2416,7 +2442,61 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
 		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
-	info->add_timestamp = 1;
+
+	/*
+	 * Discarding this event to add a timestamp in front, but
+	 * we still need to update the length of it to perform the discard.
+	 */
+	rb_update_event(cpu_buffer, event, info);
+
+	if (rb_try_to_discard(cpu_buffer, event)) {
+		info->add_timestamp = 1;
+		/*
+		 * The time delta since the last event is too big to
+		 * hold in the time field of the event, then we append a
+		 * TIME EXTEND event ahead of the data event.
+		 */
+		info->length += RB_LEN_TIME_EXTEND;
+		return;
+	}
+
+	/*
+	 * Humpf! An event came in after this one, and because it is not a
+	 * commit, it will have a delta of zero, thus, it will take on
+	 * the timestamp of the previous commit, which happened a long time
+	 * ago (we need to add a timestamp, remember?).
+	 * We need to add the timestamp here. A timestamp is a fixed size
+	 * of 8 bytes. That means the rest of the event needs to be
+	 * padding.
+	 */
+	size = info->length - RB_LEN_TIME_EXTEND;
+
+	/* The padding will have a delta of 1 */
+	if (size)
+		info->delta--;
+
+	padding = rb_add_time_stamp(event, info->delta);
+
+	if (size) {
+		length = info->length;
+		info->delta = 0;
+		info->length = size;
+		rb_update_event(cpu_buffer, padding, info);
+
+		rb_event_discard(padding);
+
+		/* Still visible, need to update write_stamp */
+		rb_update_write_stamp(cpu_buffer, event);
+
+		/* Still need to commit the padding. */
+		rb_end_commit(cpu_buffer);
+
+		/* rb_end_commit() decs committing */
+		local_inc(&cpu_buffer->committing);
+
+		/* The next iteration still uses the original length */
+		info->length = length;
+	}
 }
 
 static struct ring_buffer_event *
@@ -2426,14 +2506,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer_event *event;
 	struct buffer_page *tail_page;
 	unsigned long tail, write;
-
-	/*
-	 * If the time delta since the last event is too big to
-	 * hold in the time field of the event, then we append a
-	 * TIME EXTEND event ahead of the data event.
-	 */
-	if (unlikely(info->add_timestamp))
-		info->length += RB_LEN_TIME_EXTEND;
+	bool is_commit;
 
 	tail_page = info->tail_page = cpu_buffer->tail_page;
 	write = local_add_return(info->length, &tail_page->write);
@@ -2442,31 +2515,42 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	write &= RB_WRITE_MASK;
 	tail = write - info->length;
 
-	/*
-	 * If this is the first commit on the page, then it has the same
-	 * timestamp as the page itself.
-	 */
-	if (!tail)
-		info->delta = 0;
-
 	/* See if we shot pass the end of this buffer page */
 	if (unlikely(write > BUF_PAGE_SIZE))
 		return rb_move_tail(cpu_buffer, tail, info);
 
 	/* We reserved something on the buffer */
-
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
-	rb_update_event(cpu_buffer, event, info);
-
-	local_inc(&tail_page->entries);
 
 	/*
-	 * If this is the first commit on the page, then update
-	 * its timestamp.
+	 * If this is the first commit on the page, then it has the same
+	 * timestamp as the page itself, otherwise we need to figure out
+	 * the delta.
 	 */
-	if (!tail)
+	info->ts = rb_time_stamp(cpu_buffer->buffer);
+	is_commit = rb_event_is_commit(cpu_buffer, event);
+
+	/* Commits are special (non nested events) */
+	info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0;
+
+	if (!tail) {
+		/*
+		 * If this is the first commit on the page, set the
+		 * page to its timestamp.
+		 */
 		tail_page->page->time_stamp = info->ts;
+		info->delta = 0;
+
+	} else if (unlikely(test_time_stamp(info->delta)) &&
+		   !info->add_timestamp) {
+		rb_handle_timestamp(cpu_buffer, event, info);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	kmemcheck_annotate_bitfield(event, bitfield);
+	rb_update_event(cpu_buffer, event, info);
+
+	local_inc(&tail_page->entries);
 
 	/* account for these added bytes */
 	local_add(info->length, &cpu_buffer->entries_bytes);
@@ -2560,7 +2644,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct rb_event_info info;
 	int nr_loops = 0;
-	u64 diff;
 
 	rb_start_commit(cpu_buffer);
 
@@ -2578,12 +2661,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		return NULL;
 	}
 #endif
-
 	info.length = rb_calculate_event_length(length);
- again:
 	info.add_timestamp = 0;
-	info.delta = 0;
-
+ again:
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
 	 * If one does, it will cause this original code to loop
@@ -2596,19 +2676,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		goto out_fail;
 
-	info.ts = rb_time_stamp(cpu_buffer->buffer);
-	diff = info.ts - cpu_buffer->write_stamp;
-
-	/* make sure this diff is calculated here */
-	barrier();
-
-	/* Did the write stamp get updated already? */
-	if (likely(info.ts >= cpu_buffer->write_stamp)) {
-		info.delta = diff;
-		if (unlikely(test_time_stamp(info.delta)))
-			rb_handle_timestamp(cpu_buffer, &info);
-	}
-
 	event = __rb_reserve_next(cpu_buffer, &info);
 
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
-- 
cgit v1.2.3


From 7d75e6833b579adb3de2c7b917de1204eeafea47 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 May 2015 10:29:10 -0400
Subject: ring-buffer: Make sure event has enough room for extend and padding

Now that events only add time extends after it is committed, in case
an event comes in before it can discard the allocated event, the time
extend needs to be stored within the event. If the event is bigger
than then size needed for the time extend, padding must be added.
The minimum padding size is 8 bytes. Thus if the event is 12 bytes
(size of time extend + 4), there will not be enough room to add both
the time extend and padding. Make sure all events are either 8 bytes
or 16 or more bytes.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b5ed553e0a45..781ce359976c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2208,6 +2208,21 @@ static unsigned rb_calculate_event_length(unsigned length)
 	length += RB_EVNT_HDR_SIZE;
 	length = ALIGN(length, RB_ARCH_ALIGNMENT);
 
+	/*
+	 * In case the time delta is larger than the 27 bits for it
+	 * in the header, we need to add a timestamp. If another
+	 * event comes in when trying to discard this one to increase
+	 * the length, then the timestamp will be added in the allocated
+	 * space of this event. If length is bigger than the size needed
+	 * for the TIME_EXTEND, then padding has to be used. The events
+	 * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
+	 * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
+	 * As length is a multiple of 4, we only need to worry if it
+	 * is 12 (RB_LEN_TIME_EXTEND + 4).
+	 */
+	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
+		length += RB_ALIGNMENT;
+
 	return length;
 }
 
-- 
cgit v1.2.3


From d90fd77402d3de56a9ca3df04e5d868d0979dc59 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 May 2015 12:12:27 -0400
Subject: ring-buffer: Reorganize function locations

Functions in ring-buffer.c have gotten interleaved between different
use cases. Move the functions around to get like functions closer
together. This may or may not help gcc keep cache locality, but it
makes it a little easier to work with the code.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 920 ++++++++++++++++++++++-----------------------
 1 file changed, 456 insertions(+), 464 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 781ce359976c..1cce0fbf92ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1887,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
 	return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
 }
 
-static inline int
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
-		   struct ring_buffer_event *event)
-{
-	unsigned long addr = (unsigned long)event;
-	unsigned long index;
-
-	index = rb_event_index(event);
-	addr &= PAGE_MASK;
-
-	return cpu_buffer->commit_page->page == (void *)addr &&
-		rb_commit_index(cpu_buffer) == index;
-}
-
-static void
-rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	unsigned long max_count;
-
-	/*
-	 * We only race with interrupts and NMIs on this CPU.
-	 * If we own the commit event, then we can commit
-	 * all others that interrupted us, since the interruptions
-	 * are in stack format (they finish before they come
-	 * back to us). This allows us to do a simple loop to
-	 * assign the commit to the tail.
-	 */
- again:
-	max_count = cpu_buffer->nr_pages * 100;
-
-	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
-		if (RB_WARN_ON(cpu_buffer, !(--max_count)))
-			return;
-		if (RB_WARN_ON(cpu_buffer,
-			       rb_is_reader_page(cpu_buffer->tail_page)))
-			return;
-		local_set(&cpu_buffer->commit_page->page->commit,
-			  rb_page_write(cpu_buffer->commit_page));
-		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-		cpu_buffer->write_stamp =
-			cpu_buffer->commit_page->page->time_stamp;
-		/* add barrier to keep gcc from optimizing too much */
-		barrier();
-	}
-	while (rb_commit_index(cpu_buffer) !=
-	       rb_page_write(cpu_buffer->commit_page)) {
-
-		local_set(&cpu_buffer->commit_page->page->commit,
-			  rb_page_write(cpu_buffer->commit_page));
-		RB_WARN_ON(cpu_buffer,
-			   local_read(&cpu_buffer->commit_page->page->commit) &
-			   ~RB_WRITE_MASK);
-		barrier();
-	}
-
-	/* again, keep gcc from optimizing */
-	barrier();
-
-	/*
-	 * If an interrupt came in just after the first while loop
-	 * and pushed the tail page forward, we will be left with
-	 * a dangling commit that will never go forward.
-	 */
-	if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
-		goto again;
-}
-
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1979,63 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 	iter->head = 0;
 }
 
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
-{
-	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
-
-	/* Not the first event on the page? */
-	if (rb_event_index(event)) {
-		event->time_delta = delta & TS_MASK;
-		event->array[0] = delta >> TS_SHIFT;
-	} else {
-		/* nope, just zero it */
-		event->time_delta = 0;
-		event->array[0] = 0;
-	}
-
-	return skip_time_extend(event);
-}
-
-/**
- * rb_update_event - update event type and data
- * @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
- *
- * Update the type and data fields of the event. The length
- * is the actual size that is written to the ring buffer,
- * and with this, we can determine what to place into the
- * data field.
- */
-static void __always_inline
-rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
-		struct ring_buffer_event *event,
-		struct rb_event_info *info)
-{
-	unsigned length = info->length;
-	u64 delta = info->delta;
-
-	/*
-	 * If we need to add a timestamp, then we
-	 * add it to the start of the resevered space.
-	 */
-	if (unlikely(info->add_timestamp)) {
-		event = rb_add_time_stamp(event, delta);
-		length -= RB_LEN_TIME_EXTEND;
-		delta = 0;
-	}
-
-	event->time_delta = delta;
-	length -= RB_EVNT_HDR_SIZE;
-	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
-		event->type_len = 0;
-		event->array[0] = length;
-	} else
-		event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-}
-
 /*
  * rb_handle_head_page - writer hit the head page
  *
@@ -2194,38 +2070,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 	return 0;
 }
 
-static unsigned rb_calculate_event_length(unsigned length)
-{
-	struct ring_buffer_event event; /* Used only for sizeof array */
-
-	/* zero length can cause confusions */
-	if (!length)
-		length++;
-
-	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-		length += sizeof(event.array[0]);
-
-	length += RB_EVNT_HDR_SIZE;
-	length = ALIGN(length, RB_ARCH_ALIGNMENT);
-
-	/*
-	 * In case the time delta is larger than the 27 bits for it
-	 * in the header, we need to add a timestamp. If another
-	 * event comes in when trying to discard this one to increase
-	 * the length, then the timestamp will be added in the allocated
-	 * space of this event. If length is bigger than the size needed
-	 * for the TIME_EXTEND, then padding has to be used. The events
-	 * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
-	 * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
-	 * As length is a multiple of 4, we only need to worry if it
-	 * is 12 (RB_LEN_TIME_EXTEND + 4).
-	 */
-	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
-		length += RB_ALIGNMENT;
-
-	return length;
-}
-
 static inline void
 rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	      unsigned long tail, struct rb_event_info *info)
@@ -2424,71 +2268,471 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
 {
-	return true;
-}
-#endif
+	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
 
-static inline int
-rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
-		  struct ring_buffer_event *event);
-static inline void rb_event_discard(struct ring_buffer_event *event);
-static void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		      struct ring_buffer_event *event);
+	/* Not the first event on the page? */
+	if (rb_event_index(event)) {
+		event->time_delta = delta & TS_MASK;
+		event->array[0] = delta >> TS_SHIFT;
+	} else {
+		/* nope, just zero it */
+		event->time_delta = 0;
+		event->array[0] = 0;
+	}
 
-static noinline void
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-		    struct ring_buffer_event *event,
-		    struct rb_event_info *info)
-{
-	struct ring_buffer_event *padding;
-	int length;
-	int size;
+	return skip_time_extend(event);
+}
 
-	WARN_ONCE(info->delta > (1ULL << 59),
-		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-		  (unsigned long long)info->delta,
-		  (unsigned long long)info->ts,
-		  (unsigned long long)cpu_buffer->write_stamp,
-		  sched_clock_stable() ? "" :
-		  "If you just came from a suspend/resume,\n"
-		  "please switch to the trace global clock:\n"
-		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
+/**
+ * rb_update_event - update event type and data
+ * @event: the event to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static void __always_inline
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+		struct ring_buffer_event *event,
+		struct rb_event_info *info)
+{
+	unsigned length = info->length;
+	u64 delta = info->delta;
 
 	/*
-	 * Discarding this event to add a timestamp in front, but
-	 * we still need to update the length of it to perform the discard.
+	 * If we need to add a timestamp, then we
+	 * add it to the start of the resevered space.
 	 */
-	rb_update_event(cpu_buffer, event, info);
-
-	if (rb_try_to_discard(cpu_buffer, event)) {
-		info->add_timestamp = 1;
-		/*
-		 * The time delta since the last event is too big to
-		 * hold in the time field of the event, then we append a
-		 * TIME EXTEND event ahead of the data event.
-		 */
-		info->length += RB_LEN_TIME_EXTEND;
-		return;
+	if (unlikely(info->add_timestamp)) {
+		event = rb_add_time_stamp(event, delta);
+		length -= RB_LEN_TIME_EXTEND;
+		delta = 0;
 	}
 
-	/*
-	 * Humpf! An event came in after this one, and because it is not a
-	 * commit, it will have a delta of zero, thus, it will take on
-	 * the timestamp of the previous commit, which happened a long time
-	 * ago (we need to add a timestamp, remember?).
-	 * We need to add the timestamp here. A timestamp is a fixed size
-	 * of 8 bytes. That means the rest of the event needs to be
-	 * padding.
-	 */
-	size = info->length - RB_LEN_TIME_EXTEND;
-
-	/* The padding will have a delta of 1 */
-	if (size)
-		info->delta--;
+	event->time_delta = delta;
+	length -= RB_EVNT_HDR_SIZE;
+	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+		event->type_len = 0;
+		event->array[0] = length;
+	} else
+		event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
+
+static unsigned rb_calculate_event_length(unsigned length)
+{
+	struct ring_buffer_event event; /* Used only for sizeof array */
+
+	/* zero length can cause confusions */
+	if (!length)
+		length++;
+
+	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+		length += sizeof(event.array[0]);
+
+	length += RB_EVNT_HDR_SIZE;
+	length = ALIGN(length, RB_ARCH_ALIGNMENT);
+
+	/*
+	 * In case the time delta is larger than the 27 bits for it
+	 * in the header, we need to add a timestamp. If another
+	 * event comes in when trying to discard this one to increase
+	 * the length, then the timestamp will be added in the allocated
+	 * space of this event. If length is bigger than the size needed
+	 * for the TIME_EXTEND, then padding has to be used. The events
+	 * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
+	 * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
+	 * As length is a multiple of 4, we only need to worry if it
+	 * is 12 (RB_LEN_TIME_EXTEND + 4).
+	 */
+	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
+		length += RB_ALIGNMENT;
+
+	return length;
+}
+
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+	return true;
+}
+#endif
+
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event)
+{
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_ts_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+		unsigned long write_mask =
+			local_read(&bpage->write) & ~RB_WRITE_MASK;
+		unsigned long event_length = rb_event_length(event);
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		old_index += write_mask;
+		new_index += write_mask;
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index) {
+			/* update counters */
+			local_sub(event_length, &cpu_buffer->entries_bytes);
+			return 1;
+		}
+	}
+
+	/* could not discard */
+	return 0;
+}
+
+static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	local_inc(&cpu_buffer->committing);
+	local_inc(&cpu_buffer->commits);
+}
+
+static void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long max_count;
+
+	/*
+	 * We only race with interrupts and NMIs on this CPU.
+	 * If we own the commit event, then we can commit
+	 * all others that interrupted us, since the interruptions
+	 * are in stack format (they finish before they come
+	 * back to us). This allows us to do a simple loop to
+	 * assign the commit to the tail.
+	 */
+ again:
+	max_count = cpu_buffer->nr_pages * 100;
+
+	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+		if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+			return;
+		if (RB_WARN_ON(cpu_buffer,
+			       rb_is_reader_page(cpu_buffer->tail_page)))
+			return;
+		local_set(&cpu_buffer->commit_page->page->commit,
+			  rb_page_write(cpu_buffer->commit_page));
+		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+		cpu_buffer->write_stamp =
+			cpu_buffer->commit_page->page->time_stamp;
+		/* add barrier to keep gcc from optimizing too much */
+		barrier();
+	}
+	while (rb_commit_index(cpu_buffer) !=
+	       rb_page_write(cpu_buffer->commit_page)) {
+
+		local_set(&cpu_buffer->commit_page->page->commit,
+			  rb_page_write(cpu_buffer->commit_page));
+		RB_WARN_ON(cpu_buffer,
+			   local_read(&cpu_buffer->commit_page->page->commit) &
+			   ~RB_WRITE_MASK);
+		barrier();
+	}
+
+	/* again, keep gcc from optimizing */
+	barrier();
+
+	/*
+	 * If an interrupt came in just after the first while loop
+	 * and pushed the tail page forward, we will be left with
+	 * a dangling commit that will never go forward.
+	 */
+	if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+		goto again;
+}
+
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long commits;
+
+	if (RB_WARN_ON(cpu_buffer,
+		       !local_read(&cpu_buffer->committing)))
+		return;
+
+ again:
+	commits = local_read(&cpu_buffer->commits);
+	/* synchronize with interrupts */
+	barrier();
+	if (local_read(&cpu_buffer->committing) == 1)
+		rb_set_commit_to_write(cpu_buffer);
+
+	local_dec(&cpu_buffer->committing);
+
+	/* synchronize with interrupts */
+	barrier();
+
+	/*
+	 * Need to account for interrupts coming in between the
+	 * updating of the commit page and the clearing of the
+	 * committing counter.
+	 */
+	if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
+	    !local_read(&cpu_buffer->committing)) {
+		local_inc(&cpu_buffer->committing);
+		goto again;
+	}
+}
+
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
+
+	/* array[0] holds the actual length for the discarded event */
+	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
+static inline int
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct ring_buffer_event *event)
+{
+	unsigned long addr = (unsigned long)event;
+	unsigned long index;
+
+	index = rb_event_index(event);
+	addr &= PAGE_MASK;
+
+	return cpu_buffer->commit_page->page == (void *)addr &&
+		rb_commit_index(cpu_buffer) == index;
+}
+
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event)
+{
+	u64 delta;
+
+	/*
+	 * The event first in the commit queue updates the
+	 * time stamp.
+	 */
+	if (rb_event_is_commit(cpu_buffer, event)) {
+		/*
+		 * A commit event that is first on a page
+		 * updates the write timestamp with the page stamp
+		 */
+		if (!rb_event_index(event))
+			cpu_buffer->write_stamp =
+				cpu_buffer->commit_page->page->time_stamp;
+		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+			delta = event->array[0];
+			delta <<= TS_SHIFT;
+			delta += event->time_delta;
+			cpu_buffer->write_stamp += delta;
+		} else
+			cpu_buffer->write_stamp += event->time_delta;
+	}
+}
+
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event)
+{
+	local_inc(&cpu_buffer->entries);
+	rb_update_write_stamp(cpu_buffer, event);
+	rb_end_commit(cpu_buffer);
+}
+
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+	bool pagebusy;
+
+	if (buffer->irq_work.waiters_pending) {
+		buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&buffer->irq_work.work);
+	}
+
+	if (cpu_buffer->irq_work.waiters_pending) {
+		cpu_buffer->irq_work.waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&cpu_buffer->irq_work.work);
+	}
+
+	pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+
+	if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+		cpu_buffer->irq_work.wakeup_full = true;
+		cpu_buffer->irq_work.full_waiters_pending = false;
+		/* irq_work_queue() supplies it's own memory barriers */
+		irq_work_queue(&cpu_buffer->irq_work.work);
+	}
+}
+
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ *  bit 0 =  NMI context
+ *  bit 1 =  IRQ context
+ *  bit 2 =  SoftIRQ context
+ *  bit 3 =  normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ *  101 - 1 = 100
+ *  101 & 100 = 100 (clearing bit zero)
+ *
+ *  1010 - 1 = 1001
+ *  1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+
+static __always_inline int
+trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned int val = cpu_buffer->current_context;
+	int bit;
+
+	if (in_interrupt()) {
+		if (in_nmi())
+			bit = RB_CTX_NMI;
+		else if (in_irq())
+			bit = RB_CTX_IRQ;
+		else
+			bit = RB_CTX_SOFTIRQ;
+	} else
+		bit = RB_CTX_NORMAL;
+
+	if (unlikely(val & (1 << bit)))
+		return 1;
+
+	val |= (1 << bit);
+	cpu_buffer->current_context = val;
+
+	return 0;
+}
+
+static __always_inline void
+trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+			      struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu = raw_smp_processor_id();
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	rb_commit(cpu_buffer, event);
+
+	rb_wakeups(buffer, cpu_buffer);
+
+	trace_recursive_unlock(cpu_buffer);
+
+	preempt_enable_notrace();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+
+static noinline void
+rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+		    struct ring_buffer_event *event,
+		    struct rb_event_info *info)
+{
+	struct ring_buffer_event *padding;
+	int length;
+	int size;
+
+	WARN_ONCE(info->delta > (1ULL << 59),
+		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+		  (unsigned long long)info->delta,
+		  (unsigned long long)info->ts,
+		  (unsigned long long)cpu_buffer->write_stamp,
+		  sched_clock_stable() ? "" :
+		  "If you just came from a suspend/resume,\n"
+		  "please switch to the trace global clock:\n"
+		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
+
+	/*
+	 * Discarding this event to add a timestamp in front, but
+	 * we still need to update the length of it to perform the discard.
+	 */
+	rb_update_event(cpu_buffer, event, info);
+
+	if (rb_try_to_discard(cpu_buffer, event)) {
+		info->add_timestamp = 1;
+		/*
+		 * The time delta since the last event is too big to
+		 * hold in the time field of the event, then we append a
+		 * TIME EXTEND event ahead of the data event.
+		 */
+		info->length += RB_LEN_TIME_EXTEND;
+		return;
+	}
+
+	/*
+	 * Humpf! An event came in after this one, and because it is not a
+	 * commit, it will have a delta of zero, thus, it will take on
+	 * the timestamp of the previous commit, which happened a long time
+	 * ago (we need to add a timestamp, remember?).
+	 * We need to add the timestamp here. A timestamp is a fixed size
+	 * of 8 bytes. That means the rest of the event needs to be
+	 * padding.
+	 */
+	size = info->length - RB_LEN_TIME_EXTEND;
+
+	/* The padding will have a delta of 1 */
+	if (size)
+		info->delta--;
 
 	padding = rb_add_time_stamp(event, info->delta);
 
@@ -2573,84 +2817,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
-static inline int
-rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
-		  struct ring_buffer_event *event)
-{
-	unsigned long new_index, old_index;
-	struct buffer_page *bpage;
-	unsigned long index;
-	unsigned long addr;
-
-	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_ts_length(event);
-	addr = (unsigned long)event;
-	addr &= PAGE_MASK;
-
-	bpage = cpu_buffer->tail_page;
-
-	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
-		unsigned long write_mask =
-			local_read(&bpage->write) & ~RB_WRITE_MASK;
-		unsigned long event_length = rb_event_length(event);
-		/*
-		 * This is on the tail page. It is possible that
-		 * a write could come in and move the tail page
-		 * and write to the next page. That is fine
-		 * because we just shorten what is on this page.
-		 */
-		old_index += write_mask;
-		new_index += write_mask;
-		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index) {
-			/* update counters */
-			local_sub(event_length, &cpu_buffer->entries_bytes);
-			return 1;
-		}
-	}
-
-	/* could not discard */
-	return 0;
-}
-
-static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	local_inc(&cpu_buffer->committing);
-	local_inc(&cpu_buffer->commits);
-}
-
-static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	unsigned long commits;
-
-	if (RB_WARN_ON(cpu_buffer,
-		       !local_read(&cpu_buffer->committing)))
-		return;
-
- again:
-	commits = local_read(&cpu_buffer->commits);
-	/* synchronize with interrupts */
-	barrier();
-	if (local_read(&cpu_buffer->committing) == 1)
-		rb_set_commit_to_write(cpu_buffer);
-
-	local_dec(&cpu_buffer->committing);
-
-	/* synchronize with interrupts */
-	barrier();
-
-	/*
-	 * Need to account for interrupts coming in between the
-	 * updating of the commit page and the clearing of the
-	 * committing counter.
-	 */
-	if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
-	    !local_read(&cpu_buffer->committing)) {
-		local_inc(&cpu_buffer->committing);
-		goto again;
-	}
-}
-
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer *buffer,
 		      struct ring_buffer_per_cpu *cpu_buffer,
@@ -2706,75 +2872,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	return NULL;
 }
 
-/*
- * The lock and unlock are done within a preempt disable section.
- * The current_context per_cpu variable can only be modified
- * by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
- *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
- *
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- *  101 - 1 = 100
- *  101 & 100 = 100 (clearing bit zero)
- *
- *  1010 - 1 = 1001
- *  1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
- */
-
-static __always_inline int
-trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	unsigned int val = cpu_buffer->current_context;
-	int bit;
-
-	if (in_interrupt()) {
-		if (in_nmi())
-			bit = RB_CTX_NMI;
-		else if (in_irq())
-			bit = RB_CTX_IRQ;
-		else
-			bit = RB_CTX_SOFTIRQ;
-	} else
-		bit = RB_CTX_NORMAL;
-
-	if (unlikely(val & (1 << bit)))
-		return 1;
-
-	val |= (1 << bit);
-	cpu_buffer->current_context = val;
-
-	return 0;
-}
-
-static __always_inline void
-trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
-}
-
 /**
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
@@ -2833,111 +2930,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 
-static void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		      struct ring_buffer_event *event)
-{
-	u64 delta;
-
-	/*
-	 * The event first in the commit queue updates the
-	 * time stamp.
-	 */
-	if (rb_event_is_commit(cpu_buffer, event)) {
-		/*
-		 * A commit event that is first on a page
-		 * updates the write timestamp with the page stamp
-		 */
-		if (!rb_event_index(event))
-			cpu_buffer->write_stamp =
-				cpu_buffer->commit_page->page->time_stamp;
-		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-			delta = event->array[0];
-			delta <<= TS_SHIFT;
-			delta += event->time_delta;
-			cpu_buffer->write_stamp += delta;
-		} else
-			cpu_buffer->write_stamp += event->time_delta;
-	}
-}
-
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
-		      struct ring_buffer_event *event)
-{
-	local_inc(&cpu_buffer->entries);
-	rb_update_write_stamp(cpu_buffer, event);
-	rb_end_commit(cpu_buffer);
-}
-
-static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
-{
-	bool pagebusy;
-
-	if (buffer->irq_work.waiters_pending) {
-		buffer->irq_work.waiters_pending = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&buffer->irq_work.work);
-	}
-
-	if (cpu_buffer->irq_work.waiters_pending) {
-		cpu_buffer->irq_work.waiters_pending = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&cpu_buffer->irq_work.work);
-	}
-
-	pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
-
-	if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
-		cpu_buffer->irq_work.wakeup_full = true;
-		cpu_buffer->irq_work.full_waiters_pending = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&cpu_buffer->irq_work.work);
-	}
-}
-
-/**
- * ring_buffer_unlock_commit - commit a reserved
- * @buffer: The buffer to commit to
- * @event: The event pointer to commit.
- *
- * This commits the data to the ring buffer, and releases any locks held.
- *
- * Must be paired with ring_buffer_lock_reserve.
- */
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
-			      struct ring_buffer_event *event)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	int cpu = raw_smp_processor_id();
-
-	cpu_buffer = buffer->buffers[cpu];
-
-	rb_commit(cpu_buffer, event);
-
-	rb_wakeups(buffer, cpu_buffer);
-
-	trace_recursive_unlock(cpu_buffer);
-
-	preempt_enable_notrace();
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-
-static inline void rb_event_discard(struct ring_buffer_event *event)
-{
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
-		event = skip_time_extend(event);
-
-	/* array[0] holds the actual length for the discarded event */
-	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
-	event->type_len = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
-}
-
 /*
  * Decrement the entries to the page that an event is on.
  * The event does not even need to exist, only the pointer
-- 
cgit v1.2.3


From b7dc42fd79390c074e2bff3b172b585d5c2d80c2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 3 Sep 2015 08:57:12 -0400
Subject: ring-buffer: Revert "ring-buffer: Get timestamp after event is
 allocated"

The commit a4543a2fa9ef31 "ring-buffer: Get timestamp after event is
allocated" is needed for some future work. But after adding it, there is a
race somewhere that causes the saved timestamp to have a slight shift, and
get ahead of the actual timestamp and make it look like time goes backwards.

I'm still looking into why this happens, but in the mean time, this is
holding up other work to get in. I'm reverting the change for now (which
makes the problem go away), and will add it back after I know what is wrong
and fix it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 156 +++++++++++++++------------------------------
 1 file changed, 50 insertions(+), 106 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1cce0fbf92ce..fc347f8b1bca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2141,8 +2141,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	local_sub(length, &tail_page->write);
 }
 
-static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
-
 /*
  * This is the slow path, force gcc not to inline it.
  */
@@ -2157,16 +2155,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	int ret;
 	u64 ts;
 
-	/*
-	 * If the event had a timestamp attached to it, remove it.
-	 * The first event on a page (nested or not) always uses
-	 * the full timestamp of the new page.
-	 */
-	if (info->add_timestamp) {
-		info->add_timestamp = 0;
-		info->length -= RB_LEN_TIME_EXTEND;
-	}
-
 	next_page = tail_page;
 
 	rb_inc_page(cpu_buffer, &next_page);
@@ -2253,11 +2241,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 	rb_reset_tail(cpu_buffer, tail, info);
 
-	/* Commit what we have for now to update timestamps */
-	rb_end_commit(cpu_buffer);
-	/* rb_end_commit() decs committing */
-	local_inc(&cpu_buffer->committing);
-
 	/* fail and let the caller try again */
 	return ERR_PTR(-EAGAIN);
 
@@ -2287,6 +2270,9 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
 	return skip_time_extend(event);
 }
 
+static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+				     struct ring_buffer_event *event);
+
 /**
  * rb_update_event - update event type and data
  * @event: the event to update
@@ -2298,7 +2284,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
  * and with this, we can determine what to place into the
  * data field.
  */
-static void __always_inline
+static void
 rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 		struct ring_buffer_event *event,
 		struct rb_event_info *info)
@@ -2306,6 +2292,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned length = info->length;
 	u64 delta = info->delta;
 
+	/* Only a commit updates the timestamp */
+	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+		delta = 0;
+
 	/*
 	 * If we need to add a timestamp, then we
 	 * add it to the start of the resevered space.
@@ -2685,13 +2675,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static noinline void
 rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-		    struct ring_buffer_event *event,
 		    struct rb_event_info *info)
 {
-	struct ring_buffer_event *padding;
-	int length;
-	int size;
-
 	WARN_ONCE(info->delta > (1ULL << 59),
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 		  (unsigned long long)info->delta,
@@ -2701,61 +2686,7 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
 		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
-
-	/*
-	 * Discarding this event to add a timestamp in front, but
-	 * we still need to update the length of it to perform the discard.
-	 */
-	rb_update_event(cpu_buffer, event, info);
-
-	if (rb_try_to_discard(cpu_buffer, event)) {
-		info->add_timestamp = 1;
-		/*
-		 * The time delta since the last event is too big to
-		 * hold in the time field of the event, then we append a
-		 * TIME EXTEND event ahead of the data event.
-		 */
-		info->length += RB_LEN_TIME_EXTEND;
-		return;
-	}
-
-	/*
-	 * Humpf! An event came in after this one, and because it is not a
-	 * commit, it will have a delta of zero, thus, it will take on
-	 * the timestamp of the previous commit, which happened a long time
-	 * ago (we need to add a timestamp, remember?).
-	 * We need to add the timestamp here. A timestamp is a fixed size
-	 * of 8 bytes. That means the rest of the event needs to be
-	 * padding.
-	 */
-	size = info->length - RB_LEN_TIME_EXTEND;
-
-	/* The padding will have a delta of 1 */
-	if (size)
-		info->delta--;
-
-	padding = rb_add_time_stamp(event, info->delta);
-
-	if (size) {
-		length = info->length;
-		info->delta = 0;
-		info->length = size;
-		rb_update_event(cpu_buffer, padding, info);
-
-		rb_event_discard(padding);
-
-		/* Still visible, need to update write_stamp */
-		rb_update_write_stamp(cpu_buffer, event);
-
-		/* Still need to commit the padding. */
-		rb_end_commit(cpu_buffer);
-
-		/* rb_end_commit() decs committing */
-		local_inc(&cpu_buffer->committing);
-
-		/* The next iteration still uses the original length */
-		info->length = length;
-	}
+	info->add_timestamp = 1;
 }
 
 static struct ring_buffer_event *
@@ -2765,7 +2696,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer_event *event;
 	struct buffer_page *tail_page;
 	unsigned long tail, write;
-	bool is_commit;
+
+	/*
+	 * If the time delta since the last event is too big to
+	 * hold in the time field of the event, then we append a
+	 * TIME EXTEND event ahead of the data event.
+	 */
+	if (unlikely(info->add_timestamp))
+		info->length += RB_LEN_TIME_EXTEND;
 
 	tail_page = info->tail_page = cpu_buffer->tail_page;
 	write = local_add_return(info->length, &tail_page->write);
@@ -2774,43 +2712,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	write &= RB_WRITE_MASK;
 	tail = write - info->length;
 
-	/* See if we shot pass the end of this buffer page */
-	if (unlikely(write > BUF_PAGE_SIZE))
-		return rb_move_tail(cpu_buffer, tail, info);
-
-	/* We reserved something on the buffer */
-	event = __rb_page_index(tail_page, tail);
-
 	/*
 	 * If this is the first commit on the page, then it has the same
-	 * timestamp as the page itself, otherwise we need to figure out
-	 * the delta.
+	 * timestamp as the page itself.
 	 */
-	info->ts = rb_time_stamp(cpu_buffer->buffer);
-	is_commit = rb_event_is_commit(cpu_buffer, event);
-
-	/* Commits are special (non nested events) */
-	info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0;
-
-	if (!tail) {
-		/*
-		 * If this is the first commit on the page, set the
-		 * page to its timestamp.
-		 */
-		tail_page->page->time_stamp = info->ts;
+	if (!tail)
 		info->delta = 0;
 
-	} else if (unlikely(test_time_stamp(info->delta)) &&
-		   !info->add_timestamp) {
-		rb_handle_timestamp(cpu_buffer, event, info);
-		return ERR_PTR(-EAGAIN);
-	}
+	/* See if we shot pass the end of this buffer page */
+	if (unlikely(write > BUF_PAGE_SIZE))
+		return rb_move_tail(cpu_buffer, tail, info);
 
+	/* We reserved something on the buffer */
+
+	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
 	rb_update_event(cpu_buffer, event, info);
 
 	local_inc(&tail_page->entries);
 
+	/*
+	 * If this is the first commit on the page, then update
+	 * its timestamp.
+	 */
+	if (!tail)
+		tail_page->page->time_stamp = info->ts;
+
 	/* account for these added bytes */
 	local_add(info->length, &cpu_buffer->entries_bytes);
 
@@ -2825,6 +2752,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct rb_event_info info;
 	int nr_loops = 0;
+	u64 diff;
 
 	rb_start_commit(cpu_buffer);
 
@@ -2842,9 +2770,12 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		return NULL;
 	}
 #endif
+
 	info.length = rb_calculate_event_length(length);
-	info.add_timestamp = 0;
  again:
+	info.add_timestamp = 0;
+	info.delta = 0;
+
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
 	 * If one does, it will cause this original code to loop
@@ -2857,6 +2788,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		goto out_fail;
 
+	info.ts = rb_time_stamp(cpu_buffer->buffer);
+	diff = info.ts - cpu_buffer->write_stamp;
+
+	/* make sure this diff is calculated here */
+	barrier();
+
+	/* Did the write stamp get updated already? */
+	if (likely(info.ts >= cpu_buffer->write_stamp)) {
+		info.delta = diff;
+		if (unlikely(test_time_stamp(info.delta)))
+			rb_handle_timestamp(cpu_buffer, &info);
+	}
+
 	event = __rb_reserve_next(cpu_buffer, &info);
 
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
-- 
cgit v1.2.3