Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r--   kernel/trace/ring_buffer.c | 780
1 file changed, 509 insertions, 271 deletions
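The biggest structural change in this patch is the move from the old separate type/len pair in struct ring_buffer_event to a single compressed type_len field, documented by the new ring_buffer_print_entry_header(): 5 bits of type_len and 27 bits of time_delta packed into one word, followed by 32-bit array words. The struct definition itself lives in include/linux/ring_buffer.h rather than in this file, so the following is only a minimal userspace sketch of that encoding; the numeric type values (28..31) and the exact field layout are assumptions taken from the kernel headers of this generation, not something shown in the diff below.

/*
 * Illustrative sketch only, not the kernel definition. It mirrors the
 * compressed header printed by the new ring_buffer_print_entry_header():
 * 5 bits of type_len, 27 bits of time_delta, then 32-bit array words.
 * The numeric type values are assumed from include/linux/ring_buffer.h.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define RINGBUF_TYPE_DATA_TYPE_LEN_MAX	28	/* type_len 1..28: small data, length in 4-byte units */
#define RINGBUF_TYPE_PADDING		29
#define RINGBUF_TYPE_TIME_EXTEND	30
#define RINGBUF_TYPE_TIME_STAMP		31
#define RB_ALIGNMENT			4U

struct sketch_event {
	uint32_t type_len:5, time_delta:27;	/* one 32-bit header word */
	uint32_t array[1];			/* payload, or length for big events */
};

/*
 * Rough analogue of the length logic in rb_event_data_length() (which also
 * adds the header size): small events keep their payload length in type_len,
 * in RB_ALIGNMENT units; events with type_len == 0 store the length in
 * array[0] and their data starts at array[1].
 */
static unsigned int payload_length(const struct sketch_event *e)
{
	if (e->type_len)
		return e->type_len * RB_ALIGNMENT;
	return e->array[0];
}

int main(void)
{
	struct sketch_event e = { .type_len = 3, .time_delta = 100 };

	printf("header: %zu bytes, payload: %u bytes\n",
	       offsetof(struct sketch_event, array), payload_length(&e));
	return 0;
}

With this encoding, type_len values 1..RINGBUF_TYPE_DATA_TYPE_LEN_MAX double as the payload length in 4-byte units, which is why RB_MAX_SMALL_DATA is redefined in the diff below as RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX (112 bytes with the values assumed above) instead of the old hard-coded 28.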
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c844..dc4dc70171ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -10,6 +10,7 @@  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/hardirq.h> +#include <linux/kmemcheck.h>  #include <linux/module.h>  #include <linux/percpu.h>  #include <linux/mutex.h> @@ -22,6 +23,28 @@  #include "trace.h"  /* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ +	int ret; + +	ret = trace_seq_printf(s, "# compressed entry header\n"); +	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n"); +	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n"); +	ret = trace_seq_printf(s, "\tarray       :   32 bits\n"); +	ret = trace_seq_printf(s, "\n"); +	ret = trace_seq_printf(s, "\tpadding     : type == %d\n", +			       RINGBUF_TYPE_PADDING); +	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", +			       RINGBUF_TYPE_TIME_EXTEND); +	ret = trace_seq_printf(s, "\tdata max type_len  == %d\n", +			       RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + +	return ret; +} + +/*   * The ring buffer is made up of a list of pages. A separate list of pages is   * allocated for each CPU. A writer may only write to a buffer that is   * associated with the CPU it is currently executing on.  A reader may read @@ -182,7 +205,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);  #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))  #define RB_ALIGNMENT		4U -#define RB_MAX_SMALL_DATA	28 +#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX  enum {  	RB_LEN_TIME_EXTEND = 8, @@ -191,48 +217,28 @@ enum {  static inline int rb_null_event(struct ring_buffer_event *event)  { -	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +	return event->type_len == RINGBUF_TYPE_PADDING +			&& event->time_delta == 0;  }  static inline int rb_discarded_event(struct ring_buffer_event *event)  { -	return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +	return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;  }  static void rb_event_set_padding(struct ring_buffer_event *event)  { -	event->type = RINGBUF_TYPE_PADDING; +	event->type_len = RINGBUF_TYPE_PADDING;  	event->time_delta = 0;  } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. 
- */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ -	event->type = RINGBUF_TYPE_PADDING; -	/* time delta must be non zero */ -	if (!event->time_delta) -		event->time_delta = 1; -} -  static unsigned  rb_event_data_length(struct ring_buffer_event *event)  {  	unsigned length; -	if (event->len) -		length = event->len * RB_ALIGNMENT; +	if (event->type_len) +		length = event->type_len * RB_ALIGNMENT;  	else  		length = event->array[0];  	return length + RB_EVNT_HDR_SIZE; @@ -242,12 +248,12 @@ rb_event_data_length(struct ring_buffer_event *event)  static unsigned  rb_event_length(struct ring_buffer_event *event)  { -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		if (rb_null_event(event))  			/* undefined */  			return -1; -		return rb_event_data_length(event); +		return  event->array[0] + RB_EVNT_HDR_SIZE;  	case RINGBUF_TYPE_TIME_EXTEND:  		return RB_LEN_TIME_EXTEND; @@ -271,7 +277,7 @@ rb_event_length(struct ring_buffer_event *event)  unsigned ring_buffer_event_length(struct ring_buffer_event *event)  {  	unsigned length = rb_event_length(event); -	if (event->type != RINGBUF_TYPE_DATA) +	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)  		return length;  	length -= RB_EVNT_HDR_SIZE;  	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -284,9 +290,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);  static void *  rb_event_data(struct ring_buffer_event *event)  { -	BUG_ON(event->type != RINGBUF_TYPE_DATA); +	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);  	/* If length is in len field, then array[0] has the data */ -	if (event->len) +	if (event->type_len)  		return (void *)&event->array[0];  	/* Otherwise length is in array[0] and array[1] has the data */  	return (void *)&event->array[1]; @@ -316,9 +322,10 @@ struct buffer_data_page {  };  struct buffer_page { +	struct list_head list;		/* list of buffer pages */  	local_t		 write;		/* index for next write */  	unsigned	 read;		/* index for next read */ -	struct list_head list;		/* list of free pages */ +	local_t		 entries;	/* entries on this page */  	struct buffer_data_page *page;	/* Actual data page */  }; @@ -361,6 +368,34 @@ static inline int test_time_stamp(u64 delta)  #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ +	struct buffer_data_page field; +	int ret; + +	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" +			       "offset:0;\tsize:%u;\n", +			       (unsigned int)sizeof(field.time_stamp)); + +	ret = trace_seq_printf(s, "\tfield: local_t commit;\t" +			       "offset:%u;\tsize:%u;\n", +			       (unsigned int)offsetof(typeof(field), commit), +			       (unsigned int)sizeof(field.commit)); + +	ret = trace_seq_printf(s, "\tfield: char data;\t" +			       "offset:%u;\tsize:%u;\n", +			       (unsigned int)offsetof(typeof(field), data), +			       (unsigned int)BUF_PAGE_SIZE); + +	return ret; +} +  /*   * head_page == tail_page && head == tail then buffer is empty.   
*/ @@ -375,8 +410,11 @@ struct ring_buffer_per_cpu {  	struct buffer_page		*tail_page;	/* write to tail */  	struct buffer_page		*commit_page;	/* committed pages */  	struct buffer_page		*reader_page; +	unsigned long			nmi_dropped; +	unsigned long			commit_overrun;  	unsigned long			overrun; -	unsigned long			entries; +	unsigned long			read; +	local_t				entries;  	u64				write_stamp;  	u64				read_stamp;  	atomic_t			record_disabled; @@ -389,6 +427,8 @@ struct ring_buffer {  	atomic_t			record_disabled;  	cpumask_var_t			cpumask; +	struct lock_class_key		*reader_lock_key; +  	struct mutex			mutex;  	struct ring_buffer_per_cpu	**buffers; @@ -420,13 +460,18 @@ struct ring_buffer_iter {  /* Up this if you want to test the TIME_EXTENTS and normalization */  #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ +	/* shift to debug/test normalization and TIME_EXTENTS */ +	return buffer->clock() << DEBUG_SHIFT; +} +  u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)  {  	u64 time;  	preempt_disable_notrace(); -	/* shift to debug/test normalization and TIME_EXTENTS */ -	time = buffer->clock() << DEBUG_SHIFT; +	time = rb_time_stamp(buffer, cpu);  	preempt_enable_no_resched_notrace();  	return time; @@ -523,6 +568,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)  	cpu_buffer->cpu = cpu;  	cpu_buffer->buffer = buffer;  	spin_lock_init(&cpu_buffer->reader_lock); +	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);  	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;  	INIT_LIST_HEAD(&cpu_buffer->pages); @@ -593,7 +639,8 @@ static int rb_cpu_notify(struct notifier_block *self,   * when the buffer wraps. If this flag is not set, the buffer will   * drop data when the tail hits the head.   */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, +					struct lock_class_key *key)  {  	struct ring_buffer *buffer;  	int bsize; @@ -616,6 +663,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)  	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);  	buffer->flags = flags;  	buffer->clock = trace_clock_local; +	buffer->reader_lock_key = key;  	/* need at least two pages */  	if (buffer->pages == 1) @@ -673,7 +721,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)  	kfree(buffer);  	return NULL;  } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc);  /**   * ring_buffer_free - free a ring buffer. @@ -947,31 +995,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)  	return rb_page_commit(cpu_buffer->head_page);  } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. 
- */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ -	struct ring_buffer_event *event; -	unsigned long head; - -	for (head = 0; head < rb_head_size(cpu_buffer); -	     head += rb_event_length(event)) { - -		event = __rb_page_index(cpu_buffer->head_page, head); -		if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) -			return; -		/* Only count data entries */ -		if (event->type != RINGBUF_TYPE_DATA) -			continue; -		cpu_buffer->overrun++; -		cpu_buffer->entries--; -	} -} -  static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,  			       struct buffer_page **bpage)  { @@ -991,7 +1014,7 @@ rb_event_index(struct ring_buffer_event *event)  	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);  } -static int +static inline int  rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,  	     struct ring_buffer_event *event)  { @@ -1110,28 +1133,21 @@ static void  rb_update_event(struct ring_buffer_event *event,  			 unsigned type, unsigned length)  { -	event->type = type; +	event->type_len = type;  	switch (type) {  	case RINGBUF_TYPE_PADDING: -		break; -  	case RINGBUF_TYPE_TIME_EXTEND: -		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); -		break; -  	case RINGBUF_TYPE_TIME_STAMP: -		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);  		break; -	case RINGBUF_TYPE_DATA: +	case 0:  		length -= RB_EVNT_HDR_SIZE; -		if (length > RB_MAX_SMALL_DATA) { -			event->len = 0; +		if (length > RB_MAX_SMALL_DATA)  			event->array[0] = length; -		} else -			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); +		else +			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);  		break;  	default:  		BUG(); @@ -1155,131 +1171,157 @@ static unsigned rb_calculate_event_length(unsigned length)  	return length;  } +  static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, -		  unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, +	     unsigned long length, unsigned long tail, +	     struct buffer_page *commit_page, +	     struct buffer_page *tail_page, u64 *ts)  { -	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; -	unsigned long tail, write; +	struct buffer_page *next_page, *head_page, *reader_page;  	struct ring_buffer *buffer = cpu_buffer->buffer;  	struct ring_buffer_event *event; -	unsigned long flags;  	bool lock_taken = false; +	unsigned long flags; -	commit_page = cpu_buffer->commit_page; -	/* we just need to protect against interrupts */ -	barrier(); -	tail_page = cpu_buffer->tail_page; -	write = local_add_return(length, &tail_page->write); -	tail = write - length; +	next_page = tail_page; -	/* See if we shot pass the end of this buffer page */ -	if (write > BUF_PAGE_SIZE) { -		struct buffer_page *next_page = tail_page; +	local_irq_save(flags); +	/* +	 * Since the write to the buffer is still not +	 * fully lockless, we must be careful with NMIs. +	 * The locks in the writers are taken when a write +	 * crosses to a new page. The locks protect against +	 * races with the readers (this will soon be fixed +	 * with a lockless solution). +	 * +	 * Because we can not protect against NMIs, and we +	 * want to keep traces reentrant, we need to manage +	 * what happens when we are in an NMI. +	 * +	 * NMIs can happen after we take the lock. +	 * If we are in an NMI, only take the lock +	 * if it is not already taken. Otherwise +	 * simply fail. 
+	 */ +	if (unlikely(in_nmi())) { +		if (!__raw_spin_trylock(&cpu_buffer->lock)) { +			cpu_buffer->nmi_dropped++; +			goto out_reset; +		} +	} else +		__raw_spin_lock(&cpu_buffer->lock); -		local_irq_save(flags); -		/* -		 * Since the write to the buffer is still not -		 * fully lockless, we must be careful with NMIs. -		 * The locks in the writers are taken when a write -		 * crosses to a new page. The locks protect against -		 * races with the readers (this will soon be fixed -		 * with a lockless solution). -		 * -		 * Because we can not protect against NMIs, and we -		 * want to keep traces reentrant, we need to manage -		 * what happens when we are in an NMI. -		 * -		 * NMIs can happen after we take the lock. -		 * If we are in an NMI, only take the lock -		 * if it is not already taken. Otherwise -		 * simply fail. -		 */ -		if (unlikely(in_nmi())) { -			if (!__raw_spin_trylock(&cpu_buffer->lock)) -				goto out_reset; -		} else -			__raw_spin_lock(&cpu_buffer->lock); +	lock_taken = true; -		lock_taken = true; +	rb_inc_page(cpu_buffer, &next_page); -		rb_inc_page(cpu_buffer, &next_page); +	head_page = cpu_buffer->head_page; +	reader_page = cpu_buffer->reader_page; -		head_page = cpu_buffer->head_page; -		reader_page = cpu_buffer->reader_page; +	/* we grabbed the lock before incrementing */ +	if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) +		goto out_reset; -		/* we grabbed the lock before incrementing */ -		if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) -			goto out_reset; +	/* +	 * If for some reason, we had an interrupt storm that made +	 * it all the way around the buffer, bail, and warn +	 * about it. +	 */ +	if (unlikely(next_page == commit_page)) { +		cpu_buffer->commit_overrun++; +		goto out_reset; +	} -		/* -		 * If for some reason, we had an interrupt storm that made -		 * it all the way around the buffer, bail, and warn -		 * about it. -		 */ -		if (unlikely(next_page == commit_page)) { -			WARN_ON_ONCE(1); +	if (next_page == head_page) { +		if (!(buffer->flags & RB_FL_OVERWRITE))  			goto out_reset; -		} -		if (next_page == head_page) { -			if (!(buffer->flags & RB_FL_OVERWRITE)) -				goto out_reset; - -			/* tail_page has not moved yet? */ -			if (tail_page == cpu_buffer->tail_page) { -				/* count overflows */ -				rb_update_overflow(cpu_buffer); +		/* tail_page has not moved yet? */ +		if (tail_page == cpu_buffer->tail_page) { +			/* count overflows */ +			cpu_buffer->overrun += +				local_read(&head_page->entries); -				rb_inc_page(cpu_buffer, &head_page); -				cpu_buffer->head_page = head_page; -				cpu_buffer->head_page->read = 0; -			} +			rb_inc_page(cpu_buffer, &head_page); +			cpu_buffer->head_page = head_page; +			cpu_buffer->head_page->read = 0;  		} +	} -		/* -		 * If the tail page is still the same as what we think -		 * it is, then it is up to us to update the tail -		 * pointer. -		 */ -		if (tail_page == cpu_buffer->tail_page) { -			local_set(&next_page->write, 0); -			local_set(&next_page->page->commit, 0); -			cpu_buffer->tail_page = next_page; +	/* +	 * If the tail page is still the same as what we think +	 * it is, then it is up to us to update the tail +	 * pointer. 
+	 */ +	if (tail_page == cpu_buffer->tail_page) { +		local_set(&next_page->write, 0); +		local_set(&next_page->entries, 0); +		local_set(&next_page->page->commit, 0); +		cpu_buffer->tail_page = next_page; + +		/* reread the time stamp */ +		*ts = rb_time_stamp(buffer, cpu_buffer->cpu); +		cpu_buffer->tail_page->page->time_stamp = *ts; +	} -			/* reread the time stamp */ -			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); -			cpu_buffer->tail_page->page->time_stamp = *ts; -		} +	/* +	 * The actual tail page has moved forward. +	 */ +	if (tail < BUF_PAGE_SIZE) { +		/* Mark the rest of the page with padding */ +		event = __rb_page_index(tail_page, tail); +		kmemcheck_annotate_bitfield(event, bitfield); +		rb_event_set_padding(event); +	} -		/* -		 * The actual tail page has moved forward. -		 */ -		if (tail < BUF_PAGE_SIZE) { -			/* Mark the rest of the page with padding */ -			event = __rb_page_index(tail_page, tail); -			rb_event_set_padding(event); -		} +	/* Set the write back to the previous setting */ +	local_sub(length, &tail_page->write); -		if (tail <= BUF_PAGE_SIZE) -			/* Set the write back to the previous setting */ -			local_set(&tail_page->write, tail); +	/* +	 * If this was a commit entry that failed, +	 * increment that too +	 */ +	if (tail_page == cpu_buffer->commit_page && +	    tail == rb_commit_index(cpu_buffer)) { +		rb_set_commit_to_write(cpu_buffer); +	} -		/* -		 * If this was a commit entry that failed, -		 * increment that too -		 */ -		if (tail_page == cpu_buffer->commit_page && -		    tail == rb_commit_index(cpu_buffer)) { -			rb_set_commit_to_write(cpu_buffer); -		} +	__raw_spin_unlock(&cpu_buffer->lock); +	local_irq_restore(flags); + +	/* fail and let the caller try again */ +	return ERR_PTR(-EAGAIN); + + out_reset: +	/* reset write */ +	local_sub(length, &tail_page->write); +	if (likely(lock_taken))  		__raw_spin_unlock(&cpu_buffer->lock); -		local_irq_restore(flags); +	local_irq_restore(flags); +	return NULL; +} -		/* fail and let the caller try again */ -		return ERR_PTR(-EAGAIN); -	} +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, +		  unsigned type, unsigned long length, u64 *ts) +{ +	struct buffer_page *tail_page, *commit_page; +	struct ring_buffer_event *event; +	unsigned long tail, write; + +	commit_page = cpu_buffer->commit_page; +	/* we just need to protect against interrupts */ +	barrier(); +	tail_page = cpu_buffer->tail_page; +	write = local_add_return(length, &tail_page->write); +	tail = write - length; + +	/* See if we shot pass the end of this buffer page */ +	if (write > BUF_PAGE_SIZE) +		return rb_move_tail(cpu_buffer, length, tail, +				    commit_page, tail_page, ts);  	/* We reserved something on the buffer */ @@ -1287,8 +1329,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  		return NULL;  	event = __rb_page_index(tail_page, tail); +	kmemcheck_annotate_bitfield(event, bitfield);  	rb_update_event(event, type, length); +	/* The passed in type is zero for DATA */ +	if (likely(!type)) +		local_inc(&tail_page->entries); +  	/*  	 * If this is a commit and the tail is zero, then update  	 * this page's time stamp. 
@@ -1297,16 +1344,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  		cpu_buffer->commit_page->page->time_stamp = *ts;  	return event; +} - out_reset: -	/* reset write */ -	if (tail <= BUF_PAGE_SIZE) -		local_set(&tail_page->write, tail); +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, +		  struct ring_buffer_event *event) +{ +	unsigned long new_index, old_index; +	struct buffer_page *bpage; +	unsigned long index; +	unsigned long addr; -	if (likely(lock_taken)) -		__raw_spin_unlock(&cpu_buffer->lock); -	local_irq_restore(flags); -	return NULL; +	new_index = rb_event_index(event); +	old_index = new_index + rb_event_length(event); +	addr = (unsigned long)event; +	addr &= PAGE_MASK; + +	bpage = cpu_buffer->tail_page; + +	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { +		/* +		 * This is on the tail page. It is possible that +		 * a write could come in and move the tail page +		 * and write to the next page. That is fine +		 * because we just shorten what is on this page. +		 */ +		index = local_cmpxchg(&bpage->write, old_index, new_index); +		if (index == old_index) +			return 1; +	} + +	/* could not discard */ +	return 0;  }  static int @@ -1351,16 +1420,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,  			event->array[0] = *delta >> TS_SHIFT;  		} else {  			cpu_buffer->commit_page->page->time_stamp = *ts; -			event->time_delta = 0; -			event->array[0] = 0; +			/* try to discard, since we do not need this */ +			if (!rb_try_to_discard(cpu_buffer, event)) { +				/* nope, just zero it */ +				event->time_delta = 0; +				event->array[0] = 0; +			}  		}  		cpu_buffer->write_stamp = *ts;  		/* let the caller know this was the commit */  		ret = 1;  	} else { -		/* Darn, this is just wasted space */ -		event->time_delta = 0; -		event->array[0] = 0; +		/* Try to discard the event */ +		if (!rb_try_to_discard(cpu_buffer, event)) { +			/* Darn, this is just wasted space */ +			event->time_delta = 0; +			event->array[0] = 0; +		}  		ret = 0;  	} @@ -1371,13 +1447,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,  static struct ring_buffer_event *  rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, -		      unsigned type, unsigned long length) +		      unsigned long length)  {  	struct ring_buffer_event *event; -	u64 ts, delta; +	u64 ts, delta = 0;  	int commit = 0;  	int nr_loops = 0; +	length = rb_calculate_event_length(length);   again:  	/*  	 * We allow for interrupts to reenter here and do a trace. @@ -1391,7 +1468,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))  		return NULL; -	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); +	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);  	/*  	 * Only the first commit can update the timestamp. @@ -1401,23 +1478,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	 * also be made. But only the entry that did the actual  	 * commit will be something other than zero.  	 
*/ -	if (cpu_buffer->tail_page == cpu_buffer->commit_page && -	    rb_page_write(cpu_buffer->tail_page) == -	    rb_commit_index(cpu_buffer)) { +	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && +		   rb_page_write(cpu_buffer->tail_page) == +		   rb_commit_index(cpu_buffer))) { +		u64 diff; -		delta = ts - cpu_buffer->write_stamp; +		diff = ts - cpu_buffer->write_stamp; -		/* make sure this delta is calculated here */ +		/* make sure this diff is calculated here */  		barrier();  		/* Did the write stamp get updated already? */  		if (unlikely(ts < cpu_buffer->write_stamp)) -			delta = 0; +			goto get_event; -		if (test_time_stamp(delta)) { +		delta = diff; +		if (unlikely(test_time_stamp(delta))) {  			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); -  			if (commit == -EBUSY)  				return NULL; @@ -1426,12 +1504,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  			RB_WARN_ON(cpu_buffer, commit < 0);  		} -	} else -		/* Non commits have zero deltas */ -		delta = 0; +	} -	event = __rb_reserve_next(cpu_buffer, type, length, &ts); -	if (PTR_ERR(event) == -EAGAIN) + get_event: +	event = __rb_reserve_next(cpu_buffer, 0, length, &ts); +	if (unlikely(PTR_ERR(event) == -EAGAIN))  		goto again;  	if (!event) { @@ -1448,7 +1525,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	 * If the timestamp was commited, make the commit our entry  	 * now so that we will update it when needed.  	 */ -	if (commit) +	if (unlikely(commit))  		rb_set_commit_event(cpu_buffer, event);  	else if (!rb_is_commit(cpu_buffer, event))  		delta = 0; @@ -1458,6 +1535,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,  	return event;  } +#define TRACE_RECURSIVE_DEPTH 16 + +static int trace_recursive_lock(void) +{ +	current->trace_recursion++; + +	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) +		return 0; + +	/* Disable all tracing before we do anything else */ +	tracing_off_permanent(); + +	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" +		    "HC[%lu]:SC[%lu]:NMI[%lu]\n", +		    current->trace_recursion, +		    hardirq_count() >> HARDIRQ_SHIFT, +		    softirq_count() >> SOFTIRQ_SHIFT, +		    in_nmi()); + +	WARN_ON_ONCE(1); +	return -1; +} + +static void trace_recursive_unlock(void) +{ +	WARN_ON_ONCE(!current->trace_recursion); + +	current->trace_recursion--; +} +  static DEFINE_PER_CPU(int, rb_need_resched);  /** @@ -1491,6 +1598,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)  	/* If we are tracing schedule, we don't want to recurse */  	resched = ftrace_preempt_disable(); +	if (trace_recursive_lock()) +		goto out_nocheck; +  	cpu = raw_smp_processor_id();  	if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1501,11 +1611,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)  	if (atomic_read(&cpu_buffer->record_disabled))  		goto out; -	length = rb_calculate_event_length(length); -	if (length > BUF_PAGE_SIZE) +	if (length > BUF_MAX_DATA_SIZE)  		goto out; -	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); +	event = rb_reserve_next_event(cpu_buffer, length);  	if (!event)  		goto out; @@ -1520,6 +1629,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)  	return event;   out: +	trace_recursive_unlock(); + + out_nocheck:  	ftrace_preempt_enable(resched);  	return NULL;  } @@ -1528,7 +1640,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,  		      struct 
ring_buffer_event *event)  { -	cpu_buffer->entries++; +	local_inc(&cpu_buffer->entries);  	/* Only process further if we own the commit */  	if (!rb_is_commit(cpu_buffer, event)) @@ -1558,6 +1670,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	trace_recursive_unlock(); +  	/*  	 * Only the last preempt count needs to restore preemption.  	 */ @@ -1570,6 +1684,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  }  EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ +	/* array[0] holds the actual length for the discarded event */ +	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; +	event->type_len = RINGBUF_TYPE_PADDING; +	/* time delta must be non zero */ +	if (!event->time_delta) +		event->time_delta = 1; +} + +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ +	rb_event_discard(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, +				struct ring_buffer_event *event) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	int cpu; + +	/* The event is discarded regardless */ +	rb_event_discard(event); + +	/* +	 * This must only be called if the event has not been +	 * committed yet. Thus we can assume that preemption +	 * is still disabled. +	 */ +	RB_WARN_ON(buffer, preemptible()); + +	cpu = smp_processor_id(); +	cpu_buffer = buffer->buffers[cpu]; + +	if (!rb_try_to_discard(cpu_buffer, event)) +		goto out; + +	/* +	 * The commit is still visible by the reader, so we +	 * must increment entries. +	 */ +	local_inc(&cpu_buffer->entries); + out: +	/* +	 * If a write came in and pushed the tail page +	 * we still need to update the commit pointer +	 * if we were the commit. +	 */ +	if (rb_is_commit(cpu_buffer, event)) +		rb_set_commit_to_write(cpu_buffer); + +	trace_recursive_unlock(); + +	/* +	 * Only the last preempt count needs to restore preemption. +	 */ +	if (preempt_count() == 1) +		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); +	else +		preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); +  /**   * ring_buffer_write - write data to the buffer without reserving   * @buffer: The ring buffer to write to. 
@@ -1589,7 +1796,6 @@ int ring_buffer_write(struct ring_buffer *buffer,  {  	struct ring_buffer_per_cpu *cpu_buffer;  	struct ring_buffer_event *event; -	unsigned long event_length;  	void *body;  	int ret = -EBUSY;  	int cpu, resched; @@ -1612,9 +1818,10 @@ int ring_buffer_write(struct ring_buffer *buffer,  	if (atomic_read(&cpu_buffer->record_disabled))  		goto out; -	event_length = rb_calculate_event_length(length); -	event = rb_reserve_next_event(cpu_buffer, -				      RINGBUF_TYPE_DATA, event_length); +	if (length > BUF_MAX_DATA_SIZE) +		goto out; + +	event = rb_reserve_next_event(cpu_buffer, length);  	if (!event)  		goto out; @@ -1728,7 +1935,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)  		return 0;  	cpu_buffer = buffer->buffers[cpu]; -	ret = cpu_buffer->entries; +	ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) +		- cpu_buffer->read;  	return ret;  } @@ -1755,6 +1963,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)  EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);  /** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long ret; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	ret = cpu_buffer->nmi_dropped; + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long ret; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	ret = cpu_buffer->commit_overrun; + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/**   * ring_buffer_entries - get the number of entries in a buffer   * @buffer: The ring buffer   * @@ -1770,7 +2019,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)  	/* if you care about this being correct, lock the buffer */  	for_each_buffer_cpu(buffer, cpu) {  		cpu_buffer = buffer->buffers[cpu]; -		entries += cpu_buffer->entries; +		entries += (local_read(&cpu_buffer->entries) - +			    cpu_buffer->overrun) - cpu_buffer->read;  	}  	return entries; @@ -1862,7 +2112,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,  {  	u64 delta; -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		return; @@ -1893,7 +2143,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,  {  	u64 delta; -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		return; @@ -1966,6 +2216,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)  	cpu_buffer->reader_page->list.prev = reader->list.prev;  	local_set(&cpu_buffer->reader_page->write, 0); +	local_set(&cpu_buffer->reader_page->entries, 0);  	local_set(&cpu_buffer->reader_page->page->commit, 0);  	/* Make the reader page now replace the head */ @@ -2008,8 +2259,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)  	event = rb_reader_event(cpu_buffer); -	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) -		
cpu_buffer->entries--; +	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX +			|| rb_discarded_event(event)) +		cpu_buffer->read++;  	rb_update_read_stamp(cpu_buffer, event); @@ -2031,8 +2283,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)  	 * Check if we are at the end of the buffer.  	 */  	if (iter->head >= rb_page_size(iter->head_page)) { -		if (RB_WARN_ON(buffer, -			       iter->head_page == cpu_buffer->commit_page)) +		/* discarded commits can make the page empty */ +		if (iter->head_page == cpu_buffer->commit_page)  			return;  		rb_inc_iter(iter);  		return; @@ -2075,12 +2327,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	/*  	 * We repeat when a timestamp is encountered. It is possible  	 * to get multiple timestamps from an interrupt entering just -	 * as one timestamp is about to be written. The max times -	 * that this can happen is the number of nested interrupts we -	 * can have.  Nesting 10 deep of interrupts is clearly -	 * an anomaly. +	 * as one timestamp is about to be written, or from discarded +	 * commits. The most that we can have is the number on a single page.  	 */ -	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) +	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))  		return NULL;  	reader = rb_get_reader_page(cpu_buffer); @@ -2089,7 +2339,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	event = rb_reader_event(cpu_buffer); -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		if (rb_null_event(event))  			RB_WARN_ON(cpu_buffer, 1); @@ -2146,14 +2396,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)   again:  	/* -	 * We repeat when a timestamp is encountered. It is possible -	 * to get multiple timestamps from an interrupt entering just -	 * as one timestamp is about to be written. The max times -	 * that this can happen is the number of nested interrupts we -	 * can have. Nesting 10 deep of interrupts is clearly -	 * an anomaly. +	 * We repeat when a timestamp is encountered. +	 * We can get multiple timestamps by nested interrupts or also +	 * if filtering is on (discarding commits). Since discarding +	 * commits can be frequent we can get a lot of timestamps. +	 * But we limit them by not adding timestamps if they begin +	 * at the start of a page.  	 
*/ -	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) +	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))  		return NULL;  	if (rb_per_cpu_empty(cpu_buffer)) @@ -2161,7 +2411,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  	event = rb_iter_head_event(iter); -	switch (event->type) { +	switch (event->type_len) {  	case RINGBUF_TYPE_PADDING:  		if (rb_null_event(event)) {  			rb_inc_iter(iter); @@ -2220,7 +2470,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	event = rb_buffer_peek(buffer, cpu, ts);  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2248,7 +2498,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  	event = rb_iter_peek(iter, ts);  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2293,7 +2543,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)   out:  	preempt_enable(); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2386,7 +2636,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)   out:  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -	if (event && event->type == RINGBUF_TYPE_PADDING) { +	if (event && event->type_len == RINGBUF_TYPE_PADDING) {  		cpu_relax();  		goto again;  	} @@ -2411,6 +2661,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)  	cpu_buffer->head_page  		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);  	local_set(&cpu_buffer->head_page->write, 0); +	local_set(&cpu_buffer->head_page->entries, 0);  	local_set(&cpu_buffer->head_page->page->commit, 0);  	cpu_buffer->head_page->read = 0; @@ -2420,11 +2671,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);  	local_set(&cpu_buffer->reader_page->write, 0); +	local_set(&cpu_buffer->reader_page->entries, 0);  	local_set(&cpu_buffer->reader_page->page->commit, 0);  	cpu_buffer->reader_page->read = 0; +	cpu_buffer->nmi_dropped = 0; +	cpu_buffer->commit_overrun = 0;  	cpu_buffer->overrun = 0; -	cpu_buffer->entries = 0; +	cpu_buffer->read = 0; +	local_set(&cpu_buffer->entries, 0);  	cpu_buffer->write_stamp = 0;  	cpu_buffer->read_stamp = 0; @@ -2443,6 +2698,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	if (!cpumask_test_cpu(cpu, buffer->cpumask))  		return; +	atomic_inc(&cpu_buffer->record_disabled); +  	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	__raw_spin_lock(&cpu_buffer->lock); @@ -2452,6 +2709,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	__raw_spin_unlock(&cpu_buffer->lock);  	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + +	atomic_dec(&cpu_buffer->record_disabled);  }  EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2578,28 +2837,6 @@ out:  }  EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, -			      struct buffer_data_page *bpage, -			      unsigned int offset) -{ -	struct ring_buffer_event *event; -	unsigned long head; - -	__raw_spin_lock(&cpu_buffer->lock); -	for (head = offset; head < local_read(&bpage->commit); -	     head += rb_event_length(event)) { - -		event = 
__rb_data_page_index(bpage, head); -		if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) -			return; -		/* Only count data entries */ -		if (event->type != RINGBUF_TYPE_DATA) -			continue; -		cpu_buffer->entries--; -	} -	__raw_spin_unlock(&cpu_buffer->lock); -} -  /**   * ring_buffer_alloc_read_page - allocate a page to read from buffer   * @buffer: the buffer to allocate for. @@ -2630,6 +2867,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)  	return bpage;  } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);  /**   * ring_buffer_free_read_page - free an allocated read page @@ -2642,6 +2880,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)  {  	free_page((unsigned long)data);  } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);  /**   * ring_buffer_read_page - extract a page from the ring buffer @@ -2768,16 +3007,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  		/* we copied everything to the beginning */  		read = 0;  	} else { +		/* update the entry counter */ +		cpu_buffer->read += local_read(&reader->entries); +  		/* swap the pages */  		rb_init_page(bpage);  		bpage = reader->page;  		reader->page = *data_page;  		local_set(&reader->write, 0); +		local_set(&reader->entries, 0);  		reader->read = 0;  		*data_page = bpage; - -		/* update the entry counter */ -		rb_remove_entries(cpu_buffer, bpage, read);  	}  	ret = read; @@ -2787,6 +3027,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,   out:  	return ret;  } +EXPORT_SYMBOL_GPL(ring_buffer_read_page);  static ssize_t  rb_simple_read(struct file *filp, char __user *ubuf, @@ -2845,14 +3086,11 @@ static const struct file_operations rb_simple_fops = {  static __init int rb_init_debugfs(void)  {  	struct dentry *d_tracer; -	struct dentry *entry;  	d_tracer = tracing_init_dentry(); -	entry = debugfs_create_file("tracing_on", 0644, d_tracer, -				    &ring_buffer_flags, &rb_simple_fops); -	if (!entry) -		pr_warning("Could not create debugfs 'tracing_on' entry\n"); +	trace_create_file("tracing_on", 0644, d_tracer, +			    &ring_buffer_flags, &rb_simple_fops);  	return 0;  }  |
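A closing note on the other headline addition, ring_buffer_discard_commit(): its helper rb_try_to_discard() reclaims the space of a just-reserved event by rewinding the page's write index with a single local_cmpxchg(), but only if no later reservation has already moved the index past that event. The snippet below is a minimal userspace sketch of that idea using C11 atomics; the names and the one-field page structure are illustrative only, not the kernel's.

/*
 * Sketch of the "try to discard" idea from rb_try_to_discard(), using C11
 * atomics in place of the kernel's local_cmpxchg(). Names and the reduced
 * page structure are illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct sketch_page {
	_Atomic unsigned long write;	/* index of the next byte to write on this page */
};

/*
 * A just-reserved event ends at old_index == event_start + length. If the
 * page's write index still sits exactly there, nothing was reserved after
 * it, and one compare-and-exchange can rewind write back to event_start.
 */
static bool try_to_discard(struct sketch_page *page,
			   unsigned long event_start, unsigned long length)
{
	unsigned long old_index = event_start + length;

	return atomic_compare_exchange_strong(&page->write, &old_index,
					      event_start);
}

int main(void)
{
	struct sketch_page page = { .write = 128 };

	/* Event reserved at offset 96, 32 bytes long: it is still the last
	 * thing on the page, so the discard succeeds and write rewinds to 96. */
	printf("discard: %s, write=%lu\n",
	       try_to_discard(&page, 96, 32) ? "ok" : "kept as padding",
	       atomic_load(&page.write));
	return 0;
}

When the compare-and-exchange fails because something was reserved behind the event, the space cannot be reclaimed; the patch instead rewrites the event in place as RINGBUF_TYPE_PADDING with a non-zero time_delta (rb_event_discard()) so that readers simply skip over it.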