diff options
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r-- | kernel/trace/ring_buffer.c | 1562 |
1 files changed, 1125 insertions, 437 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd38c5cfd8ad..a330513d96ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,21 +4,115 @@ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ #include <linux/ring_buffer.h> +#include <linux/trace_clock.h> +#include <linux/ftrace_irq.h> #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> -#include <linux/sched.h> /* used for sched_clock() (for now) */ #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> +#include <linux/cpu.h> #include <linux/fs.h> #include "trace.h" /* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + + return ret; +} + +/* + * The ring buffer is made up of a list of pages. A separate list of pages is + * allocated for each CPU. A writer may only write to a buffer that is + * associated with the CPU it is currently executing on. A reader may read + * from any per cpu buffer. + * + * The reader is special. For each per cpu buffer, the reader has its own + * reader page. When a reader has read the entire reader page, this reader + * page is swapped with another page in the ring buffer. + * + * Now, as long as the writer is off the reader page, the reader can do what + * ever it wants with that page. The writer will never write to that page + * again (as long as it is out of the ring buffer). + * + * Here's some silly ASCII art. + * + * +------+ + * |reader| RING BUFFER + * |page | + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | |-->| |-->| | + * | +---+ +---+ +---+ + * | | + * | | + * +------------------------------+ + * + * + * +------+ + * |buffer| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | | | |-->| | + * | New +---+ +---+ +---+ + * | Reader------^ | + * | page | + * +------------------------------+ + * + * + * After we make this swap, the reader can hand this page off to the splice + * code and be done with it. It can even allocate a new page if it needs to + * and swap that into the ring buffer. + * + * We will be using cmpxchg soon to make all this lockless. + * + */ + +/* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers * prevents all ring buffers from being recorded to. @@ -57,7 +151,9 @@ enum { RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, }; -static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; +static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) /** * tracing_on - enable all tracing buffers @@ -89,59 +185,76 @@ EXPORT_SYMBOL_GPL(tracing_off); * tracing_off_permanent - permanently disable ring buffers * * This function, once called, will disable all ring buffers - * permanenty. + * permanently. */ void tracing_off_permanent(void) { set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return ring_buffer_flags == RB_BUFFERS_ON; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + #include "trace.h" -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -/* FIXME!!! */ -u64 ring_buffer_time_stamp(int cpu) -{ - u64 time; +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX - preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = sched_clock() << DEBUG_SHIFT; - preempt_enable_no_resched_notrace(); +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; - return time; +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +static inline int rb_discarded_event(struct ring_buffer_event *event) { - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); -#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) -#define RB_ALIGNMENT_SHIFT 2 -#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) -#define RB_MAX_SMALL_DATA 28 +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type_len = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} -enum { - RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, -}; +static unsigned +rb_event_data_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} /* inline for ring buffer fast paths */ -static inline unsigned +static unsigned rb_event_length(struct ring_buffer_event *event) { - unsigned length; - - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -150,11 +263,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len << RB_ALIGNMENT_SHIFT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -169,7 +278,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -179,12 +288,12 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ -static inline void * +static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -209,14 +318,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t commit; /* write commited index */ + local_t commit; /* write committed index */ unsigned char data[]; /* data of buffer page */ }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -225,14 +335,25 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +/** + * ring_buffer_page_len - the size of data on the page. + * @page: The page to read + * + * Returns the amount of data on the page, including buffer page header. + */ +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit) + + BUF_PAGE_HDR_SIZE; +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. */ -static inline void free_buffer_page(struct buffer_page *bpage) +static void free_buffer_page(struct buffer_page *bpage) { - if (bpage->page) - free_page((unsigned long)bpage->page); + free_page((unsigned long)bpage->page); kfree(bpage); } @@ -246,7 +367,35 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) + +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} /* * head_page == tail_page && head == tail then buffer is empty. @@ -260,10 +409,15 @@ struct ring_buffer_per_cpu { struct list_head pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ - struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; + local_t committing; + local_t commits; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -273,12 +427,19 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_var_t cpumask; atomic_t record_disabled; + cpumask_var_t cpumask; + + struct lock_class_key *reader_lock_key; struct mutex mutex; struct ring_buffer_per_cpu **buffers; + +#ifdef CONFIG_HOTPLUG_CPU + struct notifier_block cpu_notify; +#endif + u64 (*clock)(void); }; struct ring_buffer_iter { @@ -299,11 +460,40 @@ struct ring_buffer_iter { _____ret; \ }) +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + time = rb_time_stamp(buffer, cpu); + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * - * As a safty measure we check to make sure the data pages have not + * As a safety measure we check to make sure the data pages have not * been corrupted. */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) @@ -381,6 +571,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -421,7 +612,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; - list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); list_for_each_entry_safe(bpage, tmp, head, list) { @@ -431,11 +621,10 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -/* - * Causes compile errors if the struct buffer_page gets bigger - * than the struct page. - */ -extern int ring_buffer_page_too_big(void); +#ifdef CONFIG_HOTPLUG_CPU +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); +#endif /** * ring_buffer_alloc - allocate a new ring_buffer @@ -447,17 +636,13 @@ extern int ring_buffer_page_too_big(void); * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) { struct ring_buffer *buffer; int bsize; int cpu; - /* Paranoid! Optimizes out when all is well */ - if (sizeof(struct buffer_page) > sizeof(struct page)) - ring_buffer_page_too_big(); - - /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); @@ -469,12 +654,24 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; + buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; /* need at least two pages */ - if (buffer->pages == 1) - buffer->pages++; + if (buffer->pages < 2) + buffer->pages = 2; + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU + get_online_cpus(); + cpumask_copy(buffer->cpumask, cpu_online_mask); +#else cpumask_copy(buffer->cpumask, cpu_possible_mask); +#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -490,6 +687,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } +#ifdef CONFIG_HOTPLUG_CPU + buffer->cpu_notify.notifier_call = rb_cpu_notify; + buffer->cpu_notify.priority = 0; + register_cpu_notifier(&buffer->cpu_notify); +#endif + + put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -503,12 +707,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) fail_free_cpumask: free_cpumask_var(buffer->cpumask); + put_online_cpus(); fail_free_buffer: kfree(buffer); return NULL; } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. @@ -519,15 +724,30 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; + get_online_cpus(); + +#ifdef CONFIG_HOTPLUG_CPU + unregister_cpu_notifier(&buffer->cpu_notify); +#endif + for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + put_online_cpus(); + + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void @@ -627,16 +847,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; mutex_lock(&buffer->mutex); + get_online_cpus(); nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); if (size < buffer_size) { /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) + goto out_fail; rm_pages = buffer->pages - nr_pages; @@ -655,10 +874,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. Otherwise we just free * them all and return -ENOMEM; */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) + goto out_fail; new_pages = nr_pages - buffer->pages; @@ -683,13 +900,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - if (RB_WARN_ON(buffer, !list_empty(&pages))) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, !list_empty(&pages))) + goto out_fail; out: buffer->pages = nr_pages; + put_online_cpus(); mutex_unlock(&buffer->mutex); return size; @@ -699,15 +915,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) list_del_init(&bpage->list); free_buffer_page(bpage); } + put_online_cpus(); mutex_unlock(&buffer->mutex); return -ENOMEM; -} -EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; + /* + * Something went totally wrong, and we are too paranoid + * to even clean up the mess. + */ + out_fail: + put_online_cpus(); + mutex_unlock(&buffer->mutex); + return -1; } +EXPORT_SYMBOL_GPL(ring_buffer_resize); static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) @@ -767,31 +988,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) - continue; - cpu_buffer->overrun++; - cpu_buffer->entries--; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -808,12 +1004,12 @@ rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } static inline int -rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; unsigned long index; @@ -825,32 +1021,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static inline void -rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - while (cpu_buffer->commit_page->page != (void *)addr) { - if (RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page)) - return; - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - } - - /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->page->commit, index); -} - -static inline void +static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { /* @@ -896,7 +1067,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; } -static inline void rb_inc_iter(struct ring_buffer_iter *iter) +static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -926,45 +1097,32 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) * and with this, we can determine what to place into the * data field. */ -static inline void +static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = - (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = - (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = - (length + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); } } -static inline unsigned rb_calculate_event_length(unsigned length) +static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -981,133 +1139,241 @@ static inline unsigned rb_calculate_event_length(unsigned length) return length; } -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +static inline void +rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + unsigned long tail, unsigned long length) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - unsigned long tail, write; - struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; + /* + * Only the event that crossed the page boundary + * must fill the old tail_page with padding. + */ + if (tail >= BUF_PAGE_SIZE) { + local_sub(length, &tail_page->write); + return; + } - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); - local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * If this event is bigger than the minimum size, then + * we need to be careful that we don't subtract the + * write counter enough to allow another writer to slip + * in on this page. + * We put in a discarded commit instead, to make sure + * that this space is not used again. + * + * If we are less than the minimum size, we don't need to + * worry about it. + */ + if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + /* No room for any events */ - rb_inc_page(cpu_buffer, &next_page); + /* Mark the rest of the page with padding */ + rb_event_set_padding(event); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); + return; + } - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_unlock; + /* Put in a discarded event */ + event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + event->time_delta = 1; + /* Account for this as an entry */ + local_inc(&tail_page->entries); + local_inc(&cpu_buffer->entries); - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); - goto out_unlock; - } + /* Set write to end of buffer */ + length = (tail + length) - BUF_PAGE_SIZE; + local_sub(length, &tail_page->write); +} - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_unlock; +static struct ring_buffer_event * +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) +{ + struct buffer_page *next_page, *head_page, *reader_page; + struct ring_buffer *buffer = cpu_buffer->buffer; + bool lock_taken = false; + unsigned long flags; - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - rb_update_overflow(cpu_buffer); + next_page = tail_page; - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; } + } else + __raw_spin_lock(&cpu_buffer->lock); - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ + lock_taken = true; + + rb_inc_page(cpu_buffer, &next_page); + + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; + + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; + } + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; } + } - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; - } + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + rb_reset_tail(cpu_buffer, tail_page, tail, length); - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + + out_reset: + /* reset write */ + rb_reset_tail(cpu_buffer, tail_page, tail, length); + if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + local_irq_restore(flags); + return NULL; +} - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); - } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; - /* We reserved something on the buffer */ + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); + + /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. + * If this is the first commit on the page, then update + * its timestamp. */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; + if (!tail) + tail_page->page->time_stamp = *ts; return event; +} - out_unlock: - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); - return NULL; + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + return 1; + } + + /* could not discard */ + return 0; } static int @@ -1142,26 +1408,33 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return -EAGAIN; /* Only a commited time event can update the write stamp */ - if (rb_is_commit(cpu_buffer, event)) { + if (rb_event_is_commit(cpu_buffer, event)) { /* - * If this is the first on the page, then we need to - * update the page itself, and just put in a zero. + * If this is the first on the page, then it was + * updated with the page itself. Try to discard it + * and if we can't just make it zero. */ if (rb_event_index(event)) { event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->page->time_stamp = *ts; - event->time_delta = 0; - event->array[0] = 0; + /* try to discard, since we do not need this */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } } cpu_buffer->write_stamp = *ts; /* let the caller know this was the commit */ ret = 1; } else { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; + /* Try to discard the event */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + } ret = 0; } @@ -1170,15 +1443,56 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return ret; } +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. + */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length) + unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + u64 ts, delta = 0; int commit = 0; int nr_loops = 0; + rb_start_commit(cpu_buffer); + + length = rb_calculate_event_length(length); again: /* * We allow for interrupts to reenter here and do a trace. @@ -1190,9 +1504,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - return NULL; + goto out_fail; - ts = ring_buffer_time_stamp(cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -1202,70 +1516,99 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * also be made. But only the entry that did the actual * commit will be something other than zero. */ - if (cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer)) { + if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer))) { + u64 diff; - delta = ts - cpu_buffer->write_stamp; + diff = ts - cpu_buffer->write_stamp; - /* make sure this delta is calculated here */ + /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ if (unlikely(ts < cpu_buffer->write_stamp)) - delta = 0; + goto get_event; - if (test_time_stamp(delta)) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) - return NULL; + goto out_fail; if (commit == -EAGAIN) goto again; RB_WARN_ON(cpu_buffer, commit < 0); } - } else - /* Non commits have zero deltas */ - delta = 0; + } - event = __rb_reserve_next(cpu_buffer, type, length, &ts); - if (PTR_ERR(event) == -EAGAIN) + get_event: + event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; - if (!event) { - if (unlikely(commit)) - /* - * Ouch! We needed a timestamp and it was commited. But - * we didn't get our event reserved. - */ - rb_set_commit_to_write(cpu_buffer); - return NULL; - } + if (!event) + goto out_fail; - /* - * If the timestamp was commited, make the commit our entry - * now so that we will update it when needed. - */ - if (commit) - rb_set_commit_event(cpu_buffer, event); - else if (!rb_is_commit(cpu_buffer, event)) + if (!rb_event_is_commit(cpu_buffer, event)) delta = 0; event->time_delta = delta; return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +#ifdef CONFIG_TRACING + +#define TRACE_RECURSIVE_DEPTH 16 + +static int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + + WARN_ON_ONCE(1); + return -1; +} + +static void trace_recursive_unlock(void) +{ + WARN_ON_ONCE(!current->trace_recursion); + + current->trace_recursion--; } +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + static DEFINE_PER_CPU(int, rb_need_resched); /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) - * @flags: a pointer to save the interrupt flags * * Returns a reseverd event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into @@ -1278,9 +1621,7 @@ static DEFINE_PER_CPU(int, rb_need_resched); * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags) +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1295,6 +1636,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1305,11 +1649,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - length = rb_calculate_event_length(length); - if (length > BUF_PAGE_SIZE) + if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1324,6 +1667,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1332,30 +1678,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); - /* Only process further if we own the commit */ - if (!rb_is_commit(cpu_buffer, event)) - return; - - cpu_buffer->write_stamp += event->time_delta; + /* + * The event first in the commit queue updates the + * time stamp. + */ + if (rb_event_is_commit(cpu_buffer, event)) + cpu_buffer->write_stamp += event->time_delta; - rb_set_commit_to_write(cpu_buffer); + rb_end_commit(cpu_buffer); } /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * @event: The event pointer to commit. - * @flags: the interrupt flags received from ring_buffer_lock_reserve. * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags) + struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); @@ -1364,6 +1709,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ @@ -1376,6 +1723,93 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + rb_event_discard(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* The event is discarded regardless */ + rb_event_discard(event); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); + + if (rb_try_to_discard(cpu_buffer, event)) + goto out; + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + local_inc(&cpu_buffer->entries); + out: + rb_end_commit(cpu_buffer); + + trace_recursive_unlock(); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. @@ -1395,7 +1829,6 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length; void *body; int ret = -EBUSY; int cpu, resched; @@ -1418,9 +1851,10 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1438,7 +1872,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = cpu_buffer->head_page; @@ -1528,12 +1962,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -1545,16 +1983,60 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->overrun; + ret = cpu_buffer->overrun; + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * @@ -1570,7 +2052,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -1627,9 +2110,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -1657,7 +2145,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -1688,7 +2176,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -1761,6 +2249,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -1803,8 +2292,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) - cpu_buffer->entries--; + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -1826,8 +2316,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - if (RB_WARN_ON(buffer, - iter->head_page == cpu_buffer->commit_page)) + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; @@ -1864,21 +2354,16 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; - cpu_buffer = buffer->buffers[cpu]; again: /* * We repeat when a timestamp is encountered. It is possible * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * as one timestamp is about to be written, or from discarded + * commits. The most that we can have is the number on a single page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -1887,11 +2372,19 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); - rb_advance_reader(cpu_buffer); - return NULL; + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. + */ + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -1906,7 +2399,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -1934,14 +2428,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * We repeat when a timestamp is encountered. + * We can get multiple timestamps by nested interrupts or also + * if filtering is on (discarding commits). Since discarding + * commits can be frequent we can get a lot of timestamps. + * But we limit them by not adding timestamps if they begin + * at the start of a page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -1949,10 +2443,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -1967,7 +2465,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -1979,6 +2478,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); +static inline int rb_ok_to_lock(void) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. + */ + if (likely(!in_nmi())) + return 1; + + tracing_off_permanent(); + return 0; +} + /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read @@ -1994,10 +2508,27 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; + int dolock; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return NULL; + + dolock = rb_ok_to_lock(); + again: + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + if (event && event->type_len == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } return event; } @@ -2017,10 +2548,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type_len == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2035,23 +2572,40 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event = NULL; unsigned long flags; + int dolock; + + dolock = rb_ok_to_lock(); + + again: + /* might be called in atomic */ + preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out; + if (event) + rb_advance_reader(cpu_buffer); - rb_advance_reader(cpu_buffer); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + preempt_enable(); + + if (event && event->type_len == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } return event; } @@ -2131,6 +2685,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2140,6 +2695,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type_len == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); @@ -2160,6 +2720,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2169,11 +2730,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); + local_set(&cpu_buffer->committing, 0); + local_set(&cpu_buffer->commits, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2192,6 +2759,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2201,6 +2770,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2224,14 +2795,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); int ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int dolock; int cpu; + int ret; + + dolock = rb_ok_to_lock(); /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - if (!rb_per_cpu_empty(cpu_buffer)) + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); + ret = rb_per_cpu_empty(cpu_buffer); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + if (!ret) return 0; } + return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2244,12 +2829,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int dolock; + int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 1; + dolock = rb_ok_to_lock(); + cpu_buffer = buffer->buffers[cpu]; - return rb_per_cpu_empty(cpu_buffer); + local_irq_save(flags); + if (dolock) + spin_lock(&cpu_buffer->reader_lock); + ret = rb_per_cpu_empty(cpu_buffer); + if (dolock) + spin_unlock(&cpu_buffer->reader_lock); + local_irq_restore(flags); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); @@ -2268,18 +2866,36 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; + int ret = -EINVAL; if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - return -EINVAL; + goto out; /* At least make sure the two buffers are somewhat the same */ if (buffer_a->pages != buffer_b->pages) - return -EINVAL; + goto out; + + ret = -EAGAIN; + + if (ring_buffer_flags != RB_BUFFERS_ON) + goto out; + + if (atomic_read(&buffer_a->record_disabled)) + goto out; + + if (atomic_read(&buffer_b->record_disabled)) + goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + if (atomic_read(&cpu_buffer_a->record_disabled)) + goto out; + + if (atomic_read(&cpu_buffer_b->record_disabled)) + goto out; + /* * We can't do a synchronize_sched here because this * function can be called in atomic context. @@ -2298,31 +2914,12 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - return 0; + ret = 0; +out: + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = 0; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) - continue; - cpu_buffer->entries--; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. @@ -2340,8 +2937,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { - unsigned long addr; struct buffer_data_page *bpage; + unsigned long addr; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -2349,8 +2946,11 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) bpage = (void *)addr; + rb_init_page(bpage); + return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2363,11 +2963,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * @@ -2377,12 +2979,12 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * to swap with a page in the ring buffer. * * for example: - * rpage = ring_buffer_alloc_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; - * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); - * if (ret) - * process_page(rpage); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * if (ret >= 0) + * process_page(rpage, ret); * * When @full is set, the function will not return true unless * the writer is off the reader page. @@ -2393,80 +2995,129 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * responsible for that. * * Returns: - * 1 if data has been transferred - * 0 if no data has been transferred. + * >=0 if data has been transferred, returns the offset of consumed data. + * <0 if no data has been transferred. */ int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full) + void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; + struct buffer_page *reader; unsigned long flags; - int ret = 0; + unsigned int commit; + unsigned int read; + u64 save_timestamp; + int ret = -1; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + goto out; + + len -= BUF_PAGE_HDR_SIZE; if (!data_page) - return 0; + goto out; bpage = *data_page; if (!bpage) - return 0; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * rb_buffer_peek will get the next ring buffer if - * the current reader page is empty. - */ - event = rb_buffer_peek(buffer, cpu, NULL); - if (!event) - goto out; + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + goto out_unlock; + + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); - /* check for data */ - if (!local_read(&cpu_buffer->reader_page->page->commit)) - goto out; /* - * If the writer is already off of the read page, then simply - * switch the read page with the given page. Otherwise - * we need to copy the data from the reader to the writer. + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. */ - if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int read = cpu_buffer->reader_page->read; + if (read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { + struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int rpos = read; + unsigned int pos = 0; + unsigned int size; if (full) - goto out; - /* The writer is still on the reader page, we must copy */ - bpage = cpu_buffer->reader_page->page; - memcpy(bpage->data, - cpu_buffer->reader_page->page->data + read, - local_read(&bpage->commit) - read); + goto out_unlock; + + if (len > (commit - read)) + len = (commit - read); - /* consume what was read */ - cpu_buffer->reader_page += read; + size = rb_event_length(event); + if (len < size) + goto out_unlock; + + /* save the current timestamp, since the user will need it */ + save_timestamp = cpu_buffer->read_stamp; + + /* Need to copy one event at a time */ + do { + memcpy(bpage->data + pos, rpage->data + rpos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + rpos = reader->read; + pos += size; + + event = rb_reader_event(cpu_buffer); + size = rb_event_length(event); + } while (len > size); + + /* update bpage */ + local_set(&bpage->commit, pos); + bpage->time_stamp = save_timestamp; + + /* we copied everything to the beginning */ + read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); - bpage = cpu_buffer->reader_page->page; - cpu_buffer->reader_page->page = *data_page; - cpu_buffer->reader_page->read = 0; + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + local_set(&reader->entries, 0); + reader->read = 0; *data_page = bpage; } - ret = 1; + ret = read; - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage); - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long *p = filp->private_data; + unsigned long *p = filp->private_data; char buf[64]; int r; @@ -2482,9 +3133,9 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - long *p = filp->private_data; + unsigned long *p = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) @@ -2509,7 +3160,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations rb_simple_fops = { +static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, @@ -2519,16 +3170,53 @@ static struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); return 0; } fs_initcall(rb_init_debugfs); +#endif + +#ifdef CONFIG_HOTPLUG_CPU +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct ring_buffer *buffer = + container_of(self, struct ring_buffer, cpu_notify); + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return NOTIFY_OK; + + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %ld\n", + cpu); + return NOTIFY_OK; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Do nothing. + * If we were to free the buffer, then the user would + * lose any trace that was in the buffer. + */ + break; + default: + break; + } + return NOTIFY_OK; +} +#endif |