Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c | 770
1 file changed, 495 insertions(+), 275 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d1552d3c12b..bd7ce8ca5bb9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -16,6 +16,7 @@  #include <linux/file.h>  #include <linux/poll.h>  #include <linux/slab.h> +#include <linux/hash.h>  #include <linux/sysfs.h>  #include <linux/dcache.h>  #include <linux/percpu.h> @@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)  void __weak hw_perf_disable(void)		{ barrier(); }  void __weak hw_perf_enable(void)		{ barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, -	       struct perf_cpu_context *cpuctx, -	       struct perf_event_context *ctx) -{ -	return 0; -} -  void __weak perf_event_print_debug(void)	{ }  static DEFINE_PER_CPU(int, perf_disable_count); @@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)  	event->total_time_running = run_end - event->tstamp_running;  } +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ +	struct perf_event *event; + +	update_event_times(leader); +	list_for_each_entry(event, &leader->sibling_list, group_entry) +		update_event_times(event); +} +  static struct list_head *  ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)  { @@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)  static void  list_del_event(struct perf_event *event, struct perf_event_context *ctx)  { -	struct perf_event *sibling, *tmp; -  	if (list_empty(&event->group_entry))  		return;  	ctx->nr_events--; @@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)  	if (event->group_leader != event)  		event->group_leader->nr_siblings--; -	update_event_times(event); +	update_group_times(event);  	/*  	 * If event was in error state, then keep it @@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)  	 */  	if (event->state > PERF_EVENT_STATE_OFF)  		event->state = PERF_EVENT_STATE_OFF; +} + +static void +perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +{ +	struct perf_event *sibling, *tmp;  	/*  	 * If this was a group event with sibling events then @@ -505,18 +514,6 @@ retry:  }  /* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ -	struct perf_event *event; - -	update_event_times(leader); -	list_for_each_entry(event, &leader->sibling_list, group_entry) -		update_event_times(event); -} - -/*   * Cross CPU call to disable a performance event   */  static void __perf_event_disable(void *info) @@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,  	       struct perf_cpu_context *cpuctx,  	       struct perf_event_context *ctx)  { -	struct perf_event *event, *partial_group; +	struct perf_event *event, *partial_group = NULL; +	const struct pmu *pmu = group_event->pmu; +	bool txn = false;  	int ret;  	if (group_event->state == PERF_EVENT_STATE_OFF)  		return 0; -	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); -	if (ret) -		return ret < 0 ? 
ret : 0; +	/* Check if group transaction availabe */ +	if (pmu->start_txn) +		txn = true; + +	if (txn) +		pmu->start_txn(pmu);  	if (event_sched_in(group_event, cpuctx, ctx))  		return -EAGAIN; @@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,  		}  	} -	return 0; +	if (!txn) +		return 0; + +	ret = pmu->commit_txn(pmu); +	if (!ret) { +		pmu->cancel_txn(pmu); +		return 0; +	}  group_error: +	if (txn) +		pmu->cancel_txn(pmu); +  	/*  	 * Groups can be scheduled in as one unit only, so undo any  	 * partial group before returning: @@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)  	if (cpuctx->task_ctx == ctx)  		return; +	perf_disable(); +  	/*  	 * We want to keep the following priority order:  	 * cpu pinned (that don't need to move), task pinned, @@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)  	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);  	cpuctx->task_ctx = ctx; + +	perf_enable();  }  #define MAX_INTERRUPTS (~0ULL) @@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)  {  	struct perf_event_context *ctx = event->ctx; +	/* +	 * Remove from the PMU, can't get re-enabled since we got +	 * here because the last ref went. +	 */ +	perf_event_disable(event); +  	WARN_ON_ONCE(ctx->parent_ctx); -	mutex_lock(&ctx->mutex); -	perf_event_remove_from_context(event); +	/* +	 * There are two ways this annotation is useful: +	 * +	 *  1) there is a lock recursion from perf_event_exit_task +	 *     see the comment there. +	 * +	 *  2) there is a lock-inversion with mmap_sem through +	 *     perf_event_read_group(), which takes faults while +	 *     holding ctx->mutex, however this is called after +	 *     the last filedesc died, so there is no possibility +	 *     to trigger the AB-BA case. +	 */ +	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); +	raw_spin_lock_irq(&ctx->lock); +	list_del_event(event, ctx); +	perf_destroy_group(event, ctx); +	raw_spin_unlock_irq(&ctx->lock);  	mutex_unlock(&ctx->mutex);  	mutex_lock(&event->owner->perf_event_mutex); @@ -2260,11 +2297,6 @@ unlock:  	rcu_read_unlock();  } -static unsigned long perf_data_size(struct perf_mmap_data *data) -{ -	return data->nr_pages << (PAGE_SHIFT + data->data_order); -} -  #ifndef CONFIG_PERF_USE_VMALLOC  /* @@ -2283,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)  	return virt_to_page(data->data_pages[pgoff - 1]);  } +static void *perf_mmap_alloc_page(int cpu) +{ +	struct page *page; +	int node; + +	node = (cpu == -1) ? 
cpu : cpu_to_node(cpu); +	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); +	if (!page) +		return NULL; + +	return page_address(page); +} +  static struct perf_mmap_data *  perf_mmap_data_alloc(struct perf_event *event, int nr_pages)  { @@ -2299,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)  	if (!data)  		goto fail; -	data->user_page = (void *)get_zeroed_page(GFP_KERNEL); +	data->user_page = perf_mmap_alloc_page(event->cpu);  	if (!data->user_page)  		goto fail_user_page;  	for (i = 0; i < nr_pages; i++) { -		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); +		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);  		if (!data->data_pages[i])  			goto fail_data_pages;  	} -	data->data_order = 0;  	data->nr_pages = nr_pages;  	return data; @@ -2345,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)  	kfree(data);  } +static inline int page_order(struct perf_mmap_data *data) +{ +	return 0; +} +  #else  /* @@ -2353,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)   * Required for architectures that have d-cache aliasing issues.   */ +static inline int page_order(struct perf_mmap_data *data) +{ +	return data->page_order; +} +  static struct page *  perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)  { -	if (pgoff > (1UL << data->data_order)) +	if (pgoff > (1UL << page_order(data)))  		return NULL;  	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); @@ -2376,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)  	int i, nr;  	data = container_of(work, struct perf_mmap_data, work); -	nr = 1 << data->data_order; +	nr = 1 << page_order(data);  	base = data->user_page;  	for (i = 0; i < nr + 1; i++) @@ -2415,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)  	data->user_page = all_buf;  	data->data_pages[0] = all_buf + PAGE_SIZE; -	data->data_order = ilog2(nr_pages); +	data->page_order = ilog2(nr_pages);  	data->nr_pages = 1;  	return data; @@ -2429,6 +2483,11 @@ fail:  #endif +static unsigned long perf_data_size(struct perf_mmap_data *data) +{ +	return data->nr_pages << (PAGE_SHIFT + page_order(data)); +} +  static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct perf_event *event = vma->vm_file->private_data; @@ -2469,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)  {  	long max_size = perf_data_size(data); -	atomic_set(&data->lock, -1); -  	if (event->attr.watermark) {  		data->watermark = min_t(long, max_size,  					event->attr.wakeup_watermark); @@ -2543,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)  	long user_extra, extra;  	int ret = 0; +	/* +	 * Don't allow mmap() of inherited per-task counters. This would +	 * create a performance issue due to all children writing to the +	 * same buffer. +	 */ +	if (event->cpu == -1 && event->attr.inherit) +		return -EINVAL; +  	if (!(vma->vm_flags & VM_SHARED))  		return -EINVAL; @@ -2642,6 +2707,7 @@ static int perf_fasync(int fd, struct file *filp, int on)  }  static const struct file_operations perf_fops = { +	.llseek			= no_llseek,  	.release		= perf_release,  	.read			= perf_read,  	.poll			= perf_poll, @@ -2792,6 +2858,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski  /* + * We assume there is only KVM supporting the callbacks. 
+ * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ +	perf_guest_cbs = cbs; +	return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ +	perf_guest_cbs = NULL; +	return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + +/*   * Output   */  static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, @@ -2826,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)  }  /* - * Curious locking construct. - *   * We need to ensure a later event_id doesn't publish a head when a former - * event_id isn't done writing. However since we need to deal with NMIs we + * event isn't done writing. However since we need to deal with NMIs we   * cannot fully serialize things.   * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - *   * We only publish the head (and generate a wakeup) when the outer-most - * event_id completes. + * event completes.   */ -static void perf_output_lock(struct perf_output_handle *handle) +static void perf_output_get_handle(struct perf_output_handle *handle)  {  	struct perf_mmap_data *data = handle->data; -	int cur, cpu = get_cpu(); -	handle->locked = 0; - -	for (;;) { -		cur = atomic_cmpxchg(&data->lock, -1, cpu); -		if (cur == -1) { -			handle->locked = 1; -			break; -		} -		if (cur == cpu) -			break; - -		cpu_relax(); -	} +	preempt_disable(); +	local_inc(&data->nest); +	handle->wakeup = local_read(&data->wakeup);  } -static void perf_output_unlock(struct perf_output_handle *handle) +static void perf_output_put_handle(struct perf_output_handle *handle)  {  	struct perf_mmap_data *data = handle->data;  	unsigned long head; -	int cpu; - -	data->done_head = data->head; - -	if (!handle->locked) -		goto out;  again: -	/* -	 * The xchg implies a full barrier that ensures all writes are done -	 * before we publish the new head, matched by a rmb() in userspace when -	 * reading this position. -	 */ -	while ((head = atomic_long_xchg(&data->done_head, 0))) -		data->user_page->data_head = head; +	head = local_read(&data->head);  	/* -	 * NMI can happen here, which means we can miss a done_head update. +	 * IRQ/NMI can happen here, which means we can miss a head update.  	 */ -	cpu = atomic_xchg(&data->lock, -1); -	WARN_ON_ONCE(cpu != smp_processor_id()); +	if (!local_dec_and_test(&data->nest)) +		goto out;  	/* -	 * Therefore we have to validate we did not indeed do so. +	 * Publish the known good head. Rely on the full barrier implied +	 * by atomic_dec_and_test() order the data->head read and this +	 * write.  	 */ -	if (unlikely(atomic_long_read(&data->done_head))) { -		/* -		 * Since we had it locked, we can lock it again. -		 */ -		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) -			cpu_relax(); +	data->user_page->data_head = head; +	/* +	 * Now check if we missed an update, rely on the (compiler) +	 * barrier in atomic_dec_and_test() to re-read data->head. 
+	 */ +	if (unlikely(head != local_read(&data->head))) { +		local_inc(&data->nest);  		goto again;  	} -	if (atomic_xchg(&data->wakeup, 0)) +	if (handle->wakeup != local_read(&data->wakeup))  		perf_output_wakeup(handle); -out: -	put_cpu(); + + out: +	preempt_enable();  } -void perf_output_copy(struct perf_output_handle *handle, +__always_inline void perf_output_copy(struct perf_output_handle *handle,  		      const void *buf, unsigned int len)  { -	unsigned int pages_mask; -	unsigned long offset; -	unsigned int size; -	void **pages; - -	offset		= handle->offset; -	pages_mask	= handle->data->nr_pages - 1; -	pages		= handle->data->data_pages; -  	do { -		unsigned long page_offset; -		unsigned long page_size; -		int nr; +		unsigned long size = min_t(unsigned long, handle->size, len); -		nr	    = (offset >> PAGE_SHIFT) & pages_mask; -		page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT); -		page_offset = offset & (page_size - 1); -		size	    = min_t(unsigned int, page_size - page_offset, len); +		memcpy(handle->addr, buf, size); -		memcpy(pages[nr] + page_offset, buf, size); +		len -= size; +		handle->addr += size; +		handle->size -= size; +		if (!handle->size) { +			struct perf_mmap_data *data = handle->data; -		len	    -= size; -		buf	    += size; -		offset	    += size; +			handle->page++; +			handle->page &= data->nr_pages - 1; +			handle->addr = data->data_pages[handle->page]; +			handle->size = PAGE_SIZE << page_order(data); +		}  	} while (len); - -	handle->offset = offset; - -	/* -	 * Check we didn't copy past our reservation window, taking the -	 * possible unsigned int wrap into account. -	 */ -	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);  }  int perf_output_begin(struct perf_output_handle *handle, @@ -2977,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,  	handle->sample	= sample;  	if (!data->nr_pages) -		goto fail; +		goto out; -	have_lost = atomic_read(&data->lost); +	have_lost = local_read(&data->lost);  	if (have_lost)  		size += sizeof(lost_event); -	perf_output_lock(handle); +	perf_output_get_handle(handle);  	do {  		/* @@ -2993,24 +3040,28 @@ int perf_output_begin(struct perf_output_handle *handle,  		 */  		tail = ACCESS_ONCE(data->user_page->data_tail);  		smp_rmb(); -		offset = head = atomic_long_read(&data->head); +		offset = head = local_read(&data->head);  		head += size;  		if (unlikely(!perf_output_space(data, tail, offset, head)))  			goto fail; -	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset); +	} while (local_cmpxchg(&data->head, offset, head) != offset); -	handle->offset	= offset; -	handle->head	= head; +	if (head - local_read(&data->wakeup) > data->watermark) +		local_add(data->watermark, &data->wakeup); -	if (head - tail > data->watermark) -		atomic_set(&data->wakeup, 1); +	handle->page = offset >> (PAGE_SHIFT + page_order(data)); +	handle->page &= data->nr_pages - 1; +	handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); +	handle->addr = data->data_pages[handle->page]; +	handle->addr += handle->size; +	handle->size = (PAGE_SIZE << page_order(data)) - handle->size;  	if (have_lost) {  		lost_event.header.type = PERF_RECORD_LOST;  		lost_event.header.misc = 0;  		lost_event.header.size = sizeof(lost_event);  		lost_event.id          = event->id; -		lost_event.lost        = atomic_xchg(&data->lost, 0); +		lost_event.lost        = local_xchg(&data->lost, 0);  		perf_output_put(handle, lost_event);  	} @@ -3018,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,  	return 
0;  fail: -	atomic_inc(&data->lost); -	perf_output_unlock(handle); +	local_inc(&data->lost); +	perf_output_put_handle(handle);  out:  	rcu_read_unlock(); @@ -3034,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)  	int wakeup_events = event->attr.wakeup_events;  	if (handle->sample && wakeup_events) { -		int events = atomic_inc_return(&data->events); +		int events = local_inc_return(&data->events);  		if (events >= wakeup_events) { -			atomic_sub(wakeup_events, &data->events); -			atomic_set(&data->wakeup, 1); +			local_sub(wakeup_events, &data->events); +			local_inc(&data->wakeup);  		}  	} -	perf_output_unlock(handle); +	perf_output_put_handle(handle);  	rcu_read_unlock();  } @@ -3377,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,  {  	struct perf_output_handle handle;  	struct task_struct *task = task_event->task; -	unsigned long flags;  	int size, ret; -	/* -	 * If this CPU attempts to acquire an rq lock held by a CPU spinning -	 * in perf_output_lock() from interrupt context, it's game over. -	 */ -	local_irq_save(flags); -  	size  = task_event->event_id.header.size;  	ret = perf_output_begin(&handle, event, size, 0, 0); -	if (ret) { -		local_irq_restore(flags); +	if (ret)  		return; -	}  	task_event->event_id.pid = perf_event_pid(event, task);  	task_event->event_id.ppid = perf_event_pid(event, current); @@ -3403,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,  	perf_output_put(&handle, task_event->event_id);  	perf_output_end(&handle); -	local_irq_restore(flags);  }  static int perf_event_task_match(struct perf_event *event) @@ -3743,7 +3784,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)  		.event_id  = {  			.header = {  				.type = PERF_RECORD_MMAP, -				.misc = 0, +				.misc = PERF_RECORD_MISC_USER,  				/* .size */  			},  			/* .pid */ @@ -3961,39 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,  	perf_swevent_overflow(event, 0, nmi, data, regs);  } -static int perf_swevent_is_counting(struct perf_event *event) -{ -	/* -	 * The event is active, we're good! -	 */ -	if (event->state == PERF_EVENT_STATE_ACTIVE) -		return 1; - -	/* -	 * The event is off/error, not counting. -	 */ -	if (event->state != PERF_EVENT_STATE_INACTIVE) -		return 0; - -	/* -	 * The event is inactive, if the context is active -	 * we're part of a group that didn't make it on the 'pmu', -	 * not counting. -	 */ -	if (event->ctx->is_active) -		return 0; - -	/* -	 * We're inactive and the context is too, this means the -	 * task is scheduled out, we're counting events that happen -	 * to us, like migration events. 
-	 */ -	return 1; -} - -static int perf_tp_event_match(struct perf_event *event, -				struct perf_sample_data *data); -  static int perf_exclude_event(struct perf_event *event,  			      struct pt_regs *regs)  { @@ -4014,12 +4022,6 @@ static int perf_swevent_match(struct perf_event *event,  				struct perf_sample_data *data,  				struct pt_regs *regs)  { -	if (event->cpu != -1 && event->cpu != smp_processor_id()) -		return 0; - -	if (!perf_swevent_is_counting(event)) -		return 0; -  	if (event->attr.type != type)  		return 0; @@ -4029,30 +4031,88 @@ static int perf_swevent_match(struct perf_event *event,  	if (perf_exclude_event(event, regs))  		return 0; -	if (event->attr.type == PERF_TYPE_TRACEPOINT && -	    !perf_tp_event_match(event, data)) -		return 0; -  	return 1;  } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, -				     enum perf_type_id type, -				     u32 event_id, u64 nr, int nmi, -				     struct perf_sample_data *data, -				     struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id)  { +	u64 val = event_id | (type << 32); + +	return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ +	u64 hash = swevent_hash(type, event_id); + +	return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ +	struct swevent_hlist *hlist; + +	hlist = rcu_dereference(ctx->swevent_hlist); +	if (!hlist) +		return NULL; + +	return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +{ +	struct swevent_hlist *hlist; +	u32 event_id = event->attr.config; +	u64 type = event->attr.type; + +	/* +	 * Event scheduling is always serialized against hlist allocation +	 * and release. Which makes the protected version suitable here. +	 * The context lock guarantees that. 
+	 */ +	hlist = rcu_dereference_protected(ctx->swevent_hlist, +					  lockdep_is_held(&event->ctx->lock)); +	if (!hlist) +		return NULL; + +	return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, +				    u64 nr, int nmi, +				    struct perf_sample_data *data, +				    struct pt_regs *regs) +{ +	struct perf_cpu_context *cpuctx;  	struct perf_event *event; +	struct hlist_node *node; +	struct hlist_head *head; -	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { +	cpuctx = &__get_cpu_var(perf_cpu_context); + +	rcu_read_lock(); + +	head = find_swevent_head_rcu(cpuctx, type, event_id); + +	if (!head) +		goto end; + +	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {  		if (perf_swevent_match(event, type, event_id, data, regs))  			perf_swevent_add(event, nr, nmi, data, regs);  	} +end: +	rcu_read_unlock();  }  int perf_swevent_get_recursion_context(void)  { -	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);  	int rctx;  	if (in_nmi()) @@ -4064,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)  	else  		rctx = 0; -	if (cpuctx->recursion[rctx]) { -		put_cpu_var(perf_cpu_context); +	if (cpuctx->recursion[rctx])  		return -1; -	}  	cpuctx->recursion[rctx]++;  	barrier(); @@ -4081,31 +4139,9 @@ void perf_swevent_put_recursion_context(int rctx)  	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);  	barrier();  	cpuctx->recursion[rctx]--; -	put_cpu_var(perf_cpu_context);  }  EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, -				    u64 nr, int nmi, -				    struct perf_sample_data *data, -				    struct pt_regs *regs) -{ -	struct perf_cpu_context *cpuctx; -	struct perf_event_context *ctx; - -	cpuctx = &__get_cpu_var(perf_cpu_context); -	rcu_read_lock(); -	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, -				 nr, nmi, data, regs); -	/* -	 * doesn't really matter which of the child contexts the -	 * events ends up in. 
-	 */ -	ctx = rcu_dereference(current->perf_event_ctxp); -	if (ctx) -		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); -	rcu_read_unlock(); -}  void __perf_sw_event(u32 event_id, u64 nr, int nmi,  			    struct pt_regs *regs, u64 addr) @@ -4113,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,  	struct perf_sample_data data;  	int rctx; +	preempt_disable_notrace();  	rctx = perf_swevent_get_recursion_context();  	if (rctx < 0)  		return; @@ -4122,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,  	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);  	perf_swevent_put_recursion_context(rctx); +	preempt_enable_notrace();  }  static void perf_swevent_read(struct perf_event *event) @@ -4131,16 +4169,28 @@ static void perf_swevent_read(struct perf_event *event)  static int perf_swevent_enable(struct perf_event *event)  {  	struct hw_perf_event *hwc = &event->hw; +	struct perf_cpu_context *cpuctx; +	struct hlist_head *head; + +	cpuctx = &__get_cpu_var(perf_cpu_context);  	if (hwc->sample_period) {  		hwc->last_period = hwc->sample_period;  		perf_swevent_set_period(event);  	} + +	head = find_swevent_head(cpuctx, event); +	if (WARN_ON_ONCE(!head)) +		return -EINVAL; + +	hlist_add_head_rcu(&event->hlist_entry, head); +  	return 0;  }  static void perf_swevent_disable(struct perf_event *event)  { +	hlist_del_rcu(&event->hlist_entry);  }  static const struct pmu perf_ops_generic = { @@ -4168,15 +4218,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)  	perf_sample_data_init(&data, 0);  	data.period = event->hw.last_period;  	regs = get_irq_regs(); -	/* -	 * In case we exclude kernel IPs or are somehow not in interrupt -	 * context, provide the next best thing, the user IP. 
-	 */ -	if ((event->attr.exclude_kernel || !regs) && -			!event->attr.exclude_user) -		regs = task_pt_regs(current); -	if (regs) { +	if (regs && !perf_exclude_event(event, regs)) {  		if (!(event->attr.exclude_idle && current->pid == 0))  			if (perf_event_overflow(event, 0, &data, regs))  				ret = HRTIMER_NORESTART; @@ -4324,27 +4367,122 @@ static const struct pmu perf_ops_task_clock = {  	.read		= task_clock_perf_event_read,  }; -#ifdef CONFIG_EVENT_TRACING +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct perf_cpu_context *cpuctx) +{ +	return rcu_dereference_protected(cpuctx->swevent_hlist, +					 lockdep_is_held(&cpuctx->hlist_mutex)); +} -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, -		   int entry_size, struct pt_regs *regs) +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)  { -	struct perf_sample_data data; -	struct perf_raw_record raw = { -		.size = entry_size, -		.data = record, -	}; +	struct swevent_hlist *hlist; -	perf_sample_data_init(&data, addr); -	data.raw = &raw; +	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); +	kfree(hlist); +} -	/* Trace events already protected against recursion */ -	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, -			 &data, regs); +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ +	struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); + +	if (!hlist) +		return; + +	rcu_assign_pointer(cpuctx->swevent_hlist, NULL); +	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);  } -EXPORT_SYMBOL_GPL(perf_tp_event); -static int perf_tp_event_match(struct perf_event *event, +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + +	mutex_lock(&cpuctx->hlist_mutex); + +	if (!--cpuctx->hlist_refcount) +		swevent_hlist_release(cpuctx); + +	mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ +	int cpu; + +	if (event->cpu != -1) { +		swevent_hlist_put_cpu(event, event->cpu); +		return; +	} + +	for_each_possible_cpu(cpu) +		swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	int err = 0; + +	mutex_lock(&cpuctx->hlist_mutex); + +	if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { +		struct swevent_hlist *hlist; + +		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); +		if (!hlist) { +			err = -ENOMEM; +			goto exit; +		} +		rcu_assign_pointer(cpuctx->swevent_hlist, hlist); +	} +	cpuctx->hlist_refcount++; + exit: +	mutex_unlock(&cpuctx->hlist_mutex); + +	return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ +	int err; +	int cpu, failed_cpu; + +	if (event->cpu != -1) +		return swevent_hlist_get_cpu(event, event->cpu); + +	get_online_cpus(); +	for_each_possible_cpu(cpu) { +		err = swevent_hlist_get_cpu(event, cpu); +		if (err) { +			failed_cpu = cpu; +			goto fail; +		} +	} +	put_online_cpus(); + +	return 0; + fail: +	for_each_possible_cpu(cpu) { +		if (cpu == failed_cpu) +			break; +		swevent_hlist_put_cpu(event, cpu); +	} + +	put_online_cpus(); +	return err; +} + +#ifdef CONFIG_EVENT_TRACING + +static const struct pmu perf_ops_tracepoint = { +	.enable		= perf_trace_enable, +	.disable	= perf_trace_disable, +	.read		= perf_swevent_read, +	.unthrottle	= perf_swevent_unthrottle, +}; + +static int perf_tp_filter_match(struct perf_event 
*event,  				struct perf_sample_data *data)  {  	void *record = data->raw->data; @@ -4354,13 +4492,55 @@ static int perf_tp_event_match(struct perf_event *event,  	return 0;  } +static int perf_tp_event_match(struct perf_event *event, +				struct perf_sample_data *data, +				struct pt_regs *regs) +{ +	/* +	 * All tracepoints are from kernel-space. +	 */ +	if (event->attr.exclude_kernel) +		return 0; + +	if (!perf_tp_filter_match(event, data)) +		return 0; + +	return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, +		   struct pt_regs *regs, struct hlist_head *head) +{ +	struct perf_sample_data data; +	struct perf_event *event; +	struct hlist_node *node; + +	struct perf_raw_record raw = { +		.size = entry_size, +		.data = record, +	}; + +	perf_sample_data_init(&data, addr); +	data.raw = &raw; + +	rcu_read_lock(); +	hlist_for_each_entry_rcu(event, node, head, hlist_entry) { +		if (perf_tp_event_match(event, &data, regs)) +			perf_swevent_add(event, count, 1, &data, regs); +	} +	rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(perf_tp_event); +  static void tp_perf_event_destroy(struct perf_event *event)  { -	perf_trace_disable(event->attr.config); +	perf_trace_destroy(event);  }  static const struct pmu *tp_perf_event_init(struct perf_event *event)  { +	int err; +  	/*  	 * Raw tracepoint data is a severe data leak, only allow root to  	 * have these. @@ -4370,12 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)  			!capable(CAP_SYS_ADMIN))  		return ERR_PTR(-EPERM); -	if (perf_trace_enable(event->attr.config)) +	err = perf_trace_init(event); +	if (err)  		return NULL;  	event->destroy = tp_perf_event_destroy; -	return &perf_ops_generic; +	return &perf_ops_tracepoint;  }  static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4403,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)  #else -static int perf_tp_event_match(struct perf_event *event, -				struct perf_sample_data *data) -{ -	return 1; -} -  static const struct pmu *tp_perf_event_init(struct perf_event *event)  {  	return NULL; @@ -4474,6 +4649,7 @@ static void sw_perf_event_destroy(struct perf_event *event)  	WARN_ON(event->parent);  	atomic_dec(&perf_swevent_enabled[event_id]); +	swevent_hlist_put(event);  }  static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4512,6 +4688,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)  	case PERF_COUNT_SW_ALIGNMENT_FAULTS:  	case PERF_COUNT_SW_EMULATION_FAULTS:  		if (!event->parent) { +			int err; + +			err = swevent_hlist_get(event); +			if (err) +				return ERR_PTR(err); +  			atomic_inc(&perf_swevent_enabled[event_id]);  			event->destroy = sw_perf_event_destroy;  		} @@ -4738,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)  	int fput_needed = 0;  	int ret = -EINVAL; +	/* +	 * Don't allow output of inherited per-task events. This would +	 * create performance issues due to cross cpu access. +	 */ +	if (event->cpu == -1 && event->attr.inherit) +		return -EINVAL; +  	if (!output_fd)  		goto set; @@ -4758,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)  	if (event->data)  		goto out; +	/* +	 * Don't allow cross-cpu buffers +	 */ +	if (output_event->cpu != event->cpu) +		goto out; + +	/* +	 * If its not a per-cpu buffer, it must be the same task. 
+	 */ +	if (output_event->cpu == -1 && output_event->ctx != event->ctx) +		goto out; +  	atomic_long_inc(&output_file->f_count);  set: @@ -4798,8 +4999,8 @@ SYSCALL_DEFINE5(perf_event_open,  	struct perf_event_context *ctx;  	struct file *event_file = NULL;  	struct file *group_file = NULL; +	int event_fd;  	int fput_needed = 0; -	int fput_needed2 = 0;  	int err;  	/* for future expandability... */ @@ -4820,12 +5021,18 @@ SYSCALL_DEFINE5(perf_event_open,  			return -EINVAL;  	} +	event_fd = get_unused_fd_flags(O_RDWR); +	if (event_fd < 0) +		return event_fd; +  	/*  	 * Get the target context (task or percpu):  	 */  	ctx = find_get_context(pid, cpu); -	if (IS_ERR(ctx)) -		return PTR_ERR(ctx); +	if (IS_ERR(ctx)) { +		err = PTR_ERR(ctx); +		goto err_fd; +	}  	/*  	 * Look up the group leader (we will attach this event to it): @@ -4865,13 +5072,11 @@ SYSCALL_DEFINE5(perf_event_open,  	if (IS_ERR(event))  		goto err_put_context; -	err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); -	if (err < 0) -		goto err_free_put_context; - -	event_file = fget_light(err, &fput_needed2); -	if (!event_file) +	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); +	if (IS_ERR(event_file)) { +		err = PTR_ERR(event_file);  		goto err_free_put_context; +	}  	if (flags & PERF_FLAG_FD_OUTPUT) {  		err = perf_event_set_output(event, group_fd); @@ -4892,19 +5097,19 @@ SYSCALL_DEFINE5(perf_event_open,  	list_add_tail(&event->owner_entry, ¤t->perf_event_list);  	mutex_unlock(¤t->perf_event_mutex); -err_fput_free_put_context: -	fput_light(event_file, fput_needed2); +	fput_light(group_file, fput_needed); +	fd_install(event_fd, event_file); +	return event_fd; +err_fput_free_put_context: +	fput(event_file);  err_free_put_context: -	if (err < 0) -		free_event(event); - +	free_event(event);  err_put_context: -	if (err < 0) -		put_ctx(ctx); -  	fput_light(group_file, fput_needed); - +	put_ctx(ctx); +err_fd: +	put_unused_fd(event_fd);  	return err;  } @@ -5176,7 +5381,7 @@ void perf_event_exit_task(struct task_struct *child)  	 *  	 * But since its the parent context it won't be the same instance.  	 */ -	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); +	mutex_lock(&child_ctx->mutex);  again:  	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, @@ -5384,6 +5589,7 @@ static void __init perf_event_init_all_cpus(void)  	for_each_possible_cpu(cpu) {  		cpuctx = &per_cpu(perf_cpu_context, cpu); +		mutex_init(&cpuctx->hlist_mutex);  		__perf_event_init_context(&cpuctx->ctx, NULL);  	}  } @@ -5397,6 +5603,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)  	spin_lock(&perf_resource_lock);  	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;  	spin_unlock(&perf_resource_lock); + +	mutex_lock(&cpuctx->hlist_mutex); +	if (cpuctx->hlist_refcount > 0) { +		struct swevent_hlist *hlist; + +		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); +		WARN_ON_ONCE(!hlist); +		rcu_assign_pointer(cpuctx->swevent_hlist, hlist); +	} +	mutex_unlock(&cpuctx->hlist_mutex);  }  #ifdef CONFIG_HOTPLUG_CPU @@ -5416,6 +5632,10 @@ static void perf_event_exit_cpu(int cpu)  	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);  	struct perf_event_context *ctx = &cpuctx->ctx; +	mutex_lock(&cpuctx->hlist_mutex); +	swevent_hlist_release(cpuctx); +	mutex_unlock(&cpuctx->hlist_mutex); +  	mutex_lock(&ctx->mutex);  	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);  	mutex_unlock(&ctx->mutex);  |
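
To illustrate the transactional group-scheduling interface this patch introduces (the pmu->start_txn()/commit_txn()/cancel_txn() callbacks that replace the weak hw_perf_group_sched_in() hook), here is a minimal userspace model of the intended all-or-nothing control flow. It is a sketch only: struct model_pmu, NR_SLOTS and the counter-slot bookkeeping are invented for the example, and just the transaction pattern mirrors group_sched_in() in the patch.

/*
 * Userspace model of the start_txn/commit_txn/cancel_txn pattern used by
 * group_sched_in() in this patch.  "model_pmu" is a made-up PMU with a
 * fixed number of counters; only the control flow follows the kernel side.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_SLOTS 2

struct model_pmu {
	int  used;        /* counters handed out so far          */
	int  txn_start;   /* snapshot taken at start_txn()       */
	bool in_txn;
};

static void start_txn(struct model_pmu *pmu)
{
	pmu->in_txn = true;
	pmu->txn_start = pmu->used;     /* remember state for rollback */
}

static int event_add(struct model_pmu *pmu)
{
	if (!pmu->in_txn && pmu->used >= NR_SLOTS)
		return -1;              /* no transaction: check right away */

	pmu->used++;                    /* in a transaction: accept optimistically */
	return 0;
}

static int commit_txn(struct model_pmu *pmu)
{
	pmu->in_txn = false;
	return pmu->used <= NR_SLOTS ? 0 : -1;  /* does the whole group fit? */
}

static void cancel_txn(struct model_pmu *pmu)
{
	pmu->used = pmu->txn_start;     /* undo the speculative adds */
	pmu->in_txn = false;
}

/* Rough analogue of group_sched_in(): schedule a group as one unit or not at all. */
static int group_sched_in(struct model_pmu *pmu, int nr_events)
{
	int i;

	start_txn(pmu);

	for (i = 0; i < nr_events; i++) {
		if (event_add(pmu)) {   /* mirrors the group_error path */
			cancel_txn(pmu);
			return -1;
		}
	}

	if (commit_txn(pmu) == 0)       /* whole group fits: keep it */
		return 0;

	cancel_txn(pmu);                /* roll back the speculative adds */
	return -1;
}

int main(void)
{
	struct model_pmu pmu = { 0 };

	printf("group of 2: %s\n", group_sched_in(&pmu, 2) ? "rejected" : "scheduled");
	printf("group of 3: %s\n", group_sched_in(&pmu, 3) ? "rejected" : "scheduled");
	return 0;
}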
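
The patch also moves software events from a flat per-context list onto per-CPU hash lists, bucketing by hash_64(event_id | (type << 32), SWEVENT_HLIST_BITS). The standalone sketch below shows that bucket selection; the golden-ratio multiplicative hash and the 8-bit bucket count only approximate the kernel's hash_64() and SWEVENT_HLIST_BITS and are assumptions made for illustration.

/*
 * Standalone sketch of the swevent hashing introduced by this patch:
 * events are bucketed by hash_64(event_id | (type << 32), bits).
 * The multiplicative hash below approximates the kernel helper.
 */
#include <stdio.h>
#include <stdint.h>

#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)

/* 64-bit golden-ratio multiplicative hash, keeping the top "bits" bits. */
static uint64_t hash_64(uint64_t val, unsigned int bits)
{
	return (val * 0x9E37FFFFFFFC0001ULL) >> (64 - bits);
}

static unsigned int swevent_hash(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);

	return (unsigned int)hash_64(val, SWEVENT_HLIST_BITS);
}

int main(void)
{
	/* PERF_TYPE_SOFTWARE is 1; a few event ids picked for illustration. */
	uint32_t id;

	for (id = 0; id < 4; id++)
		printf("type=1 id=%u -> bucket %u of %d\n",
		       id, swevent_hash(1, id), SWEVENT_HLIST_SIZE);
	return 0;
}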