diff options
Diffstat (limited to 'kernel/trace')
| -rw-r--r-- | kernel/trace/Kconfig | 3 | ||||
| -rw-r--r-- | kernel/trace/blktrace.c | 30 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 27 | ||||
| -rw-r--r-- | kernel/trace/ring_buffer.c | 79 | ||||
| -rw-r--r-- | kernel/trace/trace.c | 54 | ||||
| -rw-r--r-- | kernel/trace/trace_events.c | 16 | ||||
| -rw-r--r-- | kernel/trace/trace_stack.c | 4 | 
7 files changed, 127 insertions, 86 deletions
| diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..f54dc62b599c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS  	bool "Enable trace events for preempt and irq disable/enable"  	select TRACE_IRQFLAGS  	depends on DEBUG_PREEMPT || !PROVE_LOCKING +	depends on TRACING  	default n  	help  	  Enable tracing of disable and enable events for preemption and irqs. @@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES  	  on if you need to profile the system's use of these macros.  config PROFILE_ALL_BRANCHES -	bool "Profile all if conditionals" +	bool "Profile all if conditionals" if !FORTIFY_SOURCE  	select TRACE_BRANCH_PROFILING  	help  	  This tracer profiles all branch conditions. Every if () diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 206e0e2ace53..987d9a9ae283 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,  		return ret;  	if (copy_to_user(arg, &buts, sizeof(buts))) { -		blk_trace_remove(q); +		__blk_trace_remove(q);  		return -EFAULT;  	}  	return 0; @@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,  		return ret;  	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { -		blk_trace_remove(q); +		__blk_trace_remove(q);  		return -EFAULT;  	} @@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,   *   **/  static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, -			      u32 what, int error, union kernfs_node_id *cgid) +			      u32 what, int error)  {  	struct blk_trace *bt = q->blk_trace; @@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,  		return;  	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, -			bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); +			bio_op(bio), 
bio->bi_opf, what, error, 0, NULL, +			blk_trace_bio_get_cgid(q, bio));  }  static void blk_add_trace_bio_bounce(void *ignore,  				     struct request_queue *q, struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, -			  blk_trace_bio_get_cgid(q, bio)); +	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);  }  static void blk_add_trace_bio_complete(void *ignore,  				       struct request_queue *q, struct bio *bio,  				       int error)  { -	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, -			  blk_trace_bio_get_cgid(q, bio)); +	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);  }  static void blk_add_trace_bio_backmerge(void *ignore, @@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,  					struct request *rq,  					struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, -			 blk_trace_bio_get_cgid(q, bio)); +	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);  }  static void blk_add_trace_bio_frontmerge(void *ignore, @@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore,  					 struct request *rq,  					 struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, -			  blk_trace_bio_get_cgid(q, bio)); +	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);  }  static void blk_add_trace_bio_queue(void *ignore,  				    struct request_queue *q, struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, -			  blk_trace_bio_get_cgid(q, bio)); +	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);  }  static void blk_add_trace_getrq(void *ignore, @@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore,  				struct bio *bio, int rw)  {  	if (bio) -		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, -				  blk_trace_bio_get_cgid(q, bio)); +		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);  	else {  		struct blk_trace *bt = q->blk_trace; @@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore,  				  struct bio *bio, int rw)  {  	if (bio) -		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, -				  
blk_trace_bio_get_cgid(q, bio)); +		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);  	else {  		struct blk_trace *bt = q->blk_trace; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 27d1f4ffa3de..40207c2a4113 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {  	.arg4_type	= ARG_CONST_SIZE,  }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);  static __always_inline u64  __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, -			u64 flags, struct perf_raw_record *raw) +			u64 flags, struct perf_sample_data *sd)  {  	struct bpf_array *array = container_of(map, struct bpf_array, map); -	struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);  	unsigned int cpu = smp_processor_id();  	u64 index = flags & BPF_F_INDEX_MASK;  	struct bpf_event_entry *ee; @@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,  	if (unlikely(event->oncpu != cpu))  		return -EOPNOTSUPP; -	perf_sample_data_init(sd, 0, 0); -	sd->raw = raw;  	perf_event_output(event, sd, regs);  	return 0;  } @@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,  BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,  	   u64, flags, void *, data, u64, size)  { +	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);  	struct perf_raw_record raw = {  		.frag = {  			.size = size, @@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,  	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))  		return -EINVAL; -	return __bpf_perf_event_output(regs, map, flags, &raw); +	perf_sample_data_init(sd, 0, 0); +	sd->raw = &raw; + +	return __bpf_perf_event_output(regs, map, flags, sd);  }  static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +408,12 @@ static 
const struct bpf_func_proto bpf_perf_event_output_proto = {  };  static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,  		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)  { +	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);  	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);  	struct perf_raw_frag frag = {  		.copy		= ctx_copy, @@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,  	};  	perf_fetch_caller_regs(regs); +	perf_sample_data_init(sd, 0, 0); +	sd->raw = &raw; -	return __bpf_perf_event_output(regs, map, flags, &raw); +	return __bpf_perf_event_output(regs, map, flags, sd);  }  BPF_CALL_0(bpf_get_current_task) @@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = {  static DEFINE_MUTEX(bpf_event_mutex); +#define BPF_TRACE_MAX_PROGS 64 +  int perf_event_attach_bpf_prog(struct perf_event *event,  			       struct bpf_prog *prog)  { @@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event,  		goto unlock;  	old_array = event->tp_event->prog_array; +	if (old_array && +	    bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { +		ret = -E2BIG; +		goto unlock; +	} +  	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);  	if (ret < 0)  		goto unlock; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..5af2842dea96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);  /* Missed count stored at end */  #define RB_MISSED_STORED	(1 << 30) +#define RB_MISSED_FLAGS		(RB_MISSED_EVENTS|RB_MISSED_STORED) +  struct buffer_data_page {  	u64		 time_stamp;	/* page time stamp */  	local_t		 commit;	/* write committed index */ @@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)   */  
size_t ring_buffer_page_len(void *page)  { -	return local_read(&((struct buffer_data_page *)page)->commit) +	struct buffer_data_page *bpage = page; + +	return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)  		+ BUF_PAGE_HDR_SIZE;  } @@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)  }  EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ -	return bpage->data + index; -} -  static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)  {  	return bpage->page->data + index; @@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)   * The lock and unlock are done within a preempt disable section.   * The current_context per_cpu variable can only be modified   * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing), + * we use a bitmask trick. + * + *  bit 0 =  NMI context + *  bit 1 =  IRQ context + *  bit 2 =  SoftIRQ context + *  bit 3 =  normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself.   * - *  Normal context. 
- *  SoftIRQ context - *  IRQ context - *  NMI context + * (binary) + *  101 - 1 = 100 + *  101 & 100 = 100 (clearing bit zero)   * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + *  1010 - 1 = 1001 + *  1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context.   */  static __always_inline int  trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)  { -	if (cpu_buffer->current_context >= 4) +	unsigned int val = cpu_buffer->current_context; +	unsigned long pc = preempt_count(); +	int bit; + +	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) +		bit = RB_CTX_NORMAL; +	else +		bit = pc & NMI_MASK ? RB_CTX_NMI : +			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; + +	if (unlikely(val & (1 << bit)))  		return 1; -	cpu_buffer->current_context++; -	/* Interrupts must see this update */ -	barrier(); +	val |= (1 << bit); +	cpu_buffer->current_context = val;  	return 0;  } @@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)  static __always_inline void  trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)  { -	/* Don't let the dec leak out */ -	barrier(); -	cpu_buffer->current_context--; +	cpu_buffer->current_context &= cpu_buffer->current_context - 1;  }  /** @@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)  {  	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];  	struct buffer_data_page *bpage = data; +	struct page *page = virt_to_page(bpage);  	unsigned long flags; +	/* If the page is still in use someplace else, we can't reuse it */ +	if (page_ref_count(page) > 1) +		goto out; +  	local_irq_save(flags);  	arch_spin_lock(&cpu_buffer->lock); @@ 
-4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)  	arch_spin_unlock(&cpu_buffer->lock);  	local_irq_restore(flags); + out:  	free_page((unsigned long)bpage);  }  EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..2a8d8a294345 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct  }  /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list   * @pid_list: The list to modify   * @self: The current task for fork or NULL for exit   * @task: The task to add or remove @@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)  }  /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer.   *   * This causes a swap between the snapshot buffer and the current live   * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)  EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);  /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.   * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the   * snapshot buffer if it isn't already allocated. Use this only   * where it is safe to sleep, as the allocation may sleep.   * @@ -1303,7 +1303,7 @@ unsigned long __read_mostly	tracing_thresh;  /*   * Copy the new maximum trace into the separate maximum-trace   * structure. 
(this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency)   */  static void  __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export,  	entry = ring_buffer_event_data(event);  	size = ring_buffer_event_length(event); -	export->write(entry, size); +	export->write(export, entry, size);  }  static DEFINE_MUTEX(ftrace_export_lock); @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = {  	.llseek		= seq_lseek,  }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; -  static ssize_t  tracing_cpumask_read(struct file *filp, char __user *ubuf,  		     size_t count, loff_t *ppos)  {  	struct trace_array *tr = file_inode(filp)->i_private; +	char *mask_str;  	int len; -	mutex_lock(&tracing_cpumask_update_lock); +	len = snprintf(NULL, 0, "%*pb\n", +		       cpumask_pr_args(tr->tracing_cpumask)) + 1; +	mask_str = kmalloc(len, GFP_KERNEL); +	if (!mask_str) +		return -ENOMEM; -	len = snprintf(mask_str, count, "%*pb\n", +	len = snprintf(mask_str, len, "%*pb\n",  		       cpumask_pr_args(tr->tracing_cpumask));  	if (len >= count) {  		count = -EINVAL;  		goto out_err;  	} -	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); +	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);  out_err: -	mutex_unlock(&tracing_cpumask_update_lock); +	kfree(mask_str);  	return count;  } @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  	if (err)  		goto err_unlock; -	
mutex_lock(&tracing_cpumask_update_lock); -  	local_irq_disable();  	arch_spin_lock(&tr->max_lock);  	for_each_tracing_cpu(cpu) { @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  	local_irq_enable();  	cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - -	mutex_unlock(&tracing_cpumask_update_lock);  	free_cpumask_var(tracing_cpumask_new);  	return count; @@ -6780,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		.spd_release	= buffer_spd_release,  	};  	struct buffer_ref *ref; -	int entries, size, i; +	int entries, i;  	ssize_t ret = 0;  #ifdef CONFIG_TRACER_MAX_TRACE @@ -6834,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			break;  		} -		/* -		 * zero out any left over data, this is going to -		 * user land. -		 */ -		size = ring_buffer_page_len(ref->page); -		if (size < PAGE_SIZE) -			memset(ref->page + size, 0, PAGE_SIZE - size); -  		page = virt_to_page(ref->page);  		spd.pages[i] = page; @@ -7599,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size  	buf->data = alloc_percpu(struct trace_array_cpu);  	if (!buf->data) {  		ring_buffer_free(buf->buffer); +		buf->buffer = NULL;  		return -ENOMEM;  	} @@ -7622,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)  				    allocate_snapshot ? 
size : 1);  	if (WARN_ON(ret)) {  		ring_buffer_free(tr->trace_buffer.buffer); +		tr->trace_buffer.buffer = NULL;  		free_percpu(tr->trace_buffer.data); +		tr->trace_buffer.data = NULL;  		return -ENOMEM;  	}  	tr->allocated_snapshot = allocate_snapshot; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..1b87157edbff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)  {  	struct trace_event_call *call, *p;  	const char *last_system = NULL; +	bool first = false;  	int last_i;  	int i; @@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)  	list_for_each_entry_safe(call, p, &ftrace_events, list) {  		/* events are usually grouped together with systems */  		if (!last_system || call->class->system != last_system) { +			first = true;  			last_i = 0;  			last_system = call->class->system;  		} +		/* +		 * Since calls are grouped by systems, the likelihood that the +		 * next call in the iteration belongs to the same system as the +		 * previous call is high. As an optimization, we skip searching +		 * for a map[] that matches the call's system if the last call +		 * was from the same system. That's what last_i is for. If the +		 * call has the same system as the previous call, then last_i +		 * will be the index of the first map[] that has a matching +		 * system. 
+		 */  		for (i = last_i; i < len; i++) {  			if (call->class->system == map[i]->system) {  				/* Save the first system if need be */ -				if (!last_i) +				if (first) {  					last_i = i; +					first = false; +				}  				update_event_printk(call, map[i]);  			}  		} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,  	if (__this_cpu_read(disable_stack_tracer) != 1)  		goto out; +	/* If rcu is not watching, then save stack trace can fail */ +	if (!rcu_is_watching()) +		goto out; +  	ip += MCOUNT_INSN_SIZE;  	check_stack(ip, &stack); |