diff options
Diffstat (limited to 'kernel/trace')
| -rw-r--r-- | kernel/trace/Kconfig | 40 | ||||
| -rw-r--r-- | kernel/trace/Makefile | 5 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 160 | ||||
| -rw-r--r-- | kernel/trace/ftrace.c | 30 | ||||
| -rw-r--r-- | kernel/trace/trace.c | 61 | ||||
| -rw-r--r-- | kernel/trace/trace.h | 5 | ||||
| -rw-r--r-- | kernel/trace/trace_entries.h | 27 | ||||
| -rw-r--r-- | kernel/trace/trace_events_trigger.c | 1 | ||||
| -rw-r--r-- | kernel/trace/trace_functions_graph.c | 73 | ||||
| -rw-r--r-- | kernel/trace/trace_hwlat.c | 633 | ||||
| -rw-r--r-- | kernel/trace/trace_kprobe.c | 4 | ||||
| -rw-r--r-- | kernel/trace/trace_output.c | 66 | ||||
| -rw-r--r-- | kernel/trace/trace_probe.c | 30 | ||||
| -rw-r--r-- | kernel/trace/trace_probe.h | 11 | ||||
| -rw-r--r-- | kernel/trace/trace_syscalls.c | 6 | ||||
| -rw-r--r-- | kernel/trace/trace_uprobe.c | 8 | 
16 files changed, 1073 insertions, 87 deletions
| diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..2a96b063d659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER  	help  	  See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_GRAPH_FP_TEST -	bool -	help -	  See Documentation/trace/ftrace-design.txt -  config HAVE_DYNAMIC_FTRACE  	bool  	help @@ -221,6 +216,41 @@ config SCHED_TRACER  	  This tracer tracks the latency of the highest priority task  	  to be scheduled in, starting from the point it has woken up. +config HWLAT_TRACER +	bool "Tracer to detect hardware latencies (like SMIs)" +	select GENERIC_TRACER +	help +	 This tracer, when enabled will create one or more kernel threads, +	 depening on what the cpumask file is set to, which each thread +	 spinning in a loop looking for interruptions caused by +	 something other than the kernel. For example, if a +	 System Management Interrupt (SMI) takes a noticeable amount of +	 time, this tracer will detect it. This is useful for testing +	 if a system is reliable for Real Time tasks. + +	 Some files are created in the tracing directory when this +	 is enabled: + +	   hwlat_detector/width   - time in usecs for how long to spin for +	   hwlat_detector/window  - time in usecs between the start of each +				     iteration + +	 A kernel thread is created that will spin with interrupts disabled +	 for "width" microseconds in every "widow" cycle. It will not spin +	 for "window - width" microseconds, where the system can +	 continue to operate. + +	 The output will appear in the trace and trace_pipe files. + +	 When the tracer is not running, it has no affect on the system, +	 but when it is running, it can cause the system to be +	 periodically non responsive. Do not run this tracer on a +	 production system. + +	 To enable this tracer, echo in "hwlat" into the current_tracer +	 file. Every time a latency is greater than tracing_thresh, it will +	 be recorded into the ring buffer. +  config ENABLE_DEFAULT_TRACERS  	bool "Trace process context switches and events"  	depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..e57980845549 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,8 +1,4 @@ -# We are fully aware of the dangers of __builtin_return_address() -FRAME_CFLAGS := $(call cc-disable-warning,frame-address) -KBUILD_CFLAGS += $(FRAME_CFLAGS) -  # Do not instrument the tracer itself:  ifdef CONFIG_FUNCTION_TRACER @@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o  obj-$(CONFIG_NOP_TRACER) += trace_nop.o  obj-$(CONFIG_STACK_TRACER) += trace_stack.o  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@  /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@  #include <linux/types.h>  #include <linux/slab.h>  #include <linux/bpf.h> +#include <linux/bpf_perf_event.h>  #include <linux/filter.h>  #include <linux/uaccess.h>  #include <linux/ctype.h> @@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)  }  EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)  { -	void *dst = (void *) (long) r1; -	int ret, size = (int) r2; -	void *unsafe_ptr = (void *) (long) r3; +	int ret;  	ret = probe_kernel_read(dst, unsafe_ptr, size);  	if (unlikely(ret < 0)) @@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = {  	.arg3_type	= ARG_ANYTHING,  }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, +	   u32, size)  { -	void *unsafe_ptr = (void *) (long) r1; -	void *src = (void *) (long) r2; -	int size = (int) r3; -  	/*  	 * Ensure we're in user context which is safe for the helper to  	 * run. This helper has no business in a kthread. @@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)   * limited trace_printk()   * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed   */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, +	   u64, arg2, u64, arg3)  { -	char *fmt = (char *) (long) r1;  	bool str_seen = false;  	int mod[3] = {};  	int fmt_cnt = 0; @@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)  				switch (fmt_cnt) {  				case 1: -					unsafe_addr = r3; -					r3 = (long) buf; +					unsafe_addr = arg1; +					arg1 = (long) buf;  					break;  				case 2: -					unsafe_addr = r4; -					r4 = (long) buf; +					unsafe_addr = arg2; +					arg2 = (long) buf;  					break;  				case 3: -					unsafe_addr = r5; -					r5 = (long) buf; +					unsafe_addr = arg3; +					arg3 = (long) buf;  					break;  				}  				buf[0] = 0; @@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)  	}  	return __trace_printk(1/* fake ip will not be printed */, fmt, -			      mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, -			      mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, -			      mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); +			      mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, +			      mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, +			      mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);  }  static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)  	return &bpf_trace_printk_proto;  } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)  { -	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;  	struct bpf_array *array = container_of(map, struct bpf_array, map);  	unsigned int cpu = smp_processor_id();  	u64 index = flags & BPF_F_INDEX_MASK; @@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,  	return 0;  } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, +	   u64, flags, void *, data, u64, size)  { -	struct pt_regs *regs = (struct pt_regs *)(long) r1; -	struct bpf_map *map  = (struct bpf_map *)(long) r2; -	void *data = (void *)(long) r4;  	struct perf_raw_record raw = {  		.frag = {  			.size = size, @@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,  	return __bpf_perf_event_output(regs, map, flags, &raw);  } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task)  {  	return (long) current;  } @@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {  	.ret_type	= RET_INTEGER,  }; +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ +	struct bpf_array *array = container_of(map, struct bpf_array, map); +	struct cgroup *cgrp; + +	if (unlikely(in_interrupt())) +		return -EINVAL; +	if (unlikely(idx >= array->map.max_entries)) +		return -E2BIG; + +	cgrp = READ_ONCE(array->ptrs[idx]); +	if (unlikely(!cgrp)) +		return -EAGAIN; + +	return task_under_cgroup_hierarchy(current, cgrp); +} + +static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { +	.func           = bpf_current_task_under_cgroup, +	.gpl_only       = false, +	.ret_type       = RET_INTEGER, +	.arg1_type      = ARG_CONST_MAP_PTR, +	.arg2_type      = ARG_ANYTHING, +}; +  static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)  {  	switch (func_id) { @@ -407,6 +426,10 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)  		return &bpf_perf_event_read_proto;  	case BPF_FUNC_probe_write_user:  		return bpf_get_probe_write_proto(); +	case BPF_FUNC_current_task_under_cgroup: +		return &bpf_current_task_under_cgroup_proto; +	case BPF_FUNC_get_prandom_u32: +		return &bpf_get_prandom_u32_proto;  	default:  		return NULL;  	} @@ -447,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = {  	.type	= BPF_PROG_TYPE_KPROBE,  }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, +	   u64, flags, void *, data, u64, size)  { +	struct pt_regs *regs = *(struct pt_regs **)tp_buff; +  	/*  	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden  	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it -	 * from there and call the same bpf_perf_event_output() helper +	 * from there and call the same bpf_perf_event_output() helper inline.  	 */ -	u64 ctx = *(long *)(uintptr_t)r1; - -	return bpf_perf_event_output(ctx, r2, index, r4, size); +	return ____bpf_perf_event_output(regs, map, flags, data, size);  }  static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -470,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {  	.arg5_type	= ARG_CONST_STACK_SIZE,  }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, +	   u64, flags)  { -	u64 ctx = *(long *)(uintptr_t)r1; +	struct pt_regs *regs = *(struct pt_regs **)tp_buff; -	return bpf_get_stackid(ctx, r2, r3, r4, r5); +	/* +	 * Same comment as in bpf_perf_event_output_tp(), only that this time +	 * the other helper's function body cannot be inlined due to being +	 * external, thus we need to call raw helper function. +	 */ +	return bpf_get_stackid((unsigned long) regs, (unsigned long) map, +			       flags, 0, 0);  }  static const struct bpf_func_proto bpf_get_stackid_proto_tp = { @@ -520,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = {  	.type	= BPF_PROG_TYPE_TRACEPOINT,  }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, +				    enum bpf_reg_type *reg_type) +{ +	if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) +		return false; +	if (type != BPF_READ) +		return false; +	if (off % size != 0) +		return false; +	if (off == offsetof(struct bpf_perf_event_data, sample_period)) { +		if (size != sizeof(u64)) +			return false; +	} else { +		if (size != sizeof(long)) +			return false; +	} +	return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, +				      int src_reg, int ctx_off, +				      struct bpf_insn *insn_buf, +				      struct bpf_prog *prog) +{ +	struct bpf_insn *insn = insn_buf; + +	switch (ctx_off) { +	case offsetof(struct bpf_perf_event_data, sample_period): +		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, +						       data), dst_reg, src_reg, +				      offsetof(struct bpf_perf_event_data_kern, data)); +		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, +				      offsetof(struct perf_sample_data, period)); +		break; +	default: +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, +						       regs), dst_reg, src_reg, +				      offsetof(struct bpf_perf_event_data_kern, regs)); +		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); +		break; +	} + +	return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { +	.get_func_proto		= tp_prog_func_proto, +	.is_valid_access	= pe_prog_is_valid_access, +	.convert_ctx_access	= pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { +	.ops	= &perf_event_prog_ops, +	.type	= BPF_PROG_TYPE_PERF_EVENT, +}; +  static int __init register_kprobe_prog_ops(void)  {  	bpf_register_prog_type(&kprobe_tl);  	bpf_register_prog_type(&tracepoint_tl); +	bpf_register_prog_type(&perf_event_tl);  	return 0;  }  late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static int profile_graph_entry(struct ftrace_graph_ent *trace)  { +	int index = trace->depth; +  	function_profile_call(trace->func, 0, NULL, NULL); + +	if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) +		current->ret_stack[index].subtime = 0; +  	return 1;  } @@ -1856,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,  	/* Update rec->flags */  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		/* We need to update only differences of filter_hash */  		in_old = !!ftrace_lookup_ip(old_hash, rec->ip);  		in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1878,6 +1888,10 @@ rollback:  	/* Roll back what we did above */  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (rec == end)  			goto err_out; @@ -2391,6 +2405,10 @@ void __weak ftrace_replace_code(int enable)  		return;  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		failed = __ftrace_replace_code(rec, enable);  		if (failed) {  			ftrace_bug(failed, rec); @@ -2757,7 +2775,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)  		struct dyn_ftrace *rec;  		do_for_each_ftrace_rec(pg, rec) { -			if (FTRACE_WARN_ON_ONCE(rec->flags)) +			if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))  				pr_warn("  %pS flags:%lx\n",  					(void *)rec->ip, rec->flags);  		} while_for_each_ftrace_rec(); @@ -3592,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)  		goto out_unlock;  	do_for_each_ftrace_rec(pg, rec) { + +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {  			ret = enter_record(hash, rec, clear_filter);  			if (ret < 0) { @@ -3787,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	do_for_each_ftrace_rec(pg, rec) { +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (!ftrace_match_record(rec, &func_g, NULL, 0))  			continue; @@ -4679,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)  	do_for_each_ftrace_rec(pg, rec) { +		if (rec->flags & FTRACE_FL_DISABLED) +			continue; +  		if (ftrace_match_record(rec, &func_g, NULL, 0)) {  			/* if it is in the array */  			exists = false; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..8696ce6bf2f6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void)   *   * Shows real state of the ring buffer if it is enabled or not.   */ -static int tracer_tracing_is_on(struct trace_array *tr) +int tracer_tracing_is_on(struct trace_array *tr)  {  	if (tr->trace_buffer.buffer)  		return ring_buffer_record_is_on(tr->trace_buffer.buffer); @@ -4123,6 +4123,30 @@ static const char readme_msg[] =  	"\t\t\t  traces\n"  #endif  #endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_KPROBE_EVENT +	"  kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" +	"\t\t\t  Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_UPROBE_EVENT +	"  uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" +	"\t\t\t  Write into this file to define/undefine new trace events.\n" +#endif +#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) +	"\t  accepts: event-definitions (one definition per line)\n" +	"\t   Format: p|r[:[<group>/]<event>] <place> [<args>]\n" +	"\t           -:[<group>/]<event>\n" +#ifdef CONFIG_KPROBE_EVENT +	"\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" +#endif +#ifdef CONFIG_UPROBE_EVENT +	"\t    place: <path>:<offset>\n" +#endif +	"\t     args: <name>=fetcharg[:type]\n" +	"\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" +	"\t           $stack<index>, $stack, $retval, $comm\n" +	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" +	"\t           b<bit-width>@<bit-offset>/<container-size>\n" +#endif  	"  events/\t\t- Directory containing all trace event subsystems:\n"  	"      enable\t\t- Write 0/1 to enable/disable tracing of all events\n"  	"  events/<system>/\t- Directory containing all trace events for <system>:\n" @@ -4945,7 +4969,7 @@ out:  	return ret;  } -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)  static ssize_t  tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -5124,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	struct trace_iterator *iter = filp->private_data;  	ssize_t sret; -	/* return any leftover data */ -	sret = trace_seq_to_user(&iter->seq, ubuf, cnt); -	if (sret != -EBUSY) -		return sret; - -	trace_seq_init(&iter->seq); -  	/*  	 * Avoid more than one consumer on a single file descriptor  	 * This is just a matter of traces coherency, the ring buffer itself  	 * is protected.  	 */  	mutex_lock(&iter->mutex); + +	/* return any leftover data */ +	sret = trace_seq_to_user(&iter->seq, ubuf, cnt); +	if (sret != -EBUSY) +		goto out; + +	trace_seq_init(&iter->seq); +  	if (iter->trace->read) {  		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);  		if (sret) @@ -5867,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = {  	.llseek		= generic_file_llseek,  }; -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)  static const struct file_operations tracing_max_lat_fops = {  	.open		= tracing_open_generic,  	.read		= tracing_max_lat_read, @@ -6163,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		return -EBUSY;  #endif -	if (splice_grow_spd(pipe, &spd)) -		return -ENOMEM; -  	if (*ppos & (PAGE_SIZE - 1))  		return -EINVAL; @@ -6175,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		len &= PAGE_MASK;  	} +	if (splice_grow_spd(pipe, &spd)) +		return -ENOMEM; +   again:  	trace_access_lock(iter->cpu_file);  	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -6232,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  	/* did we read anything? */  	if (!spd.nr_pages) {  		if (ret) -			return ret; +			goto out; +		ret = -EAGAIN;  		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) -			return -EAGAIN; +			goto out;  		ret = wait_on_pipe(iter, true);  		if (ret) -			return ret; +			goto out;  		goto again;  	}  	ret = splice_to_pipe(pipe, &spd); +out:  	splice_shrink_spd(&spd);  	return ret; @@ -7195,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)  	create_trace_options_dir(tr); -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)  	trace_create_file("tracing_max_latency", 0644, d_tracer,  			&tr->max_latency, &tracing_max_lat_fops);  #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -38,6 +38,7 @@ enum trace_type {  	TRACE_USER_STACK,  	TRACE_BLK,  	TRACE_BPUTS, +	TRACE_HWLAT,  	__TRACE_LAST_TYPE,  }; @@ -213,6 +214,8 @@ struct trace_array {  	 */  	struct trace_buffer	max_buffer;  	bool			allocated_snapshot; +#endif +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)  	unsigned long		max_latency;  #endif  	struct trace_pid_list	__rcu *filtered_pids; @@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\  		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\  		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\ +		IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT);	\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\  			  TRACE_MMIO_RW);				\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\ @@ -571,6 +575,7 @@ void tracing_reset_current(int cpu);  void tracing_reset_all_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp);  bool tracing_is_disabled(void); +int tracer_tracing_is_on(struct trace_array *tr);  struct dentry *trace_create_file(const char *name,  				 umode_t mode,  				 struct dentry *parent, diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch,  	FILTER_OTHER  ); + +FTRACE_ENTRY(hwlat, hwlat_entry, + +	TRACE_HWLAT, + +	F_STRUCT( +		__field(	u64,			duration	) +		__field(	u64,			outer_duration	) +		__field(	u64,			nmi_total_ts	) +		__field_struct( struct timespec,	timestamp	) +		__field_desc(	long,	timestamp,	tv_sec		) +		__field_desc(	long,	timestamp,	tv_nsec		) +		__field(	unsigned int,		nmi_count	) +		__field(	unsigned int,		seqnum		) +	), + +	F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", +		 __entry->seqnum, +		 __entry->tv_sec, +		 __entry->tv_nsec, +		 __entry->duration, +		 __entry->outer_duration, +		 __entry->nmi_total_ts, +		 __entry->nmi_count), + +	FILTER_OTHER +); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = {  static struct event_command trigger_traceoff_cmd = {  	.name			= "traceoff",  	.trigger_type		= ETT_TRACE_ONOFF, +	.flags			= EVENT_CMD_FL_POST_TRIGGER,  	.func			= event_trigger_callback,  	.reg			= register_trigger,  	.unreg			= unregister_trigger, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..4e480e870474 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,  /* Add a function return address to the trace stack on thread info.*/  int  ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, -			 unsigned long frame_pointer) +			 unsigned long frame_pointer, unsigned long *retp)  {  	unsigned long long calltime;  	int index; @@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,  	current->ret_stack[index].ret = ret;  	current->ret_stack[index].func = func;  	current->ret_stack[index].calltime = calltime; -	current->ret_stack[index].subtime = 0; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST  	current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +	current->ret_stack[index].retp = retp; +#endif  	*depth = current->curr_ret_stack;  	return 0; @@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  		return;  	} -#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST  	/*  	 * The arch may choose to record the frame pointer used  	 * and check it here to make sure that it is what we expect it @@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)  	return ret;  } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + *			   to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'.  If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack.  It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. + */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, +				    unsigned long ret, unsigned long *retp) +{ +	int index = task->curr_ret_stack; +	int i; + +	if (ret != (unsigned long)return_to_handler) +		return ret; + +	if (index < -1) +		index += FTRACE_NOTRACE_DEPTH; + +	if (index < 0) +		return ret; + +	for (i = 0; i <= index; i++) +		if (task->ret_stack[i].retp == retp) +			return task->ret_stack[i].ret; + +	return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, +				    unsigned long ret, unsigned long *retp) +{ +	int task_idx; + +	if (ret != (unsigned long)return_to_handler) +		return ret; + +	task_idx = task->curr_ret_stack; + +	if (!task->ret_stack || task_idx < *idx) +		return ret; + +	task_idx -= *idx; +	(*idx)++; + +	return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +  int __trace_graph_entry(struct trace_array *tr,  				struct ftrace_graph_ent *trace,  				unsigned long flags, @@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  	trace_seq_puts(s, "/* ");  	switch (iter->ent->type) { +	case TRACE_BPUTS: +		ret = trace_print_bputs_msg_only(iter); +		if (ret != TRACE_TYPE_HANDLED) +			return ret; +		break;  	case TRACE_BPRINT:  		ret = trace_print_bprintk_msg_only(iter);  		if (ret != TRACE_TYPE_HANDLED) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..b97286c48735 --- /dev/null +++ b/kernel/trace/trace_hwlat.c @@ -0,0 +1,633 @@ +/* + * trace_hwlatdetect.c - A simple Hardware Latency detector. + * + * Use this tracer to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this tracer is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + *          you should NEVER use this tracer while running in a production + *          environment requiring any kind of low-latency performance + *          guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <[email protected]> + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <[email protected]> + * + * Includes useful feedback from Clark Williams <[email protected]> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/cpumask.h> +#include <linux/delay.h> +#include "trace.h" + +static struct trace_array	*hwlat_trace; + +#define U64STR_SIZE		22			/* 20 digits max */ + +#define BANNER			"hwlat_detector: " +#define DEFAULT_SAMPLE_WINDOW	1000000			/* 1s */ +#define DEFAULT_SAMPLE_WIDTH	500000			/* 0.5s */ +#define DEFAULT_LAT_THRESHOLD	10			/* 10us */ + +/* sampling thread*/ +static struct task_struct *hwlat_kthread; + +static struct dentry *hwlat_sample_width;	/* sample width us */ +static struct dentry *hwlat_sample_window;	/* sample window us */ + +/* Save the previous tracing_thresh value */ +static unsigned long save_tracing_thresh; + +/* NMI timestamp counters */ +static u64 nmi_ts_start; +static u64 nmi_total_ts; +static int nmi_count; +static int nmi_cpu; + +/* Tells NMIs to call back to the hwlat tracer to record timestamps */ +bool trace_hwlat_callback_enabled; + +/* If the user changed threshold, remember it */ +static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; + +/* Individual latency samples are stored here when detected. */ +struct hwlat_sample { +	u64		seqnum;		/* unique sequence */ +	u64		duration;	/* delta */ +	u64		outer_duration;	/* delta (outer loop) */ +	u64		nmi_total_ts;	/* Total time spent in NMIs */ +	struct timespec	timestamp;	/* wall time */ +	int		nmi_count;	/* # NMIs during this sample */ +}; + +/* keep the global state somewhere. */ +static struct hwlat_data { + +	struct mutex lock;		/* protect changes */ + +	u64	count;			/* total since reset */ + +	u64	sample_window;		/* total sampling window (on+off) */ +	u64	sample_width;		/* active sampling portion of window */ + +} hwlat_data = { +	.sample_window		= DEFAULT_SAMPLE_WINDOW, +	.sample_width		= DEFAULT_SAMPLE_WIDTH, +}; + +static void trace_hwlat_sample(struct hwlat_sample *sample) +{ +	struct trace_array *tr = hwlat_trace; +	struct trace_event_call *call = &event_hwlat; +	struct ring_buffer *buffer = tr->trace_buffer.buffer; +	struct ring_buffer_event *event; +	struct hwlat_entry *entry; +	unsigned long flags; +	int pc; + +	pc = preempt_count(); +	local_save_flags(flags); + +	event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), +					  flags, pc); +	if (!event) +		return; +	entry	= ring_buffer_event_data(event); +	entry->seqnum			= sample->seqnum; +	entry->duration			= sample->duration; +	entry->outer_duration		= sample->outer_duration; +	entry->timestamp		= sample->timestamp; +	entry->nmi_total_ts		= sample->nmi_total_ts; +	entry->nmi_count		= sample->nmi_count; + +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event); +} + +/* Macros to encapsulate the time capturing infrastructure */ +#define time_type	u64 +#define time_get()	trace_clock_local() +#define time_to_us(x)	div_u64(x, 1000) +#define time_sub(a, b)	((a) - (b)) +#define init_time(a, b)	(a = b) +#define time_u64(a)	a + +void trace_hwlat_callback(bool enter) +{ +	if (smp_processor_id() != nmi_cpu) +		return; + +	/* +	 * Currently trace_clock_local() calls sched_clock() and the +	 * generic version is not NMI safe. +	 */ +	if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { +		if (enter) +			nmi_ts_start = time_get(); +		else +			nmi_total_ts = time_get() - nmi_ts_start; +	} + +	if (enter) +		nmi_count++; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called with interrupts disabled and with + * hwlat_data.lock held. + */ +static int get_sample(void) +{ +	struct trace_array *tr = hwlat_trace; +	time_type start, t1, t2, last_t2; +	s64 diff, total, last_total = 0; +	u64 sample = 0; +	u64 thresh = tracing_thresh; +	u64 outer_sample = 0; +	int ret = -1; + +	do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ + +	nmi_cpu = smp_processor_id(); +	nmi_total_ts = 0; +	nmi_count = 0; +	/* Make sure NMIs see this first */ +	barrier(); + +	trace_hwlat_callback_enabled = true; + +	init_time(last_t2, 0); +	start = time_get(); /* start timestamp */ + +	do { + +		t1 = time_get();	/* we'll look for a discontinuity */ +		t2 = time_get(); + +		if (time_u64(last_t2)) { +			/* Check the delta from outer loop (t2 to next t1) */ +			diff = time_to_us(time_sub(t1, last_t2)); +			/* This shouldn't happen */ +			if (diff < 0) { +				pr_err(BANNER "time running backwards\n"); +				goto out; +			} +			if (diff > outer_sample) +				outer_sample = diff; +		} +		last_t2 = t2; + +		total = time_to_us(time_sub(t2, start)); /* sample width */ + +		/* Check for possible overflows */ +		if (total < last_total) { +			pr_err("Time total overflowed\n"); +			break; +		} +		last_total = total; + +		/* This checks the inner loop (t1 to t2) */ +		diff = time_to_us(time_sub(t2, t1));     /* current diff */ + +		/* This shouldn't happen */ +		if (diff < 0) { +			pr_err(BANNER "time running backwards\n"); +			goto out; +		} + +		if (diff > sample) +			sample = diff; /* only want highest value */ + +	} while (total <= hwlat_data.sample_width); + +	barrier(); /* finish the above in the view for NMIs */ +	trace_hwlat_callback_enabled = false; +	barrier(); /* Make sure nmi_total_ts is no longer updated */ + +	ret = 0; + +	/* If we exceed the threshold value, we have found a hardware latency */ +	if (sample > thresh || outer_sample > thresh) { +		struct hwlat_sample s; + +		ret = 1; + +		/* We read in microseconds */ +		if (nmi_total_ts) +			do_div(nmi_total_ts, NSEC_PER_USEC); + +		hwlat_data.count++; +		s.seqnum = hwlat_data.count; +		s.duration = sample; +		s.outer_duration = outer_sample; +		s.timestamp = CURRENT_TIME; +		s.nmi_total_ts = nmi_total_ts; +		s.nmi_count = nmi_count; +		trace_hwlat_sample(&s); + +		/* Keep a running maximum ever recorded hardware latency */ +		if (sample > tr->max_latency) +			tr->max_latency = sample; +	} + +out: +	return ret; +} + +static struct cpumask save_cpumask; +static bool disable_migrate; + +static void move_to_next_cpu(void) +{ +	static struct cpumask *current_mask; +	int next_cpu; + +	if (disable_migrate) +		return; + +	/* Just pick the first CPU on first iteration */ +	if (!current_mask) { +		current_mask = &save_cpumask; +		get_online_cpus(); +		cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); +		put_online_cpus(); +		next_cpu = cpumask_first(current_mask); +		goto set_affinity; +	} + +	/* +	 * If for some reason the user modifies the CPU affinity +	 * of this thread, than stop migrating for the duration +	 * of the current test. +	 */ +	if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) +		goto disable; + +	get_online_cpus(); +	cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); +	next_cpu = cpumask_next(smp_processor_id(), current_mask); +	put_online_cpus(); + +	if (next_cpu >= nr_cpu_ids) +		next_cpu = cpumask_first(current_mask); + + set_affinity: +	if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ +		goto disable; + +	cpumask_clear(current_mask); +	cpumask_set_cpu(next_cpu, current_mask); + +	sched_setaffinity(0, current_mask); +	return; + + disable: +	disable_migrate = true; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * disable interrupts, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus preempting). + * Obviously this should never be used in production environments. + * + * Currently this runs on which ever CPU it was scheduled on, but most + * real-world hardware latency situations occur across several CPUs, + * but we might later generalize this if we find there are any actualy + * systems with alternate SMI delivery or other hardware latencies. + */ +static int kthread_fn(void *data) +{ +	u64 interval; + +	while (!kthread_should_stop()) { + +		move_to_next_cpu(); + +		local_irq_disable(); +		get_sample(); +		local_irq_enable(); + +		mutex_lock(&hwlat_data.lock); +		interval = hwlat_data.sample_window - hwlat_data.sample_width; +		mutex_unlock(&hwlat_data.lock); + +		do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + +		/* Always sleep for at least 1ms */ +		if (interval < 1) +			interval = 1; + +		if (msleep_interruptible(interval)) +			break; +	} + +	return 0; +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts the kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(struct trace_array *tr) +{ +	struct task_struct *kthread; + +	kthread = kthread_create(kthread_fn, NULL, "hwlatd"); +	if (IS_ERR(kthread)) { +		pr_err(BANNER "could not start sampling thread\n"); +		return -ENOMEM; +	} +	hwlat_kthread = kthread; +	wake_up_process(kthread); + +	return 0; +} + +/** + * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_kthread(void) +{ +	if (!hwlat_kthread) +		return; +	kthread_stop(hwlat_kthread); +	hwlat_kthread = NULL; +} + +/* + * hwlat_read - Wrapper read function for reading both window and width + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a generic read implementation for the global state + * "hwlat_data" structure filesystem entries. + */ +static ssize_t hwlat_read(struct file *filp, char __user *ubuf, +			  size_t cnt, loff_t *ppos) +{ +	char buf[U64STR_SIZE]; +	u64 *entry = filp->private_data; +	u64 val; +	int len; + +	if (!entry) +		return -EFAULT; + +	if (cnt > sizeof(buf)) +		cnt = sizeof(buf); + +	val = *entry; + +	len = snprintf(buf, sizeof(buf), "%llu\n", val); + +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +/** + * hwlat_width_write - Write function for "width" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "width" interface + * to the hardware latency detector. It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less that the total window size. + */ +static ssize_t +hwlat_width_write(struct file *filp, const char __user *ubuf, +		  size_t cnt, loff_t *ppos) +{ +	u64 val; +	int err; + +	err = kstrtoull_from_user(ubuf, cnt, 10, &val); +	if (err) +		return err; + +	mutex_lock(&hwlat_data.lock); +	if (val < hwlat_data.sample_window) +		hwlat_data.sample_width = val; +	else +		err = -EINVAL; +	mutex_unlock(&hwlat_data.lock); + +	if (err) +		return err; + +	return cnt; +} + +/** + * hwlat_window_write - Write function for "window" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "window" interface + * to the hardware latency detetector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enfoced that any value written must be greater than the sample width + * size, or an error results. + */ +static ssize_t +hwlat_window_write(struct file *filp, const char __user *ubuf, +		   size_t cnt, loff_t *ppos) +{ +	u64 val; +	int err; + +	err = kstrtoull_from_user(ubuf, cnt, 10, &val); +	if (err) +		return err; + +	mutex_lock(&hwlat_data.lock); +	if (hwlat_data.sample_width < val) +		hwlat_data.sample_window = val; +	else +		err = -EINVAL; +	mutex_unlock(&hwlat_data.lock); + +	if (err) +		return err; + +	return cnt; +} + +static const struct file_operations width_fops = { +	.open		= tracing_open_generic, +	.read		= hwlat_read, +	.write		= hwlat_width_write, +}; + +static const struct file_operations window_fops = { +	.open		= tracing_open_generic, +	.read		= hwlat_read, +	.write		= hwlat_window_write, +}; + +/** + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "hwlat_detector". + * It creates the hwlat_detector directory in the tracing directory, + * and within that directory is the count, width and window files to + * change and view those values. + */ +static int init_tracefs(void) +{ +	struct dentry *d_tracer; +	struct dentry *top_dir; + +	d_tracer = tracing_init_dentry(); +	if (IS_ERR(d_tracer)) +		return -ENOMEM; + +	top_dir = tracefs_create_dir("hwlat_detector", d_tracer); +	if (!top_dir) +		return -ENOMEM; + +	hwlat_sample_window = tracefs_create_file("window", 0640, +						  top_dir, +						  &hwlat_data.sample_window, +						  &window_fops); +	if (!hwlat_sample_window) +		goto err; + +	hwlat_sample_width = tracefs_create_file("width", 0644, +						 top_dir, +						 &hwlat_data.sample_width, +						 &width_fops); +	if (!hwlat_sample_width) +		goto err; + +	return 0; + + err: +	tracefs_remove_recursive(top_dir); +	return -ENOMEM; +} + +static void hwlat_tracer_start(struct trace_array *tr) +{ +	int err; + +	err = start_kthread(tr); +	if (err) +		pr_err(BANNER "Cannot start hwlat kthread\n"); +} + +static void hwlat_tracer_stop(struct trace_array *tr) +{ +	stop_kthread(); +} + +static bool hwlat_busy; + +static int hwlat_tracer_init(struct trace_array *tr) +{ +	/* Only allow one instance to enable this */ +	if (hwlat_busy) +		return -EBUSY; + +	hwlat_trace = tr; + +	disable_migrate = false; +	hwlat_data.count = 0; +	tr->max_latency = 0; +	save_tracing_thresh = tracing_thresh; + +	/* tracing_thresh is in nsecs, we speak in usecs */ +	if (!tracing_thresh) +		tracing_thresh = last_tracing_thresh; + +	if (tracer_tracing_is_on(tr)) +		hwlat_tracer_start(tr); + +	hwlat_busy = true; + +	return 0; +} + +static void hwlat_tracer_reset(struct trace_array *tr) +{ +	stop_kthread(); + +	/* the tracing threshold is static between runs */ +	last_tracing_thresh = tracing_thresh; + +	tracing_thresh = save_tracing_thresh; +	hwlat_busy = false; +} + +static struct tracer hwlat_tracer __read_mostly = +{ +	.name		= "hwlat", +	.init		= hwlat_tracer_init, +	.reset		= hwlat_tracer_reset, +	.start		= hwlat_tracer_start, +	.stop		= hwlat_tracer_stop, +	.allow_instances = true, +}; + +__init static int init_hwlat_tracer(void) +{ +	int ret; + +	mutex_init(&hwlat_data.lock); + +	ret = register_tracer(&hwlat_tracer); +	if (ret) +		return ret; + +	init_tracefs(); + +	return 0; +} +late_initcall(init_hwlat_tracer); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = {  	ASSIGN_FETCH_TYPE(s16, u16, 1),  	ASSIGN_FETCH_TYPE(s32, u32, 1),  	ASSIGN_FETCH_TYPE(s64, u64, 1), +	ASSIGN_FETCH_TYPE_ALIAS(x8,  u8,  u8,  0), +	ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), +	ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), +	ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),  	ASSIGN_FETCH_TYPE_END  }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = {  	.funcs		= &trace_user_stack_funcs,  }; +/* TRACE_HWLAT */ +static enum print_line_t +trace_hwlat_print(struct trace_iterator *iter, int flags, +		  struct trace_event *event) +{ +	struct trace_entry *entry = iter->ent; +	struct trace_seq *s = &iter->seq; +	struct hwlat_entry *field; + +	trace_assign_type(field, entry); + +	trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", +			 field->seqnum, +			 field->duration, +			 field->outer_duration, +			 field->timestamp.tv_sec, +			 field->timestamp.tv_nsec); + +	if (field->nmi_count) { +		/* +		 * The generic sched_clock() is not NMI safe, thus +		 * we only record the count and not the time. +		 */ +		if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) +			trace_seq_printf(s, " nmi-total:%llu", +					 field->nmi_total_ts); +		trace_seq_printf(s, " nmi-count:%u", +				 field->nmi_count); +	} + +	trace_seq_putc(s, '\n'); + +	return trace_handle_return(s); +} + + +static enum print_line_t +trace_hwlat_raw(struct trace_iterator *iter, int flags, +		struct trace_event *event) +{ +	struct hwlat_entry *field; +	struct trace_seq *s = &iter->seq; + +	trace_assign_type(field, iter->ent); + +	trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", +			 field->duration, +			 field->outer_duration, +			 field->timestamp.tv_sec, +			 field->timestamp.tv_nsec, +			 field->seqnum); + +	return trace_handle_return(s); +} + +static struct trace_event_functions trace_hwlat_funcs = { +	.trace		= trace_hwlat_print, +	.raw		= trace_hwlat_raw, +}; + +static struct trace_event trace_hwlat_event = { +	.type		= TRACE_HWLAT, +	.funcs		= &trace_hwlat_funcs, +}; +  /* TRACE_BPUTS */  static enum print_line_t  trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = {  	&trace_bputs_event,  	&trace_bprint_event,  	&trace_print_event, +	&trace_hwlat_event,  	NULL  }; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -36,24 +36,28 @@ const char *reserved_field_names[] = {  };  /* Printing  in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt)				\ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name,	\ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt)			\ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name,	\  				void *data, void *ent)			\  {									\  	trace_seq_printf(s, " %s=" fmt, name, *(type *)data);		\  	return !trace_seq_has_overflowed(s);				\  }									\ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt;				\ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") -DEFINE_BASIC_PRINT_TYPE_FUNC(s8,  "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt;				\ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8,  u8,  "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8,  s8,  "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") +DEFINE_BASIC_PRINT_TYPE_FUNC(x8,  u8,  "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")  /* Print type function for string type */  int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8);  DECLARE_BASIC_PRINT_TYPE_FUNC(s16);  DECLARE_BASIC_PRINT_TYPE_FUNC(s32);  DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(x8); +DECLARE_BASIC_PRINT_TYPE_FUNC(x16); +DECLARE_BASIC_PRINT_TYPE_FUNC(x32); +DECLARE_BASIC_PRINT_TYPE_FUNC(x64); +  DECLARE_BASIC_PRINT_TYPE_FUNC(string);  #define FETCH_FUNC_NAME(method, type)	fetch_##method##_##type @@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32)		\  DEFINE_FETCH_##method(u64)  /* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t +#define __DEFAULT_FETCH_TYPE(t) x##t  #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)  #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)  #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) @@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype),			\  #define ASSIGN_FETCH_TYPE(ptype, ftype, sign)			\  	__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) +/* If ptype is an alias of atype, use this macro (show atype in format) */ +#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign)		\ +	__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) +  #define ASSIGN_FETCH_TYPE_END {}  #define FETCH_TYPE_STRING	0 diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call)  	if (!sys_perf_refcount_enter)  		ret = register_trace_sys_enter(perf_syscall_enter, NULL);  	if (ret) { -		pr_info("event trace: Could not activate" -				"syscall entry trace point"); +		pr_info("event trace: Could not activate syscall entry trace point");  	} else {  		set_bit(num, enabled_perf_enter_syscalls);  		sys_perf_refcount_enter++; @@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call)  	if (!sys_perf_refcount_exit)  		ret = register_trace_sys_exit(perf_syscall_exit, NULL);  	if (ret) { -		pr_info("event trace: Could not activate" -				"syscall exit trace point"); +		pr_info("event trace: Could not activate syscall exit trace point");  	} else {  		set_bit(num, enabled_perf_exit_syscalls);  		sys_perf_refcount_exit++; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..0913693caf6e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = {  	ASSIGN_FETCH_TYPE(s16, u16, 1),  	ASSIGN_FETCH_TYPE(s32, u32, 1),  	ASSIGN_FETCH_TYPE(s64, u64, 1), +	ASSIGN_FETCH_TYPE_ALIAS(x8,  u8,  u8,  0), +	ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), +	ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), +	ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),  	ASSIGN_FETCH_TYPE_END  }; @@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv)  		pr_info("Probe point is not specified.\n");  		return -EINVAL;  	} -	if (isdigit(argv[1][0])) { -		pr_info("probe point must be have a filename.\n"); -		return -EINVAL; -	}  	arg = strchr(argv[1], ':');  	if (!arg) {  		ret = -EINVAL; |