diff options
Diffstat (limited to 'kernel/trace')
26 files changed, 2945 insertions, 1236 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d5038005eb5d..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,7 +134,8 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB + select TASKS_RCU if PREEMPT help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -429,7 +430,7 @@ config BLK_DEV_IO_TRACE If unsure, say N. -config KPROBE_EVENT +config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" @@ -447,7 +448,7 @@ config KPROBE_EVENT This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. -config UPROBE_EVENT +config UPROBE_EVENTS bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU @@ -455,7 +456,7 @@ config UPROBE_EVENT select UPROBES select PROBE_EVENTS select TRACING - default n + default y help This allows the user to add tracing events on top of userspace dynamic events (similar to tracepoints) on the fly via the trace @@ -466,7 +467,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS + depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS bool default y help @@ -666,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N -config TRACE_ENUM_MAP_FILE - bool "Show enum mappings for trace events" +config TRACE_EVAL_MAP_FILE + bool "Show eval mappings for trace events" depends on TRACING help - The "print fmt" of the trace events will show the enum names instead - of their values. This can cause problems for user space tools that - use this string to parse the raw data as user space does not know + The "print fmt" of the trace events will show the enum/sizeof names + instead of their values. 
This can cause problems for user space tools + that use this string to parse the raw data as user space does not know how to convert the string to its value. To fix this, there's a special macro in the kernel that can be used - to convert the enum into its value. If this macro is used, then the - print fmt strings will have the enums converted to their values. + to convert an enum/sizeof into its value. If this macro is used, then + the print fmt strings will be converted to their values. If something does not get converted properly, this option can be - used to show what enums the kernel tried to convert. + used to show what enums/sizeof the kernel tried to convert. - This option is for debugging the enum conversions. A file is created - in the tracing directory called "enum_map" that will show the enum + This option is for debugging the conversions. A file is created + in the tracing directory called "eval_map" that will show the names matched with their values and what trace event system they belong too. Normally, the mapping of the strings to values will be freed after boot up or module load. With this option, they will not be freed, as - they are needed for the "enum_map" file. Enabling this option will + they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. 
If unsure, say N diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e57980845549..90f2701d92a7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -57,7 +57,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o -obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o @@ -66,7 +66,7 @@ ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o -obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95cecbf67f5c..bc364f86100a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -28,6 +28,8 @@ #include <linux/uaccess.h> #include <linux/list.h> +#include "../../block/blk.h" + #include <trace/events/block.h> #include "trace_output.h" @@ -292,9 +294,6 @@ record_it: local_irq_restore(flags); } -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); @@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - 
mutex_lock(&blk_tree_mutex); - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) { - mutex_unlock(&blk_tree_mutex); - goto err; - } - } - mutex_unlock(&blk_tree_mutex); - - dir = debugfs_create_dir(buts->name, blk_tree_root); + if (!blk_debugfs_root) + goto err; + dir = debugfs_lookup(buts->name, blk_debugfs_root); + if (!dir) + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; - bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + ret = 0; err: - blk_trace_free(bt); + if (dir && !bt->dir) + dput(dir); + if (ret) + blk_trace_free(bt); return ret; } @@ -695,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q) /** * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for * @rq: the source request + * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @@ -704,56 +699,46 @@ void blk_trace_shutdown(struct request_queue *q) * Records an action against a request. Will log the bio offset + size. 
* **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, +static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt = rq->q->blk_trace; if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); - } else { + else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); - } -} -static void blk_add_trace_rq_abort(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), + rq->cmd_flags, what, error, 0, NULL); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(void *ignore, - struct request_queue *q, - struct request *rq, - unsigned int nr_bytes) +static void blk_add_trace_rq_complete(void *ignore, struct request *rq, + int error, unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); } /** @@ -882,7 +867,7 @@ static void blk_add_trace_split(void 
*ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu); } } @@ -915,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r); } @@ -948,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r); } @@ -972,12 +957,8 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); - else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, + BLK_TA_DRV_DATA, 0, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -985,8 +966,6 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); - WARN_ON(ret); ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); @@ -1039,7 +1018,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } @@ -1684,14 +1662,14 @@ static ssize_t 
sysfs_blk_trace_attr_store(struct device *dev, goto out; if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { + if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */ ret = blk_trace_str2mask(buf); if (ret < 0) goto out; value = ret; } - } else if (sscanf(buf, "%llu", &value) != 1) + } else if (kstrtoull(buf, 0, &value)) goto out; ret = -ENXIO; @@ -1752,31 +1730,6 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_dump_cmd(char *buf, struct request *rq) -{ - int i, end; - int len = rq->cmd_len; - unsigned char *cmd = rq->cmd; - - if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { - buf[0] = '\0'; - return; - } - - for (end = len - 1; end >= 0; end--) - if (cmd[end]) - break; - end++; - - for (i = 0; i < len; i++) { - buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); - if (i == end && end != len - 1) { - sprintf(buf, " .."); - break; - } - } -} - void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..37385193a608 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EPERM; if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) return -EPERM; @@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, 
- .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto *bpf_get_probe_write_proto(void) @@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) } /* - * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed + * Only limited trace_printk() conversion specifiers allowed: + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } - if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') return -EINVAL; fmt_cnt++; } @@ -213,8 +214,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_trace_printk_proto(void) @@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - struct perf_event *event; + u64 value = 0; + int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - event = ee->event; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - - /* make sure event is local and doesn't have pmu::count */ - if (unlikely(event->oncpu != cpu || event->pmu->count)) - return -EINVAL; - + err = perf_event_read_local(ee->event, &value); /* - * we don't know if the 
function is run successfully by the - * return value. It can be judged in other places, such as - * eBPF programs. + * this api is ugly since we miss [-22..-2] range of valid + * counter values, but that's uapi */ - return perf_event_read_local(event); + if (err) + return err; + return value; } static const struct bpf_func_proto bpf_perf_event_read_proto = { @@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw) { struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; @@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = raw; - perf_event_output(event, &sample_data, regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = raw; + perf_event_output(event, sd, regs); return 0; } @@ -329,8 +326,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -395,6 +392,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will likely not fill the entire + * buffer, 
but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +459,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } @@ -451,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -459,19 +488,21 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return false; if (off % size != 0) return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. 
+ */ + if (off + size > sizeof(struct pt_regs)) + return false; + return true; } -static const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl = { - .ops = &kprobe_prog_ops, - .type = BPF_PROG_TYPE_KPROBE, -}; - BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -492,8 +523,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, @@ -532,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -540,82 +571,73 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (off % size != 0) return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } -static const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl = { - .ops = &tracepoint_prog_ops, - .type = BPF_PROG_TYPE_TRACEPOINT, -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { + const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, + 
sample_period); + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (off == offsetof(struct bpf_perf_event_data, sample_period)) { - if (size != sizeof(u64)) + + switch (off) { + case bpf_ctx_range(struct bpf_perf_event_data, sample_period): + bpf_ctx_record_field_size(info, size_sp); + if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) return false; - } else { + break; + default: if (size != sizeof(long)) return false; } + return true; } -static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): - BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - data), dst_reg, src_reg, + data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); - *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, - offsetof(struct perf_sample_data, period)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, period, 8, + target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - regs), dst_reg, src_reg, + regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); break; } return insn - insn_buf; } -static const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_prog_ops = { .get_func_proto = 
tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; - -static struct bpf_prog_type_list perf_event_tl = { - .ops = &perf_event_prog_ops, - .type = BPF_PROG_TYPE_PERF_EVENT, -}; - -static int __init register_kprobe_prog_ops(void) -{ - bpf_register_prog_type(&kprobe_tl); - bpf_register_prog_type(&tracepoint_tl); - bpf_register_prog_type(&perf_event_tl); - return 0; -} -late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb230f06ba41..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -15,6 +15,7 @@ #include <linux/stop_machine.h> #include <linux/clocksource.h> +#include <linux/sched/task.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/suspend.h> @@ -35,6 +36,7 @@ #include <trace/events/sched.h> +#include <asm/sections.h> #include <asm/setup.h> #include "trace_output.h" @@ -111,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -167,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -273,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. 
*/ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -284,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -346,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. We need to make sure @@ -358,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -366,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. 
*/ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1094,27 +1107,18 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct list_head free_list; -}; - struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; }; -struct ftrace_hash { - unsigned long size_bits; - struct hlist_head *buckets; - unsigned long count; - struct rcu_head rcu; +struct ftrace_func_probe { + struct ftrace_probe_ops *probe_ops; + struct ftrace_ops ops; + struct trace_array *tr; + struct list_head list; + void *data; + int ref; }; /* @@ -1192,26 +1196,24 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +static __always_inline unsigned long +ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) { - return !hash || !hash->count; + if (hash->size_bits > 0) + return hash_long(ip, hash->size_bits); + + return 0; } -static struct ftrace_func_entry * -ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +/* Only use this function if ftrace_hash_empty() has already been tested */ +static __always_inline struct ftrace_func_entry * +__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - if (ftrace_hash_empty(hash)) - return NULL; - - if (hash->size_bits > 0) - key = hash_long(ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; 
hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { @@ -1221,17 +1223,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } +/** + * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash + * @hash: The hash to look at + * @ip: The instruction pointer to test + * + * Search a given @hash to see if a given instruction pointer (@ip) + * exists in it. + * + * Returns the entry that holds the @ip if found. NULL otherwise. + */ +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + if (ftrace_hash_empty(hash)) + return NULL; + + return __ftrace_lookup_ip(hash, ip); +} + static void __add_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; - if (hash->size_bits) - key = hash_long(entry->ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, entry->ip); hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; @@ -1264,7 +1281,7 @@ static void remove_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { - hlist_del(&entry->hlist); + hlist_del_rcu(&entry->hlist); hash->count--; } @@ -1287,6 +1304,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) FTRACE_WARN_ON(hash->count); } +static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) +{ + list_del(&ftrace_mod->list); + kfree(ftrace_mod->module); + kfree(ftrace_mod->func); + kfree(ftrace_mod); +} + +static void clear_ftrace_mod_list(struct list_head *head) +{ + struct ftrace_mod_load *p, *n; + + /* stack tracer isn't supported yet */ + if (!head) + return; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(p, n, head, list) + free_ftrace_mod(p); + mutex_unlock(&ftrace_lock); +} + static void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) @@ -1340,6 +1379,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return hash; } + +static int ftrace_add_mod(struct 
trace_array *tr, + const char *func, const char *module, + int enable) +{ + struct ftrace_mod_load *ftrace_mod; + struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; + + ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); + if (!ftrace_mod) + return -ENOMEM; + + ftrace_mod->func = kstrdup(func, GFP_KERNEL); + ftrace_mod->module = kstrdup(module, GFP_KERNEL); + ftrace_mod->enable = enable; + + if (!ftrace_mod->func || !ftrace_mod->module) + goto out_free; + + list_add(&ftrace_mod->list, mod_head); + + return 0; + + out_free: + free_ftrace_mod(ftrace_mod); + + return -ENOMEM; +} + static struct ftrace_hash * alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { @@ -1353,6 +1421,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) if (!new_hash) return NULL; + if (hash) + new_hash->flags = hash->flags; + /* Empty hash? */ if (ftrace_hash_empty(hash)) return new_hash; @@ -1383,9 +1454,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static int -ftrace_hash_move(struct ftrace_ops *ops, int enable, - struct ftrace_hash **dst, struct ftrace_hash *src) +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tn; @@ -1393,21 +1463,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; - /* Reject setting notrace hash on IPMODIFY ftrace_ops */ - if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) - return -EINVAL; - /* - * If the new source is empty, just free dst and assign it - * the empty_hash. + * If the new source is empty, just return the empty_hash. 
*/ - if (!src->count) { - new_hash = EMPTY_HASH; - goto update; - } + if (ftrace_hash_empty(src)) + return EMPTY_HASH; /* * Make the hash size about 1/2 the # found @@ -1421,7 +1483,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + return NULL; + + new_hash->flags = src->flags; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1432,7 +1496,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } -update: + return new_hash; +} + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_hash *new_hash; + int ret; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + new_hash = __ftrace_hash_move(src); + if (!new_hash) + return -ENOMEM; + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ if (enable) { /* IPMODIFY should be updated only when filter_hash updating */ @@ -1466,9 +1547,9 @@ static bool hash_contains_ip(unsigned long ip, * notrace hash is considered not in the notrace hash. 
*/ return (ftrace_hash_empty(hash->filter_hash) || - ftrace_lookup_ip(hash->filter_hash, ip)) && + __ftrace_lookup_ip(hash->filter_hash, ip)) && (ftrace_hash_empty(hash->notrace_hash) || - !ftrace_lookup_ip(hash->notrace_hash, ip)); + !__ftrace_lookup_ip(hash->notrace_hash, ip)); } /* @@ -1499,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -1636,7 +1717,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, struct dyn_ftrace *rec; bool update = false; int count = 0; - int all = 0; + int all = false; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) @@ -1657,7 +1738,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, hash = ops->func_hash->filter_hash; other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) - all = 1; + all = true; } else { inc = !inc; hash = ops->func_hash->notrace_hash; @@ -2770,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -2792,18 +2874,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * callers are done before leaving this function. * The same goes for freeing the per_cpu data of the per_cpu * ops. - * - * Again, normal synchronize_sched() is not good enough. - * We need to do a hard force of sched synchronization. 
- * This is because we use preempt_disable() to do RCU, but - * the function tracers can be called where RCU is not watching - * (like before user_exit()). We can not rely on the RCU - * infrastructure to do the synchronization, thus we must do it - * ourselves. */ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { + /* + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ schedule_on_each_cpu(ftrace_sync); + /* + * When the kernel is preemptive, tasks can be preempted + * while on a ftrace trampoline. Just scheduling a task on + * a CPU is not good enough to flush them. Calling + * synchronize_rcu_tasks() will wait for those tasks to + * execute and either schedule voluntarily or enter user space. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_tasks(); + arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -2880,7 +2972,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ @@ -3037,37 +3129,69 @@ ftrace_allocate_pages(unsigned long num_to_init) struct ftrace_iterator { loff_t pos; loff_t func_pos; + loff_t mod_pos; struct ftrace_page *pg; struct dyn_ftrace *func; struct ftrace_func_probe *probe; + struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; - int hidx; + struct trace_array *tr; + struct list_head *mod_list; + int pidx; int idx; unsigned flags; }; static void * -t_hash_next(struct seq_file *m, 
loff_t *pos) +t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->ops->private; + struct list_head *func_probes; + struct ftrace_hash *hash; + struct list_head *next; struct hlist_node *hnd = NULL; struct hlist_head *hhd; + int size; (*pos)++; iter->pos = *pos; - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + if (!tr) return NULL; - hhd = &ftrace_func_hash[iter->hidx]; + func_probes = &tr->func_probes; + if (list_empty(func_probes)) + return NULL; + + if (!iter->probe) { + next = func_probes->next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; if (hlist_empty(hhd)) { - iter->hidx++; + iter->pidx++; hnd = NULL; goto retry; } @@ -3077,7 +3201,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) else { hnd = hnd->next; if (!hnd) { - iter->hidx++; + iter->pidx++; goto retry; } } @@ -3085,26 +3209,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) if (WARN_ON_ONCE(!hnd)) return NULL; - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); return iter; } -static void *t_hash_start(struct seq_file *m, loff_t *pos) +static void *t_probe_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_DO_HASH)) + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; - if (iter->func_pos > *pos) + 
if (iter->mod_pos > *pos) return NULL; - iter->hidx = 0; - for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; + for (l = 0; l <= (*pos - iter->mod_pos); ) { + p = t_probe_next(m, &l); if (!p) break; } @@ -3112,50 +3238,118 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) return NULL; /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; + iter->flags |= FTRACE_ITER_PROBE; return iter; } static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_func_probe *rec; + struct ftrace_func_entry *probe_entry; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, rec->data); + probe_ops = probe->probe_ops; - seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); + if (probe_ops->print) + return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data); - if (rec->data) - seq_printf(m, ":%p", rec->data); - seq_putc(m, '\n'); + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, + (void *)probe_ops->func); return 0; } static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_mod_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; - struct dyn_ftrace *rec = NULL; + struct trace_array *tr = iter->tr; - if (unlikely(ftrace_disabled)) + (*pos)++; + iter->pos = *pos; + + iter->mod_list = iter->mod_list->next; + + if (iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) { + iter->flags &= ~FTRACE_ITER_MOD; return NULL; + } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, pos); + iter->mod_pos = 
*pos; - (*pos)++; - iter->pos = iter->func_pos = *pos; + return iter; +} + +static void *t_mod_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (iter->func_pos > *pos) + return NULL; + + iter->mod_pos = iter->func_pos; + + /* probes are only available if tr is set */ + if (!iter->tr) + return NULL; + + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_mod_next(m, &l); + if (!p) + break; + } + if (!p) { + iter->flags &= ~FTRACE_ITER_MOD; + return t_probe_start(m, pos); + } + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_MOD; + + return iter; +} + +static int +t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_mod_load *ftrace_mod; + struct trace_array *tr = iter->tr; + + if (WARN_ON_ONCE(!iter->mod_list) || + iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) + return -EIO; - if (iter->flags & FTRACE_ITER_PRINTALL) - return t_hash_start(m, pos); + ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); + + if (ftrace_mod->func) + seq_printf(m, "%s", ftrace_mod->func); + else + seq_putc(m, '*'); + + seq_printf(m, ":mod:%s\n", ftrace_mod->module); + + return 0; +} + +static void * +t_func_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; retry: if (iter->idx >= iter->pg->index) { @@ -3166,11 +3360,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || - - ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || + if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + !ftrace_lookup_ip(iter->hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -3181,24 +3372,54 
@@ t_next(struct seq_file *m, void *v, loff_t *pos) } if (!rec) - return t_hash_start(m, pos); + return NULL; + iter->pos = iter->func_pos = *pos; iter->func = rec; return iter; } +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + loff_t l = *pos; /* t_probe_start() must use original pos */ + void *ret; + + if (unlikely(ftrace_disabled)) + return NULL; + + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_next(m, pos); + + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_next(m, pos); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + /* next must increment pos, and t_probe_start does not */ + (*pos)++; + return t_mod_start(m, &l); + } + + ret = t_func_next(m, pos); + + if (!ret) + return t_mod_start(m, &l); + + return ret; +} + static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); } static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -3218,20 +3439,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. 
*/ - if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->func_hash->filter_hash)) || - (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->func_hash->notrace_hash))) { + if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + ftrace_hash_empty(iter->hash)) { + iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_hash_start(m, pos); + return t_mod_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; + iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3241,13 +3461,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { - p = t_next(m, p, &l); + p = t_func_next(m, &l); if (!p) break; } if (!p) - return t_hash_start(m, pos); + return t_mod_start(m, pos); return iter; } @@ -3278,8 +3498,11 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); + + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) @@ -3340,12 +3563,13 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENODEV; iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); - if (iter) { - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - } + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; - return iter ? 
0 : -ENOMEM; + return 0; } static int @@ -3354,13 +3578,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); - if (iter) { - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - } + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; - return iter ? 0 : -ENOMEM; + return 0; } /** @@ -3385,6 +3610,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, { struct ftrace_iterator *iter; struct ftrace_hash *hash; + struct list_head *mod_head; + struct trace_array *tr = ops->private; int ret = 0; ftrace_ops_init(ops); @@ -3403,21 +3630,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; + iter->tr = tr; mutex_lock(&ops->func_hash->regex_lock); - if (flag & FTRACE_ITER_NOTRACE) + if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - else + mod_head = tr ? &tr->mod_notrace : NULL; + } else { hash = ops->func_hash->filter_hash; + mod_head = tr ? 
&tr->mod_trace : NULL; + } + + iter->mod_list = mod_head; if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; - if (file->f_flags & O_TRUNC) + if (file->f_flags & O_TRUNC) { iter->hash = alloc_ftrace_hash(size_bits); - else + clear_ftrace_mod_list(mod_head); + } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + } if (!iter->hash) { trace_parser_put(&iter->parser); @@ -3425,7 +3660,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, ret = -ENOMEM; goto out_unlock; } - } + } else + iter->hash = hash; if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3455,7 +3691,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_ops *ops = inode->i_private; return ftrace_regex_open(ops, - FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); } @@ -3558,22 +3794,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, /* blank module name to match all modules */ if (!mod_g->len) { /* blank module globbing: modname xor exclude_mod */ - if ((!exclude_mod) != (!modname)) + if (!exclude_mod != !modname) goto func_match; return 0; } - /* not matching the module */ - if (!modname || !mod_matches) { - if (exclude_mod) - goto func_match; - else - return 0; - } - - if (mod_matches && exclude_mod) + /* + * exclude_mod is set to trace everything but the given + * module. If it is set and the module matches, then + * return 0. If it is not set, and the module doesn't match + * also return 0. Otherwise, check the function to see if + * that matches. 
+ */ + if (!mod_matches == !exclude_mod) return 0; - func_match: /* blank search means to match all funcs in the mod */ if (!func_g->len) @@ -3594,7 +3828,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, @@ -3639,6 +3873,215 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) return match_records(hash, buff, len, NULL); } +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) +{ + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); +} + +static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, + struct ftrace_hash **orig_hash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + int ret; + + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + return ret; +} + +static bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + const char 
this_mod[] = "__this_module"; + const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; + char modname[modname_size + 1]; + unsigned long val; + int n; + + n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + + if (n > modname_size) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + +static int cache_mod(struct trace_array *tr, + const char *func, char *module, int enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; + int ret; + + mutex_lock(&ftrace_lock); + + /* We do not cache inverse filters */ + if (func[0] == '!') { + func++; + ret = -EINVAL; + + /* Look to remove this hash */ + list_for_each_entry_safe(ftrace_mod, n, head, list) { + if (strcmp(ftrace_mod->module, module) != 0) + continue; + + /* no func matches all */ + if (strcmp(func, "*") == 0 || + (ftrace_mod->func && + strcmp(ftrace_mod->func, func) == 0)) { + ret = 0; + free_ftrace_mod(ftrace_mod); + continue; + } + } + goto out; + } + + ret = -EINVAL; + /* We only care about modules that have not been loaded yet */ + if (module_exists(module)) + goto out; + + /* Save this string off, and execute it when the module is loaded */ + ret = ftrace_add_mod(tr, func, module, enable); + out: + mutex_unlock(&ftrace_lock); + + return ret; +} + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable); + +#ifdef CONFIG_MODULES +static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, + char *mod, bool enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct ftrace_hash **orig_hash, *new_hash; + LIST_HEAD(process_mods); + char *func; + int ret; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, + *orig_hash); + if (!new_hash) + goto 
out; /* warn? */ + + mutex_lock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, head, list) { + + if (strcmp(ftrace_mod->module, mod) != 0) + continue; + + if (ftrace_mod->func) + func = kstrdup(ftrace_mod->func, GFP_KERNEL); + else + func = kstrdup("*", GFP_KERNEL); + + if (!func) /* warn? */ + continue; + + list_del(&ftrace_mod->list); + list_add(&ftrace_mod->list, &process_mods); + + /* Use the newly allocated func, as it may be "*" */ + kfree(ftrace_mod->func); + ftrace_mod->func = func; + } + + mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { + + func = ftrace_mod->func; + + /* Grabs ftrace_lock, which is why we have this extra step */ + match_records(new_hash, func, strlen(func), mod); + free_ftrace_mod(ftrace_mod); + } + + if (enable && list_empty(head)) + new_hash->flags &= ~FTRACE_HASH_FL_MOD; + + mutex_lock(&ftrace_lock); + + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + new_hash, enable); + mutex_unlock(&ftrace_lock); + + out: + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(new_hash); +} + +static void process_cached_mods(const char *mod_name) +{ + struct trace_array *tr; + char *mod; + + mod = kstrdup(mod_name, GFP_KERNEL); + if (!mod) + return; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!list_empty(&tr->mod_trace)) + process_mod_list(&tr->mod_trace, tr->ops, mod, true); + if (!list_empty(&tr->mod_notrace)) + process_mod_list(&tr->mod_notrace, tr->ops, mod, false); + } + mutex_unlock(&trace_types_lock); + + kfree(mod); +} +#endif /* * We register the module command as a template to show others how @@ -3646,11 +4089,17 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) */ static int -ftrace_mod_callback(struct ftrace_hash *hash, - char *func, char *cmd, char *module, int enable) +ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *func_orig, char *cmd, char *module, int 
enable) { + char *func; int ret; + /* match_records() modifies func, and we need the original */ + func = kstrdup(func_orig, GFP_KERNEL); + if (!func) + return -ENOMEM; + /* * cmd == 'mod' because we only registered this func * for the 'mod' ftrace_func_command. @@ -3659,8 +4108,10 @@ ftrace_mod_callback(struct ftrace_hash *hash, * parameter. */ ret = match_records(hash, func, strlen(func), module); + kfree(func); + if (!ret) - return -EINVAL; + return cache_mod(tr, func_orig, module, enable); if (ret < 0) return ret; return 0; @@ -3680,16 +4131,11 @@ core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - unsigned long key; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; - key = hash_long(ip, FTRACE_HASH_BITS); - - hhd = &ftrace_func_hash[key]; - - if (hlist_empty(hhd)) - return; + probe = container_of(op, struct ftrace_func_probe, ops); + probe_ops = probe->probe_ops; /* * Disable preemption for these calls to prevent a RCU grace @@ -3697,210 +4143,340 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. 
*/ preempt_disable_notrace(); - hlist_for_each_entry_rcu_notrace(entry, hhd, node) { - if (entry->ip == ip) - entry->ops->func(ip, parent_ip, &entry->data); - } + probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data); preempt_enable_notrace(); } -static struct ftrace_ops trace_probe_ops __read_mostly = -{ - .func = function_trace_probe_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(trace_probe_ops) +struct ftrace_func_map { + struct ftrace_func_entry entry; + void *data; }; -static int ftrace_probe_registered; +struct ftrace_func_mapper { + struct ftrace_hash hash; +}; -static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) +/** + * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper + * + * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + */ +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) { - int ret; - int i; + struct ftrace_hash *hash; - if (ftrace_probe_registered) { - /* still need to update the function call sites */ - if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, - old_hash); - return; - } + /* + * The mapper is simply a ftrace_hash, but since the entries + * in the hash are not ftrace_func_entry type, we define it + * as a separate structure. + */ + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + return (struct ftrace_func_mapper *)hash; +} - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - break; - } - /* Nothing registered? */ - if (i == FTRACE_FUNC_HASHSIZE) - return; +/** + * ftrace_func_mapper_find_ip - Find some data mapped to an ip + * @mapper: The mapper that has the ip maps + * @ip: the instruction pointer to find the data for + * + * Returns the data mapped to @ip if found otherwise NULL. The return + * is actually the address of the mapper data pointer. 
The address is + * returned for use cases where the data is no bigger than a long, and + * the user can use the data pointer as its data instead of having to + * allocate more memory for the reference. + */ +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; - ret = ftrace_startup(&trace_probe_ops, 0); + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; - ftrace_probe_registered = 1; + map = (struct ftrace_func_map *)entry; + return &map->data; } -static void __disable_ftrace_function_probe(void) +/** + * ftrace_func_mapper_add_ip - Map some data to an ip + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to map @data to + * @data: The data to map to @ip + * + * Returns 0 on success, otherwise an error. + */ +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data) { - int i; + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; - if (!ftrace_probe_registered) - return; + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (entry) + return -EBUSY; - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - return; - } + map = kmalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; - /* no more funcs left */ - ftrace_shutdown(&trace_probe_ops, 0); + map->entry.ip = ip; + map->data = data; - ftrace_probe_registered = 0; -} + __add_hash_entry(&mapper->hash, &map->entry); + return 0; +} -static void ftrace_free_entry(struct ftrace_func_probe *entry) +/** + * ftrace_func_mapper_remove_ip - Remove an ip from the mapping + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to remove the data from + * + * Returns the data if it is found, otherwise NULL. 
+ * Note, if the data pointer is used as the data itself, (see + * ftrace_func_mapper_find_ip(), then the return value may be meaningless, + * if the data pointer was set to zero. + */ +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) { - if (entry->ops->free) - entry->ops->free(entry->ops, entry->ip, &entry->data); + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + void *data; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; + + map = (struct ftrace_func_map *)entry; + data = map->data; + + remove_hash_entry(&mapper->hash, entry); kfree(entry); + + return data; +} + +/** + * free_ftrace_func_mapper - free a mapping of ips and data + * @mapper: The mapper that has the ip maps + * @free_func: A function to be called on each data item. + * + * This is used to free the function mapper. The @free_func is optional + * and can be used if the data needs to be freed as well. + */ +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + struct hlist_head *hhd; + int size = 1 << mapper->hash.size_bits; + int i; + + if (free_func && mapper->hash.count) { + for (i = 0; i < size; i++) { + hhd = &mapper->hash.buckets[i]; + hlist_for_each_entry(entry, hhd, hlist) { + map = (struct ftrace_func_map *)entry; + free_func(map); + } + } + } + free_ftrace_hash(&mapper->hash); +} + +static void release_probe(struct ftrace_func_probe *probe) +{ + struct ftrace_probe_ops *probe_ops; + + mutex_lock(&ftrace_lock); + + WARN_ON(probe->ref <= 0); + + /* Subtract the ref that was used to protect this instance */ + probe->ref--; + + if (!probe->ref) { + probe_ops = probe->probe_ops; + /* + * Sending zero as ip tells probe_ops to free + * the probe->data itself + */ + if (probe_ops->free) + probe_ops->free(probe_ops, probe->tr, 0, probe->data); + list_del(&probe->list); + kfree(probe); + } + 
mutex_unlock(&ftrace_lock); +} + +static void acquire_probe_locked(struct ftrace_func_probe *probe) +{ + /* + * Add one ref to keep it from being freed when releasing the + * ftrace_lock mutex. + */ + probe->ref++; } int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops, + void *data) { - struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_probe *entry; - struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int not; - unsigned long key; int count = 0; + int size; int ret; + int i; - func_g.type = filter_parse_regex(glob, strlen(glob), - &func_g.search, &not); - func_g.len = strlen(func_g.search); - - /* we do not support '!' for function probes */ - if (WARN_ON(not)) + if (WARN_ON(!tr)) return -EINVAL; - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + /* We do not support '!' 
for function probes */ + if (WARN_ON(glob[0] == '!')) + return -EINVAL; - old_hash_ops.filter_hash = old_hash; - /* Probes only have filters */ - old_hash_ops.notrace_hash = NULL; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - if (!hash) { - count = -ENOMEM; - goto out; + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; } - - if (unlikely(ftrace_disabled)) { - count = -ENODEV; - goto out; + if (&probe->list == &tr->func_probes) { + probe = kzalloc(sizeof(*probe), GFP_KERNEL); + if (!probe) { + mutex_unlock(&ftrace_lock); + return -ENOMEM; + } + probe->probe_ops = probe_ops; + probe->ops.func = function_trace_probe_call; + probe->tr = tr; + ftrace_ops_init(&probe->ops); + list_add(&probe->list, &tr->func_probes); } - mutex_lock(&ftrace_lock); + acquire_probe_locked(probe); - do_for_each_ftrace_rec(pg, rec) { + mutex_unlock(&ftrace_lock); - if (rec->flags & FTRACE_FL_DISABLED) - continue; + mutex_lock(&probe->ops.func_hash->regex_lock); - if (!ftrace_match_record(rec, &func_g, NULL, 0)) - continue; + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - /* If we did not process any, then return error */ - if (!count) - count = -ENOMEM; - goto out_unlock; - } + ret = ftrace_match_records(hash, glob, strlen(glob)); - count++; + /* Nothing found? */ + if (!ret) + ret = -EINVAL; - entry->data = data; + if (ret < 0) + goto out; - /* - * The caller might want to do something special - * for each function we find. We call the callback - * to give the caller an opportunity to do so. 
- */ - if (ops->init) { - if (ops->init(ops, rec->ip, &entry->data) < 0) { - /* caller does not like this func */ - kfree(entry); + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) continue; + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. + */ + if (probe_ops->init) { + ret = probe_ops->init(probe_ops, tr, + entry->ip, data, + &probe->data); + if (ret < 0) { + if (probe_ops->free && count) + probe_ops->free(probe_ops, tr, + 0, probe->data); + probe->data = NULL; + goto out; + } } + count++; } + } - ret = enter_record(hash, rec, 0); - if (ret < 0) { - kfree(entry); - count = ret; - goto out_unlock; - } - - entry->ops = ops; - entry->ip = rec->ip; - - key = hash_long(entry->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + mutex_lock(&ftrace_lock); - } while_for_each_ftrace_rec(); + if (!count) { + /* Nothing was added? 
*/ + ret = -EINVAL; + goto out_unlock; + } - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + if (ret < 0) + goto err_unlock; - __enable_ftrace_function_probe(&old_hash_ops); + /* One ref for each new function traced */ + probe->ref += count; - if (!ret) - free_ftrace_hash_rcu(old_hash); - else - count = ret; + if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED)) + ret = ftrace_startup(&probe->ops, 0); out_unlock: mutex_unlock(&ftrace_lock); + + if (!ret) + ret = count; out: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); - return count; -} + release_probe(probe); -enum { - PROBE_TEST_FUNC = 1, - PROBE_TEST_DATA = 2 -}; + return ret; -static void -__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data, int flags) + err_unlock: + if (!probe_ops->free || !count) + goto out_unlock; + + /* Failed to do the move, need to call the free functions */ + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + } + } + goto out_unlock; +} + +int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops) { - struct ftrace_func_entry *rec_entry; - struct ftrace_func_probe *entry; - struct ftrace_func_probe *p; + struct ftrace_ops_hash old_hash_ops; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; - struct list_head free_list; - struct ftrace_hash *hash; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; + struct ftrace_hash *hash = NULL; struct hlist_node *tmp; + struct hlist_head hhd; char 
str[KSYM_SYMBOL_LEN]; - int i, ret; + int count = 0; + int i, ret = -ENODEV; + int size; - if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) + if (!glob || !strlen(glob) || !strcmp(glob, "*")) func_g.search = NULL; - else if (glob) { + else { int not; func_g.type = filter_parse_regex(glob, strlen(glob), @@ -3910,86 +4486,112 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, /* we do not support '!' for function probes */ if (WARN_ON(not)) - return; + return -EINVAL; } - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; + } + if (&probe->list == &tr->func_probes) + goto err_unlock_ftrace; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); - if (!hash) - /* Hmm, should report this somehow */ - goto out_unlock; + ret = -EINVAL; + if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED)) + goto err_unlock_ftrace; - INIT_LIST_HEAD(&free_list); + acquire_probe_locked(probe); - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; + mutex_unlock(&ftrace_lock); - hlist_for_each_entry_safe(entry, tmp, hhd, node) { + mutex_lock(&probe->ops.func_hash->regex_lock); - /* break up if statements for readability */ - if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) - continue; + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; - if ((flags & PROBE_TEST_DATA) && entry->data != data) - continue; + if (ftrace_hash_empty(old_hash)) + goto out_unlock; + + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + + ret = -ENOMEM; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) + goto out_unlock; + + INIT_HLIST_HEAD(&hhd); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + 
hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); if (!ftrace_match(str, &func_g)) continue; } - - rec_entry = ftrace_lookup_ip(hash, entry->ip); - /* It is possible more than one entry had this ip */ - if (rec_entry) - free_hash_entry(hash, rec_entry); - - hlist_del_rcu(&entry->node); - list_add(&entry->free_list, &free_list); + count++; + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); } } + + /* Nothing found? */ + if (!count) { + ret = -EINVAL; + goto out_unlock; + } + mutex_lock(&ftrace_lock); - __disable_ftrace_function_probe(); - /* - * Remove after the disable is called. Otherwise, if the last - * probe is removed, a null hash means *all enabled*. - */ - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + WARN_ON(probe->ref < count); + + probe->ref -= count; + + if (ftrace_hash_empty(hash)) + ftrace_shutdown(&probe->ops, 0); + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + + /* still need to update the function call sites */ + if (ftrace_enabled && !ftrace_hash_empty(hash)) + ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, + &old_hash_ops); synchronize_sched(); - if (!ret) - free_ftrace_hash_rcu(old_hash); - list_for_each_entry_safe(entry, p, &free_list, free_list) { - list_del(&entry->free_list); - ftrace_free_entry(entry); + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (probe_ops->free) + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + kfree(entry); } mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); -} -void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) -{ - __unregister_ftrace_function_probe(glob, ops, data, - PROBE_TEST_FUNC | 
PROBE_TEST_DATA); -} + release_probe(probe); -void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) -{ - __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); + return ret; + + err_unlock_ftrace: + mutex_unlock(&ftrace_lock); + return ret; } -void unregister_ftrace_function_probe_all(char *glob) +void clear_ftrace_function_probes(struct trace_array *tr) { - __unregister_ftrace_function_probe(glob, NULL, NULL, 0); + struct ftrace_func_probe *probe, *n; + + list_for_each_entry_safe(probe, n, &tr->func_probes, list) + unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops); } static LIST_HEAD(ftrace_commands); @@ -4041,9 +4643,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(struct ftrace_hash *hash, +static int ftrace_process_regex(struct ftrace_iterator *iter, char *buff, int len, int enable) { + struct ftrace_hash *hash = iter->hash; + struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; int ret = -EINVAL; @@ -4066,7 +4670,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(hash, func, command, next, enable); + ret = p->func(tr, hash, func, command, next, enable); goto out_unlock; } } @@ -4103,7 +4707,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(iter->hash, parser->buffer, + ret = ftrace_process_regex(iter, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) @@ -4148,44 +4752,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops, - struct ftrace_ops_hash *old_hash) 
-{ - struct ftrace_ops *op; - - if (!ftrace_enabled) - return; - - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); - return; - } - - /* - * If this is the shared global_ops filter, then we need to - * check if there is another ops that shares it, is enabled. - * If so, we still need to run the modify code. - */ - if (ops->func_hash != &global_ops.local_hash) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op->func_hash == &global_ops.local_hash && - op->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); - /* Only need to do this once */ - return; - } - } while_for_each_ftrace_op(op); -} - static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; - struct ftrace_ops_hash old_hash_ops; - struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -4220,14 +4791,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; - ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) { - ftrace_ops_update_code(ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); mutex_unlock(&ftrace_lock); out_regex_unlock: @@ -4382,7 +4946,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static unsigned long save_global_trampoline; static unsigned long save_global_flags; 
@@ -4401,26 +4965,38 @@ static int __init set_graph_notrace_function(char *str) } __setup("ftrace_graph_notrace=", set_graph_notrace_function); +static int __init set_graph_max_depth_function(char *str) +{ + if (!str) + return 0; + fgraph_max_depth = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("ftrace_graph_max_depth=", set_graph_max_depth_function); + static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; - unsigned long *table = ftrace_graph_funcs; - int *count = &ftrace_graph_count; + struct ftrace_hash *hash; - if (!enable) { - table = ftrace_graph_notrace_funcs; - count = &ftrace_graph_notrace_count; - } + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (WARN_ON(!hash)) + return; while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_graph_set_hash(hash, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); } + + if (enable) + ftrace_graph_hash = hash; + else + ftrace_graph_notrace_hash = hash; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4454,10 +5030,8 @@ static void __init set_ftrace_early_filters(void) int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; - struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -4481,22 +5055,20 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); - if (filter_hash) + if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - else + if (iter->tr && !list_empty(&iter->tr->mod_trace)) + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } else orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - 
old_hash = *orig_hash; - old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; - ret = ftrace_hash_move(iter->ops, filter_hash, - orig_hash, iter->hash); - if (!ret) { - ftrace_ops_update_code(iter->ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + iter->hash, filter_hash); mutex_unlock(&ftrace_lock); + } else { + /* For read only, the hash is the ops hash */ + iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); @@ -4540,26 +5112,55 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); -int ftrace_graph_count; -int ftrace_graph_notrace_count; -unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; -unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; + +enum graph_filter_type { + GRAPH_FILTER_NOTRACE = 0, + GRAPH_FILTER_FUNCTION, +}; + +#define FTRACE_GRAPH_EMPTY ((void *)1) struct ftrace_graph_data { - unsigned long *table; - size_t size; - int *count; - const struct seq_operations *seq_ops; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; + const struct seq_operations *seq_ops; + struct trace_parser parser; }; static void * __g_next(struct seq_file *m, loff_t *pos) { struct ftrace_graph_data *fgd = m->private; + struct ftrace_func_entry *entry = fgd->entry; + struct hlist_head *head; + int i, idx = fgd->idx; - if (*pos >= *fgd->count) + if (*pos >= fgd->hash->count) return NULL; - return &fgd->table[*pos]; + + if (entry) { + hlist_for_each_entry_continue(entry, hlist) { + fgd->entry = entry; + return entry; + } + + idx++; + } + + for (i = idx; i < 1 << fgd->hash->size_bits; i++) { + head 
= &fgd->hash->buckets[i]; + hlist_for_each_entry(entry, head, hlist) { + fgd->entry = entry; + fgd->idx = i; + return entry; + } + } + return NULL; } static void * @@ -4575,10 +5176,19 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + if (fgd->type == GRAPH_FILTER_FUNCTION) + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + else + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + /* Nothing, tell g_show to print all functions are enabled */ - if (!*fgd->count && !*pos) - return (void *)1; + if (ftrace_hash_empty(fgd->hash) && !*pos) + return FTRACE_GRAPH_EMPTY; + fgd->idx = 0; + fgd->entry = NULL; return __g_next(m, pos); } @@ -4589,22 +5199,22 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { - unsigned long *ptr = v; + struct ftrace_func_entry *entry = v; - if (!ptr) + if (!entry) return 0; - if (ptr == (unsigned long *)1) { + if (entry == FTRACE_GRAPH_EMPTY) { struct ftrace_graph_data *fgd = m->private; - if (fgd->table == ftrace_graph_funcs) + if (fgd->type == GRAPH_FILTER_FUNCTION) seq_puts(m, "#### all functions enabled ####\n"); else seq_puts(m, "#### no functions disabled ####\n"); return 0; } - seq_printf(m, "%ps\n", (void *)*ptr); + seq_printf(m, "%ps\n", (void *)entry->ip); return 0; } @@ -4621,24 +5231,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { int ret = 0; + struct ftrace_hash *new_hash = NULL; - mutex_lock(&graph_lock); - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - *fgd->count = 0; - memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + + if (file->f_flags & O_TRUNC) + new_hash = alloc_ftrace_hash(size_bits); + else + new_hash = 
alloc_and_copy_ftrace_hash(size_bits, + fgd->hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } } - mutex_unlock(&graph_lock); if (file->f_mode & FMODE_READ) { - ret = seq_open(file, fgd->seq_ops); + ret = seq_open(file, &ftrace_graph_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = fgd; + } else { + /* Failed */ + free_ftrace_hash(new_hash); + new_hash = NULL; } } else file->private_data = fgd; +out: + if (ret < 0 && file->f_mode & FMODE_WRITE) + trace_parser_put(&fgd->parser); + + fgd->new_hash = new_hash; + + /* + * All uses of fgd->hash must be taken with the graph_lock + * held. The graph_lock is going to be released, so force + * fgd->hash to be reinitialized when it is taken again. + */ + fgd->hash = NULL; + return ret; } @@ -4646,6 +5283,7 @@ static int ftrace_graph_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4654,18 +5292,26 @@ ftrace_graph_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_count; + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_notrace_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4674,45 +5320,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_notrace_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_notrace_count; + mutex_lock(&graph_lock); + + 
fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_release(struct inode *inode, struct file *file) { + struct ftrace_graph_data *fgd; + struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; + int ret = 0; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; - kfree(m->private); + fgd = m->private; seq_release(inode, file); } else { - kfree(file->private_data); + fgd = file->private_data; } - return 0; + + if (file->f_mode & FMODE_WRITE) { + + parser = &fgd->parser; + + if (trace_parser_loaded((parser))) { + parser->buffer[parser->idx] = 0; + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + } + + trace_parser_put(parser); + + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_hash, new_hash); + } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + } + + mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); + } + + out: + free_ftrace_hash(fgd->new_hash); + kfree(fgd); + + return ret; } static int -ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) +ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) { struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page *pg; + struct ftrace_func_entry *entry; int fail = 1; int 
not; - bool exists; - int i; /* decode regex */ func_g.type = filter_parse_regex(buffer, strlen(buffer), &func_g.search, &not); - if (!not && *idx >= size) - return -EBUSY; func_g.len = strlen(func_g.search); @@ -4729,26 +5427,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) continue; if (ftrace_match_record(rec, &func_g, NULL, 0)) { - /* if it is in the array */ - exists = false; - for (i = 0; i < *idx; i++) { - if (array[i] == rec->ip) { - exists = true; - break; - } - } + entry = ftrace_lookup_ip(hash, rec->ip); if (!not) { fail = 0; - if (!exists) { - array[(*idx)++] = rec->ip; - if (*idx >= size) - goto out; - } + + if (entry) + continue; + if (add_hash_entry(hash, rec->ip) < 0) + goto out; } else { - if (exists) { - array[i] = array[--(*idx)]; - array[*idx] = 0; + if (entry) { + free_hash_entry(hash, entry); fail = 0; } } @@ -4767,35 +5457,34 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; + struct trace_parser *parser; if (!cnt) return 0; - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) - return -ENOMEM; - - read = trace_get_user(&parser, ubuf, cnt, ppos); + /* Read mode uses seq functions */ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + fgd = m->private; + } - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 0; + parser = &fgd->parser; - mutex_lock(&graph_lock); + read = trace_get_user(parser, ubuf, cnt, ppos); - /* we allow only one expression at a time */ - ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, - parser.buffer); + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { - mutex_unlock(&graph_lock); + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + trace_parser_clear(parser); } if (!ret) ret = read; - trace_parser_put(&parser); - return 
ret; } @@ -5028,6 +5717,7 @@ void ftrace_release_mod(struct module *mod) if (pg == ftrace_pages) ftrace_pages = next_to_ftrace_page(last_pg); + ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); @@ -5106,6 +5796,8 @@ void ftrace_module_enable(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + + process_cached_mods(mod->name); } void ftrace_module_init(struct module *mod) @@ -5118,6 +5810,51 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ +void __init ftrace_free_init_mem(void) +{ + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); + struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + int order; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + mutex_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + again: + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (!rec) + continue; + pg->index--; + ftrace_update_tot_cnt--; + if (!pg->index) { + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + pg = container_of(last_pg, struct ftrace_page, next); + if (!(*last_pg)) + ftrace_pages = pg; + continue; + } + memmove(rec, rec + 1, + (pg->index - (rec - pg->records)) * sizeof(*rec)); + /* More than one function may be in this block */ + goto again; + } + mutex_unlock(&ftrace_lock); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5160,25 +5897,15 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void 
ftrace_update_trampoline(struct ftrace_ops *ops) { - -/* - * Currently there's no safe way to free a trampoline when the kernel - * is configured with PREEMPT. That is because a task could be preempted - * when it jumped to the trampoline, it may be preempted for a long time - * depending on the system load, and currently there's no way to know - * when it will be off the trampoline. If the trampoline is freed - * too early, when the task runs again, it will be executing on freed - * memory and crash. - */ -#ifdef CONFIG_PREEMPT - /* Currently, only non dynamic ops can have a trampoline */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - return; -#endif - arch_ftrace_update_trampoline(ops); } +void ftrace_init_trace_array(struct trace_array *tr) +{ + INIT_LIST_HEAD(&tr->func_probes); + INIT_LIST_HEAD(&tr->mod_trace); + INIT_LIST_HEAD(&tr->mod_notrace); +} #else static struct ftrace_ops global_ops = { @@ -5233,6 +5960,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; tr->ops->private = tr; + ftrace_init_trace_array(tr); } void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) @@ -5357,7 +6085,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, * Normally the mcount trampoline will call the ops->func, but there * are times that it should not. For example, if the ops does not * have its own recursion protection, then it should call the - * ftrace_ops_recurs_func() instead. + * ftrace_ops_assist_func() instead. * * Returns the function that the trampoline should call for @ops. 
*/ @@ -5387,6 +6115,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, trace_ignore_this_task(pid_list, next)); } +static void +ftrace_pid_follow_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, self, task); +} + +static void +ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, NULL, task); +} + +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + tr); + } else { + unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + tr); + } +} + static void clear_ftrace_pids(struct trace_array *tr) { struct trace_pid_list *pid_list; @@ -5410,6 +6175,15 @@ static void clear_ftrace_pids(struct trace_array *tr) trace_free_pid_list(pid_list); } +void ftrace_clear_pids(struct trace_array *tr) +{ + mutex_lock(&ftrace_lock); + + clear_ftrace_pids(tr); + + mutex_unlock(&ftrace_lock); +} + static void ftrace_pid_reset(struct trace_array *tr) { mutex_lock(&ftrace_lock); @@ -5691,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 
a85739efcc30..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6,6 +6,7 @@ #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> +#include <linux/sched/clock.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> @@ -437,6 +438,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; @@ -1134,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page *page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. 
*/ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1147,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -3404,11 +3406,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + struct buffer_page *head_page; + struct buffer_page *commit_page; + unsigned commit; cpu_buffer = iter->cpu_buffer; - return iter->head_page == cpu_buffer->commit_page && - iter->head == rb_commit_index(cpu_buffer); + /* Remember, trace recording is off when iterator is in use */ + reader = cpu_buffer->reader_page; + head_page = cpu_buffer->head_page; + commit_page = cpu_buffer->commit_page; + commit = rb_page_commit(commit_page); + + return ((iter->head_page == commit_page && iter->head == commit) || + (iter->head_page == reader && commit_page == head_page && + head_page->read == commit && + iter->head == rb_page_commit(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -4376,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct buffer_data_page *bpage; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = NULL; + unsigned long flags; struct page *page; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + page = 
alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) @@ -4386,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) bpage = page_address(page); + out: rb_init_page(bpage); return bpage; @@ -4395,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from * @data: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - free_page((unsigned long)data); + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = data; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -4825,9 +4872,9 @@ static __init int test_ringbuffer(void) rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], "rbtester/%d", cpu); - if (WARN_ON(!rb_threads[cpu])) { + if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); - ret = -1; + ret = PTR_ERR(rb_threads[cpu]); goto out_free; } @@ -4837,9 +4884,9 @@ static __init int test_ringbuffer(void) /* Now create the rb hammer! 
*/ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); - if (WARN_ON(!rb_hammer)) { + if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); - ret = -1; + ret = PTR_ERR(rb_hammer); goto out_free; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 6df9a83e20d7..9fbcaf567886 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -6,6 +6,7 @@ #include <linux/ring_buffer.h> #include <linux/completion.h> #include <linux/kthread.h> +#include <uapi/linux/sched/types.h> #include <linux/module.h> #include <linux/ktime.h> #include <asm/local.h> @@ -170,7 +171,7 @@ static enum event_status read_page(int cpu) } } } - ring_buffer_free_read_page(buffer, bpage); + ring_buffer_free_read_page(buffer, cpu, bpage); if (ret < 0) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_cmdline_save); +static DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). 
@@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE -/* Map of enums to their values, for "enum_map" file */ -struct trace_enum_map_head { +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +/* Map of enums to their values, for "eval_map" file */ +struct trace_eval_map_head { struct module *mod; unsigned long length; }; -union trace_enum_map_item; +union trace_eval_map_item; -struct trace_enum_map_tail { +struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different - * than "mod" or "enum_string" + * than "mod" or "eval_string" */ - union trace_enum_map_item *next; + union trace_eval_map_item *next; const char *end; /* points to NULL */ }; -static DEFINE_MUTEX(trace_enum_mutex); +static DEFINE_MUTEX(trace_eval_mutex); /* - * The trace_enum_maps are saved in an array with two extra elements, + * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_map items. + * pointer to the next array of saved eval_map items. 
*/ -union trace_enum_map_item { - struct trace_enum_map map; - struct trace_enum_map_head head; - struct trace_enum_map_tail tail; +union trace_eval_map_item { + struct trace_eval_map map; + struct trace_eval_map_head head; + struct trace_eval_map_tail tail; }; -static union trace_enum_map_item *trace_enum_maps; -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +static union trace_eval_map_item *trace_eval_maps; +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -257,19 +257,11 @@ unsigned long long ns2usecs(u64 nsec) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - TRACE_ITER_EVENT_FORK + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) /* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. Each CPU has the same number - * of pages allocated for its buffer. + * The global_trace is the descriptor that holds the top-level tracing + * buffers for the live tracing. 
*/ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, @@ -765,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } -static void tracer_tracing_on(struct trace_array *tr) +void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_on(tr->trace_buffer.buffer); @@ -798,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { - __this_cpu_write(trace_cmdline_save, true); + __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { @@ -902,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -/** - * trace_snapshot - take a snapshot of the current buffer. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. - * - * Note, make sure to allocate the snapshot with either - * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot - * - * If the snapshot buffer is not allocated, it will stop tracing. - * Basically making a permanent snapshot. - */ -void tracing_snapshot(void) +static void tracing_snapshot_instance(struct trace_array *tr) { - struct trace_array *tr = &global_trace; struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -946,6 +923,27 @@ void tracing_snapshot(void) update_max_tr(tr, current, smp_processor_id()); local_irq_restore(flags); } + +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. 
You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + + tracing_snapshot_instance(tr); +} EXPORT_SYMBOL_GPL(tracing_snapshot); static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, @@ -1047,7 +1045,7 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ -static void tracer_tracing_off(struct trace_array *tr) +void tracer_tracing_off(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_off(tr->trace_buffer.buffer); @@ -1143,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. - * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list - * of strings in the order that the enums were defined. + * of strings in the order that the evals (enum) were defined. 
*/ #undef C #define C(a, b) b @@ -1193,6 +1191,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size) void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); + parser->buffer = NULL; } /* @@ -1431,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full) } #ifdef CONFIG_FTRACE_STARTUP_TEST +static bool selftests_can_run; + +struct trace_selftests { + struct list_head list; + struct tracer *type; +}; + +static LIST_HEAD(postponed_selftests); + +static int save_selftest(struct tracer *type) +{ + struct trace_selftests *selftest; + + selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); + if (!selftest) + return -ENOMEM; + + selftest->type = type; + list_add(&selftest->list, &postponed_selftests); + return 0; +} + static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; @@ -1441,6 +1462,14 @@ static int run_tracer_selftest(struct tracer *type) return 0; /* + * If a tracer registers early in boot up (before scheduling is + * initialized and such), then do not run its selftests yet. + * Instead, run it a little later in the boot process. + */ + if (!selftests_can_run) + return save_selftest(type); + + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. 
The tracer can then run some @@ -1489,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type) printk(KERN_CONT "PASSED\n"); return 0; } + +static __init int init_trace_selftests(void) +{ + struct trace_selftests *p, *n; + struct tracer *t, **last; + int ret; + + selftests_can_run = true; + + mutex_lock(&trace_types_lock); + + if (list_empty(&postponed_selftests)) + goto out; + + pr_info("Running postponed tracer tests:\n"); + + list_for_each_entry_safe(p, n, &postponed_selftests, list) { + ret = run_tracer_selftest(p->type); + /* If the test fails, then warn and remove from available_tracers */ + if (ret < 0) { + WARN(1, "tracer: %s failed selftest, disabling\n", + p->type->name); + last = &trace_types; + for (t = trace_types; t; t = t->next) { + if (t == p->type) { + *last = t->next; + break; + } + last = &t->next; + } + } + list_del(&p->list); + kfree(p); + } + + out: + mutex_unlock(&trace_types_lock); + + return 0; +} +core_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { @@ -1639,6 +1709,8 @@ void tracing_reset_all_online_cpus(void) } } +static int *tgid_map; + #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -1652,7 +1724,7 @@ struct saved_cmdlines_buffer { static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ -static atomic_t trace_record_cmdline_disabled __read_mostly; +static atomic_t trace_record_taskinfo_disabled __read_mostly; static inline char *get_saved_cmdlines(int idx) { @@ -1840,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr) raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -void trace_stop_cmdline_recording(void); - static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if 
(unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; /* @@ -1906,7 +1980,7 @@ static void __trace_find_cmdline(int pid, char comm[]) map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, get_saved_cmdlines(map)); + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); else strcpy(comm, "<...>"); } @@ -1922,18 +1996,121 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -void tracing_record_cmdline(struct task_struct *tsk) +int trace_find_tgid(int pid) +{ + if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) + return 0; + + return tgid_map[pid]; +} + +static int trace_save_tgid(struct task_struct *tsk) +{ + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) + return 0; + + tgid_map[tsk->pid] = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task - task to record + * @flags - TRACE_RECORD_CMDLINE for recording comm + * - TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. 
*/ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev - previous task during sched_switch + * @next - next task during sched_switch + * @flags - TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + bool done; + + if (tracing_record_taskinfo_skip(flags)) return; - if (!__this_cpu_read(trace_cmdline_save)) + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); + + /* If recording any information failed, retry again soon. */ + if (!done) return; - if (trace_save_cmdline(tsk)) - __this_cpu_write(trace_cmdline_save, false); + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); } +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); +} + +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? 
+ TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} +EXPORT_SYMBOL_GPL(trace_handle_return); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc) @@ -2486,7 +2663,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + if (rcu_is_watching()) { + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + return; + } + + /* + * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), + * but if the above rcu_is_watching() failed, then the NMI + * triggered someplace critical, and rcu_irq_enter() should + * not be called from NMI. + */ + if (unlikely(in_nmi())) + return; + + /* + * It is possible that a function is being traced in a + * location that RCU is not watching. A call to + * rcu_irq_enter() will make sure that it is, but there's + * a few internal rcu functions that could be traced + * where that wont work either. In those cases, we just + * do nothing. 
+ */ + if (unlikely(rcu_irq_enter_disabled())) + return; + + rcu_irq_enter_irqson(); + __ftrace_trace_stack(buffer, flags, skip, pc, NULL); + rcu_irq_exit_irqson(); } /** @@ -3035,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) #endif if (!iter->snapshot) - atomic_inc(&trace_record_cmdline_disabled); + atomic_inc(&trace_record_taskinfo_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -3080,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p) #endif if (!iter->snapshot) - atomic_dec(&trace_record_cmdline_disabled); + atomic_dec(&trace_record_taskinfo_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -3137,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { + bool tgid = flags & TRACE_ITER_RECORD_TGID; + print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" - "# | | | | |\n"); + + seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); } -static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { - print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n" - "# / _----=> need-resched\n" - "# | / _---=> hardirq/softirq\n" - "# || / _--=> preempt-depth\n" - "# ||| / delay\n" - "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" - "# | | | |||| | |\n"); + bool tgid = flags & TRACE_ITER_RECORD_TGID; + const char tgid_space[] = " "; + const char space[] = " "; + + seq_printf(m, "# %s _-----=> irqs-off\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s / _----=> need-resched\n", + tgid ? 
tgid_space : space); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s||| / delay\n", + tgid ? tgid_space : space); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + tgid ? " TGID " : space); + seq_printf(m, "# | | | %s|||| | |\n", + tgid ? " | " : space); } void @@ -3229,13 +3450,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) + if (cpumask_available(iter->started) && + cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; - if (iter->started) + if (cpumask_available(iter->started)) cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ @@ -3468,9 +3690,11 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + print_func_help_header_irq(iter->trace_buffer, + m, trace_flags); else - print_func_help_header(iter->trace_buffer, m); + print_func_help_header(iter->trace_buffer, m, + trace_flags); } } } @@ -4126,9 +4350,24 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_RECORD_TGID) { + if (!tgid_map) + tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), + GFP_KERNEL); + if (!tgid_map) { + tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + return -ENOMEM; + } + + trace_event_enable_tgid_record(enabled); + } + if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); + if (mask == TRACE_ITER_FUNC_FORK) + ftrace_pid_follow_fork(tr, enabled); + if (mask == 
TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE @@ -4348,22 +4587,24 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" + "\t Format: p[:[<group>/]<event>] <place> [<args>]\n" + "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n" "\t -:[<group>/]<event>\n" -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" + "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" #endif -#ifdef CONFIG_UPROBE_EVENT +#ifdef CONFIG_UPROBE_EVENTS "\t place: <path>:<offset>\n" #endif "\t args: <name>=fetcharg[:type]\n" @@ -4481,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { + if (trace_find_tgid(*ptr)) + return ptr; + } + + return NULL; +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + if (!tgid_map) + return NULL; + + v = &tgid_map[0]; + while (l <= *pos) { + v = saved_tgids_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void 
saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int pid = (int *)v - tgid_map; + + seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; @@ -4630,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { .write = tracing_saved_cmdlines_size_write, }; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE -static union trace_enum_map_item * -update_enum_map(union trace_enum_map_item *ptr) +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +static union trace_eval_map_item * +update_eval_map(union trace_eval_map_item *ptr) { - if (!ptr->map.enum_string) { + if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ @@ -4645,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr) return ptr; } -static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) +static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. 
*/ - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; @@ -4661,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); return ptr; } -static void *enum_map_start(struct seq_file *m, loff_t *pos) +static void *eval_map_start(struct seq_file *m, loff_t *pos) { - union trace_enum_map_item *v; + union trace_eval_map_item *v; loff_t l = 0; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); - v = trace_enum_maps; + v = trace_eval_maps; if (v) v++; while (v && l < *pos) { - v = enum_map_next(m, v, &l); + v = eval_map_next(m, v, &l); } return v; } -static void enum_map_stop(struct seq_file *m, void *v) +static void eval_map_stop(struct seq_file *m, void *v) { - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } -static int enum_map_show(struct seq_file *m, void *v) +static int eval_map_show(struct seq_file *m, void *v) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", - ptr->map.enum_string, ptr->map.enum_value, + ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; } -static const struct seq_operations tracing_enum_map_seq_ops = { - .start = enum_map_start, - .next = enum_map_next, - .stop = enum_map_stop, - .show = enum_map_show, +static const struct seq_operations tracing_eval_map_seq_ops = { + .start = eval_map_start, + .next = eval_map_next, + .stop = eval_map_stop, + .show = eval_map_show, }; -static int tracing_enum_map_open(struct inode *inode, struct file *filp) +static int tracing_eval_map_open(struct inode *inode, struct file *filp) { if (tracing_disabled) return -ENODEV; - return seq_open(filp, &tracing_enum_map_seq_ops); + return seq_open(filp, &tracing_eval_map_seq_ops); } -static const struct file_operations tracing_enum_map_fops = { - .open = tracing_enum_map_open, +static const struct file_operations 
tracing_eval_map_fops = { + .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; -static inline union trace_enum_map_item * -trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) +static inline union trace_eval_map_item * +trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; } static void -trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, +trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { - struct trace_enum_map **stop; - struct trace_enum_map **map; - union trace_enum_map_item *map_array; - union trace_enum_map_item *ptr; + struct trace_eval_map **stop; + struct trace_eval_map **map; + union trace_eval_map_item *map_array; + union trace_eval_map_item *ptr; stop = start + len; /* - * The trace_enum_maps contains the map plus a head and tail item, + * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. 
*/ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warn("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace eval mapping\n"); return; } - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); - if (!trace_enum_maps) - trace_enum_maps = map_array; + if (!trace_eval_maps) + trace_eval_maps = map_array; else { - ptr = trace_enum_maps; + ptr = trace_eval_maps; for (;;) { - ptr = trace_enum_jmp_to_tail(ptr); + ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; @@ -4776,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, } memset(map_array, 0, sizeof(*map_array)); - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } -static void trace_create_enum_file(struct dentry *d_tracer) +static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("enum_map", 0444, d_tracer, - NULL, &tracing_enum_map_fops); + trace_create_file("eval_map", 0444, d_tracer, + NULL, &tracing_eval_map_fops); } -#else /* CONFIG_TRACE_ENUM_MAP_FILE */ -static inline void trace_create_enum_file(struct dentry *d_tracer) { } -static inline void trace_insert_enum_map_file(struct module *mod, - struct trace_enum_map **start, int len) { } -#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ +#else /* CONFIG_TRACE_EVAL_MAP_FILE */ +static inline void trace_create_eval_file(struct dentry *d_tracer) { } +static inline void trace_insert_eval_map_file(struct module *mod, + struct trace_eval_map **start, int len) { } +#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_insert_enum_map(struct module *mod, - struct trace_enum_map **start, int len) +static void trace_insert_eval_map(struct module *mod, + struct trace_eval_map **start, int len) { - struct trace_enum_map **map; + struct trace_eval_map **map; if (len <= 0) return; map = start; - trace_event_enum_update(map, len); + trace_event_eval_update(map, len); - 
trace_insert_enum_map_file(mod, start, len); + trace_insert_eval_map_file(mod, start, len); } static ssize_t @@ -5536,7 +5847,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; @@ -5969,6 +6279,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct ftrace_buffer_info { struct trace_iterator iter; void *spare; + unsigned int spare_cpu; unsigned int read; }; @@ -6298,9 +6609,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif - if (!info->spare) + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); + info->spare_cpu = iter->cpu_file; + } if (!info->spare) return -ENOMEM; @@ -6360,7 +6673,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, + info->spare_cpu, info->spare); kfree(info); mutex_unlock(&trace_types_lock); @@ -6371,6 +6685,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct buffer_ref { struct ring_buffer *buffer; void *page; + int cpu; int ref; }; @@ -6382,7 +6697,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); buf->private = 0; } @@ -6416,7 +6731,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); spd->partial[i].private = 0; } @@ -6434,7 +6749,6 @@ 
tracing_buffers_splice_read(struct file *file, loff_t *ppos, .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; @@ -6481,11 +6795,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, kfree(ref); break; } + ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; } @@ -6618,33 +6934,18 @@ static const struct file_operations tracing_stats_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -int __weak ftrace_arch_read_dyn_info(char *buf, int size) -{ - return 0; -} - static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - static char ftrace_dyn_info_buffer[1024]; - static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; - char *buf = ftrace_dyn_info_buffer; - int size = ARRAY_SIZE(ftrace_dyn_info_buffer); + char buf[64]; /* Not too big for a shallow stack */ int r; - mutex_lock(&dyn_info_mutex); - r = sprintf(buf, "%ld ", *p); - - r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + r = scnprintf(buf, 63, "%ld", *p); buf[r++] = '\n'; - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - mutex_unlock(&dyn_info_mutex); - - return r; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static const struct file_operations tracing_dyn_info_fops = { @@ -6656,43 +6957,89 @@ static const struct file_operations tracing_dyn_info_fops = { #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - tracing_snapshot(); + tracing_snapshot_instance(tr); } static void 
-ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; - if (*count != -1) (*count)--; + } - tracing_snapshot(); + tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, @@ -6701,10 +7048,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = { static struct ftrace_probe_ops 
snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, }; static int -ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -6712,16 +7061,17 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, char *number; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; @@ -6740,11 +7090,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = alloc_snapshot(tr); + if (ret < 0) + goto out; - if (ret >= 0) - alloc_snapshot(&global_trace); + ret = register_ftrace_function_probe(glob, tr, ops, count); + out: return ret < 0 ? 
ret : 0; } @@ -7353,6 +7705,8 @@ static int instance_mkdir(const char *name) goto out_free_tr; } + ftrace_init_trace_array(tr); + init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); __update_tracer_options(tr); @@ -7408,7 +7762,9 @@ static int instance_rmdir(const char *name) } tracing_set_nop(tr); + clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); + ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); @@ -7418,6 +7774,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -7503,7 +7860,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7516,7 +7873,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; @@ -7561,21 +7918,21 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_enum_maps[]; -extern struct trace_enum_map *__stop_ftrace_enum_maps[]; +extern struct trace_eval_map *__start_ftrace_eval_maps[]; +extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_enum_init(void) +static void __init trace_eval_init(void) { int len; - len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; - trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); + len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; + trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -static void trace_module_add_enums(struct module *mod) +static void 
trace_module_add_evals(struct module *mod) { - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; /* @@ -7585,40 +7942,40 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); + trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } -#ifdef CONFIG_TRACE_ENUM_MAP_FILE -static void trace_module_remove_enums(struct module *mod) +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +static void trace_module_remove_evals(struct module *mod) { - union trace_enum_map_item *map; - union trace_enum_map_item **last = &trace_enum_maps; + union trace_eval_map_item *map; + union trace_eval_map_item **last = &trace_eval_maps; - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); - map = trace_enum_maps; + map = trace_eval_maps; while (map) { if (map->head.mod == mod) break; - map = trace_enum_jmp_to_tail(map); + map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) goto out; - *last = trace_enum_jmp_to_tail(map)->tail.next; + *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); out: - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } #else -static inline void trace_module_remove_enums(struct module *mod) { } -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +static inline void trace_module_remove_evals(struct module *mod) { } +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -7627,10 +7984,10 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - trace_module_add_enums(mod); + trace_module_add_evals(mod); break; case MODULE_STATE_GOING: - trace_module_remove_enums(mod); + trace_module_remove_evals(mod); break; } @@ -7668,9 +8025,12 @@ static __init int tracer_init_tracefs(void) 
trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); - trace_enum_init(); + trace_create_file("saved_tgids", 0444, d_tracer, + NULL, &tracing_saved_tgids_fops); - trace_create_enum_file(d_tracer); + trace_eval_init(); + + trace_create_eval_file(d_tracer); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); @@ -7972,6 +8332,9 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -8006,7 +8369,7 @@ out: return ret; } -void __init trace_init(void) +void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = @@ -8017,6 +8380,10 @@ void __init trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); +} + +void __init trace_init(void) +{ trace_event_init(); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ea51ab53edf..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -262,6 +262,12 @@ struct trace_array { #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; +#ifdef CONFIG_DYNAMIC_FTRACE + /* All of these are protected by the ftrace_lock */ + struct list_head func_probes; + struct list_head mod_trace; + struct list_head mod_notrace; +#endif /* function tracing enabled */ int function_enabled; #endif @@ -579,6 +585,8 @@ void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); int tracer_tracing_is_on(struct trace_array *tr); +void tracer_tracing_on(struct trace_array *tr); +void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -632,6 +640,9 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); 
void tracing_stop_cmdline_record(void); +void tracing_start_tgid_record(void); +void tracing_stop_tgid_record(void); + int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -692,10 +703,14 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +void ftrace_init_trace_array(struct trace_array *tr); +#else +static inline void ftrace_init_trace_array(struct trace_array *tr) { } #endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); @@ -753,6 +768,35 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash; + +struct ftrace_mod_load { + struct list_head list; + char *func; + char *module; + int enable; +}; + +enum { + FTRACE_HASH_FL_MOD = (1 << 0), +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + unsigned long flags; + struct rcu_head rcu; +}; + +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); + +static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); +} + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -787,53 +831,50 @@ extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc); - #ifdef CONFIG_DYNAMIC_FTRACE -/* TODO: make this variable */ -#define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_count; -extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; -extern int 
ftrace_graph_notrace_count; -extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern struct ftrace_hash *ftrace_graph_hash; +extern struct ftrace_hash *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(unsigned long addr) { - int i; - - if (!ftrace_graph_count) - return 1; - - for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) { - /* - * If no irqs are to be traced, but a set_graph_function - * is set, and called by an interrupt handler, we still - * want to trace it. - */ - if (in_irq()) - trace_recursion_set(TRACE_IRQ_BIT); - else - trace_recursion_clear(TRACE_IRQ_BIT); - return 1; - } + int ret = 0; + + preempt_disable_notrace(); + + if (ftrace_hash_empty(ftrace_graph_hash)) { + ret = 1; + goto out; } - return 0; + if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + ret = 1; + } + +out: + preempt_enable_notrace(); + return ret; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { - int i; + int ret = 0; - if (!ftrace_graph_notrace_count) - return 0; + preempt_disable_notrace(); - for (i = 0; i < ftrace_graph_notrace_count; i++) { - if (addr == ftrace_graph_notrace_funcs[i]) - return 1; - } + if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr)) + ret = 1; - return 0; + preempt_enable_notrace(); + return ret; } #else static inline int ftrace_graph_addr(unsigned long addr) @@ -868,6 +909,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct trace_array *tr, + struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; extern bool 
ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { @@ -884,6 +933,9 @@ int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_clear_pids(struct trace_array *tr); +int init_function_trace(void); +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -902,15 +954,76 @@ ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_clear_pids(struct trace_array *tr) { } +static inline int init_function_trace(void) { return 0; } +static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + struct trace_array *tr, + struct ftrace_probe_ops *ops, + void *data); + int (*init)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *init_data, + void **data); + void (*free)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +struct ftrace_func_mapper; +typedef int (*ftrace_mapper_func)(void *data); + +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); +void **ftrace_func_mapper_find_ip(struct 
ftrace_func_mapper *mapper, + unsigned long ip); +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data); +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func); + +extern int +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops, void *data); +extern int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops); +extern void clear_ftrace_function_probes(struct trace_array *tr); + +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); #else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} +static inline void clear_ftrace_function_probes(struct trace_array *tr) +{ +} + /* * The ops parameter passed in is usually undefined. * This must be a macro. 
@@ -975,11 +1088,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, #ifdef CONFIG_FUNCTION_TRACER # define FUNCTION_FLAGS \ - C(FUNCTION, "function-trace"), + C(FUNCTION, "function-trace"), \ + C(FUNC_FORK, "function-fork"), # define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL +# define TRACE_ITER_FUNC_FORK 0UL #endif #ifdef CONFIG_STACKTRACE @@ -1013,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ C(RECORD_CMD, "record-cmd"), \ + C(RECORD_TGID, "record-tgid"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ @@ -1094,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { @@ -1300,7 +1416,8 @@ static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; + field->filter_type == FILTER_PTR_STRING || + field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) @@ -1328,6 +1445,8 @@ struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); +extern void trace_event_enable_tgid_record(bool enable); + extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); @@ -1678,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef 
CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_enum_map **map, int len); +void trace_event_eval_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } +static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index e3b488825ae3..16a8cf02eee9 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg) trace_do_benchmark(); /* - * We don't go to sleep, but let others - * run as well. + * We don't go to sleep, but let others run as well. + * This is bascially a "yield()" to let any task that + * wants to run, schedule in, but if the CPU is idle, + * we'll keep burning cycles. + * + * Note the _rcu_qs() version of cond_resched() will + * notify synchronize_rcu_tasks() that this thread has + * passed a quiescent state for rcu_tasks. Otherwise + * this thread will never voluntarily schedule which would + * block synchronize_rcu_tasks() indefinitely. 
*/ - cond_resched(); + cond_resched_rcu_qs(); } return 0; @@ -175,9 +183,9 @@ int trace_benchmark_reg(void) bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - if (!bm_event_thread) { + if (IS_ERR(bm_event_thread)) { pr_warning("trace benchmark failed to create kernel thread\n"); - return -ENOMEM; + return PTR_ERR(bm_event_thread); } return 0; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 75489de546b6..4d8fdf3184dc 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex); static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; @@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry = ring_buffer_event_data(event); /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') + p = f->data.file + strlen(f->data.file); + while (p >= f->data.file && *p != '/') p--; p++; - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); strncpy(entry->file, p, TRACE_FILE_SIZE); entry->func[TRACE_FUNC_SIZE] = 0; entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; + entry->constant = f->constant; + entry->line = f->data.line; entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) @@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { if (!branch_tracing_enabled) return; @@ -195,13 +196,19 @@ 
core_initcall(init_branch_tracer); #else static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_likely_data *f, int val, + int expect, int is_constant) { + /* A constant is always correct */ + if (is_constant) { + f->constant++; + val = expect; + } /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely @@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) /* FIXME: Make this atomic! */ if (val == expect) - f->correct++; + f->data.correct++; else - f->incorrect++; + f->data.incorrect++; } EXPORT_SYMBOL(ftrace_likely_update); @@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p) return percent; } -static int branch_stat_show(struct seq_file *m, void *v) +static const char *branch_stat_process_file(struct ftrace_branch_data *p) { - struct ftrace_branch_data *p = v; const char *f; - long percent; /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') f--; - f++; + return ++f; +} + +static void branch_stat_show(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + long percent; /* * The miss is overlayed on correct, and hit on incorrect. 
*/ percent = get_incorrect_percent(p); - seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); +} + +static int branch_stat_show_normal(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + branch_stat_show(m, p, f); + return 0; +} + +static int annotate_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + int l; + + f = branch_stat_process_file(&p->data); + + if (!p->constant) + return branch_stat_show_normal(m, &p->data, f); + + l = snprintf(NULL, 0, "/%lu", p->constant); + l = l > 8 ? 0 : 8 - l; + + seq_printf(m, "%8lu/%lu %*lu ", + p->data.correct, p->constant, l, p->data.incorrect); + branch_stat_show(m, &p->data, f); return 0; } @@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace) static void * annotated_branch_stat_next(void *v, int idx) { - struct ftrace_branch_data *p = v; + struct ftrace_likely_data *p = v; ++p; @@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = { .stat_next = annotated_branch_stat_next, .stat_cmp = annotated_branch_stat_cmp, .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = annotate_branch_stat_show }; __init static int init_annotated_branch_stats(void) @@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx) return p; } +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + + f = branch_stat_process_file(p); + return branch_stat_show_normal(m, p, f); +} + static struct tracer_stat all_branch_stats = { .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = all_branch_stat_show }; __init 
static int all_annotated_branch_stats(void) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 0f06532a755b..5fdc779f411d 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index eb7396b7e7c3..adcdbbeae010 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch, __array( char, func, TRACE_FUNC_SIZE+1 ) __array( char, file, TRACE_FILE_SIZE+1 ) __field( char, correct ) + __field( char, constant ) ), - F_printk("%u:%s:%s (%u)", + F_printk("%u:%s:%s (%u)%s", __entry->line, - __entry->func, __entry->file, __entry->correct), + __entry->func, __entry->file, __entry->correct, + __entry->constant ? " CONSTANT" : ""), FILTER_OTHER ); @@ -346,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field( u64, duration ) __field( u64, outer_duration ) __field( u64, nmi_total_ts ) - __field_struct( struct timespec, timestamp ) - __field_desc( long, timestamp, tv_sec ) + __field_struct( struct timespec64, timestamp ) + __field_desc( s64, timestamp, tv_sec ) __field_desc( long, timestamp, tv_nsec ) __field( unsigned int, nmi_count ) __field( unsigned int, seqnum ) ), - F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 93116549a284..36132f9280e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable) mutex_unlock(&event_mutex); } +void trace_event_enable_tgid_record(bool 
enable) +{ + struct trace_event_file *file; + struct trace_array *tr; + + mutex_lock(&event_mutex); + do_for_each_event_file(tr, file) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } else { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, + &file->flags); + } + } while_for_each_event_file(); + mutex_unlock(&event_mutex); +} + static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { @@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } + call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ @@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. 
*/ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + tgid = true; + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { - tracing_stop_cmdline_record(); + if (cmd) + tracing_stop_cmdline_record(); + if (tgid) + tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; @@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) +static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; - /* Find the length of the enum value as a string */ - elen = snprintf(ptr, 0, "%ld", map->enum_value); + /* Find the length of the eval value as a string */ + elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; - snprintf(ptr, elen + 1, "%ld", map->enum_value); + snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); @@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) } static void update_event_printk(struct trace_event_call *call, - struct trace_enum_map *map) + struct trace_eval_map *map) { char *ptr; int quote = 0; - int len = strlen(map->enum_string); + int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { @@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call, continue; } if (isalpha(*ptr) || *ptr == '_') { - if (strncmp(map->enum_string, ptr, len) == 0 && + if 
(strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { - ptr = enum_replace(ptr, map, len); - /* Hmm, enum string smaller than value */ + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* - * No need to decrement here, as enum_replace() + * No need to decrement here, as eval_replace() * returns the pointer to the character passed - * the enum, and two enums can not be placed + * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ @@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_enum_map **map, int len) +void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; @@ -2460,15 +2500,8 @@ struct event_probe_data { bool enable; }; -static void -event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +static void update_event_probe(struct event_probe_data *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; - - if (!data) - return; - if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else @@ -2476,77 +2509,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) } static void -event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +event_enable_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; - if (!data) + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) return; - if 
(!data->count) + edata = *pdata; + update_event_probe(edata); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) + return; + + edata = *pdata; + + if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ - if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) + if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; - if (data->count != -1) - (data->count)--; + if (edata->count != -1) + (edata->count)--; - event_enable_probe(ip, parent_ip, _data); + update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *_data) + struct ftrace_probe_ops *ops, void *data) { - struct event_probe_data *data = _data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + + if (WARN_ON_ONCE(!pdata || !*pdata)) + return 0; + + edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", - data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, - data->file->event_call->class->system, - trace_event_name(data->file->event_call)); + edata->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, + edata->file->event_call->class->system, + trace_event_name(edata->file->event_call)); - if (data->count == -1) + if (edata->count == -1) seq_puts(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld\n", data->count); + seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int -event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = *data; + struct event_probe_data *edata = init_data; + int ret; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENODEV; + *data = mapper; + } + + ret = ftrace_func_mapper_add_ip(mapper, ip, edata); + if (ret < 0) + return ret; + + edata->ref++; - data->ref++; + return 0; +} + +static int free_probe_data(void *data) +{ + struct event_probe_data *edata = data; + + edata->ref--; + if (!edata->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(edata->file, 0, 1); + module_put(edata->file->event_call->mod); + kfree(edata); + } return 0; } static void -event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; - if (WARN_ON_ONCE(data->ref <= 0)) + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, free_probe_data); return; - - data->ref--; - if (!data->ref) { - /* Remove the SOFT_MODE flag */ - __ftrace_event_enable_disable(data->file, 0, 1); - module_put(data->file->event_call->mod); - kfree(data); } - 
*pdata = NULL; + + edata = ftrace_func_mapper_remove_ip(mapper, ip); + + if (WARN_ON_ONCE(!edata)) + return; + + if (WARN_ON_ONCE(edata->ref <= 0)) + return; + + free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { @@ -2578,10 +2675,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = { }; static int -event_enable_func(struct ftrace_hash *hash, +event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { - struct trace_array *tr = top_trace_array(); struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; @@ -2619,12 +2715,12 @@ event_enable_func(struct ftrace_hash *hash, ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - ret = 0; + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; @@ -2661,7 +2757,8 @@ event_enable_func(struct ftrace_hash *hash, ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; - ret = register_ftrace_function_probe(glob, ops, data); + + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3a960ed75a1..1c21d0e2a145 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/stacktrace.h> +#include <linux/rculist.h> #include "tracing_map.h" #include "trace.h" diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6721a1e89f39..f2ac9d44f6c4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/rculist.h> #include "trace.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0efa00d80623..a0910c0cdf2e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void update_traceon_count(void **data, bool on) +static void update_traceon_count(struct ftrace_probe_ops *ops, + unsigned long ip, + struct trace_array *tr, bool on, + void *data) { - long *count = (long *)data; - long old_count = *count; + struct ftrace_func_mapper *mapper = data; + long *count; + long old_count; /* * Tracing gets disabled (or enabled) once per count. @@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on) * setting the tracing_on file. But we currently don't care * about that. */ - if (!old_count) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + old_count = *count; + + if (old_count <= 0) return; /* Make sure we see count before checking tracing state */ smp_rmb(); - if (on == !!tracing_is_on()) + if (on == !!tracer_tracing_is_on(tr)) return; if (on) - tracing_on(); + tracer_tracing_on(tr); else - tracing_off(); - - /* unlimited? 
*/ - if (old_count == -1) - return; + tracer_tracing_off(tr); /* Make sure tracing state is visible before updating count */ smp_wmb(); @@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on) } static void -ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - update_traceon_count(data, 1); + update_traceon_count(ops, ip, tr, 1, data); } static void -ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - update_traceon_count(data, 0); + update_traceon_count(ops, ip, tr, 0, data); } static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (tracing_is_on()) + if (tracer_tracing_is_on(tr)) return; - tracing_on(); + tracer_tracing_on(tr); } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (!tracing_is_on()) + if (!tracer_tracing_is_on(tr)) return; - tracing_off(); + tracer_tracing_off(tr); } /* @@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) */ #define STACK_SKIP 4 +static __always_inline void trace_stack(struct trace_array *tr) +{ + unsigned long flags; + int pc; + + local_save_flags(flags); + pc = preempt_count(); + + __trace_stack(tr, flags, STACK_SKIP, pc); +} + static void -ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct 
ftrace_probe_ops *ops, + void *data) { - trace_dump_stack(STACK_SKIP); + trace_stack(tr); } static void -ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count; long old_count; long new_count; + if (!tracing_is_on()) + return; + + /* unlimited? */ + if (!mapper) { + trace_stack(tr); + return; + } + + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + /* * Stack traces should only execute the number of times the * user specified in the counter. */ do { - - if (!tracing_is_on()) - return; - old_count = *count; if (!old_count) return; - /* unlimited? */ - if (old_count == -1) { - trace_dump_stack(STACK_SKIP); - return; - } - new_count = old_count - 1; new_count = cmpxchg(count, old_count, new_count); if (new_count == old_count) - trace_dump_stack(STACK_SKIP); + trace_stack(tr); + + if (!tracing_is_on()) + return; } while (new_count != old_count); } -static int update_count(void **data) +static int update_count(struct ftrace_probe_ops *ops, unsigned long ip, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return 0; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - if (*count != -1) + if (count) { + if (*count <= 0) + return 0; (*count)--; + } return 1; } static void -ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (update_count(data)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ALL); } /* Only dump the current CPU buffer. 
*/ static void -ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (update_count(data)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ORIG); } static int ftrace_probe_print(const char *name, struct seq_file *m, - unsigned long ip, void *data) + unsigned long ip, struct ftrace_probe_ops *ops, + void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:%s", (void *)ip, name); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } static int ftrace_traceon_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) + struct ftrace_probe_ops *ops, + void *data) { - return ftrace_probe_print("traceon", m, ip, data); + return ftrace_probe_print("traceon", m, ip, ops, data); } static int ftrace_traceoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceoff", m, ip, data); + return ftrace_probe_print("traceoff", m, ip, ops, data); } static int ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("stacktrace", m, ip, data); + return ftrace_probe_print("stacktrace", m, ip, ops, data); } static int ftrace_dump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("dump", m, ip, data); + return ftrace_probe_print("dump", m, ip, ops, data); } static int ftrace_cpudump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("cpudump", m, 
ip, data); + return ftrace_probe_print("cpudump", m, ip, ops, data); +} + + +static int +ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops traceoff_count_probe_ops = { .func = ftrace_traceoff_count, .print = ftrace_traceoff_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops stacktrace_count_probe_ops = { .func = ftrace_stacktrace_count, .print = ftrace_stacktrace_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops dump_probe_ops = { .func = ftrace_dump_probe, .print = ftrace_dump_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops cpudump_probe_ops = { @@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = { }; static int -ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +ftrace_trace_probe_callback(struct trace_array *tr, + struct ftrace_probe_ops *ops, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { @@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, if (!enable) return -EINVAL; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - 
return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; @@ -559,62 +643,74 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = register_ftrace_function_probe(glob, tr, ops, count); return ret < 0 ? ret : 0; } static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, +ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; else ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_stacktrace_callback(struct ftrace_hash *hash, +ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_dump_callback(struct ftrace_hash *hash, +ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = &dump_probe_ops; /* Only dump once. 
*/ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } static int -ftrace_cpudump_callback(struct ftrace_hash *hash, +ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = &cpudump_probe_ops; /* Only dump once. */ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } @@ -687,9 +783,8 @@ static inline int init_func_cmd_traceon(void) } #endif /* CONFIG_DYNAMIC_FTRACE */ -static __init int init_function_trace(void) +__init int init_function_trace(void) { init_func_cmd_traceon(); return register_tracer(&function_trace); } -core_initcall(init_function_trace); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index af344a1bf0d0..d7c8e4ec3d9d 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -44,6 +44,7 @@ #include <linux/uaccess.h> #include <linux/cpumask.h> #include <linux/delay.h> +#include <linux/sched/clock.h> #include "trace.h" static struct trace_array *hwlat_trace; @@ -78,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; /* Individual latency samples are stored here when detected. */ struct hwlat_sample { - u64 seqnum; /* unique sequence */ - u64 duration; /* delta */ - u64 outer_duration; /* delta (outer loop) */ - u64 nmi_total_ts; /* Total time spent in NMIs */ - struct timespec timestamp; /* wall time */ - int nmi_count; /* # NMIs during this sample */ + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ + struct timespec64 timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ }; /* keep the global state somewhere. 
*/ @@ -249,7 +250,7 @@ static int get_sample(void) s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; - s.timestamp = CURRENT_TIME; + ktime_get_real_ts64(&s.timestamp); s.nmi_total_ts = nmi_total_ts; s.nmi_count = nmi_count; trace_hwlat_sample(&s); @@ -266,24 +267,13 @@ out: static struct cpumask save_cpumask; static bool disable_migrate; -static void move_to_next_cpu(bool initmask) +static void move_to_next_cpu(void) { - static struct cpumask *current_mask; + struct cpumask *current_mask = &save_cpumask; int next_cpu; if (disable_migrate) return; - - /* Just pick the first CPU on first iteration */ - if (initmask) { - current_mask = &save_cpumask; - get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); - put_online_cpus(); - next_cpu = cpumask_first(current_mask); - goto set_affinity; - } - /* * If for some reason the user modifies the CPU affinity * of this thread, than stop migrating for the duration @@ -300,7 +290,6 @@ static void move_to_next_cpu(bool initmask) if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); - set_affinity: if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto disable; @@ -322,20 +311,15 @@ static void move_to_next_cpu(bool initmask) * need to ensure nothing else might be running (and thus preempting). * Obviously this should never be used in production environments. * - * Currently this runs on which ever CPU it was scheduled on, but most - * real-world hardware latency situations occur across several CPUs, - * but we might later generalize this if we find there are any actualy - * systems with alternate SMI delivery or other hardware latencies. + * Executes one loop interaction on each CPU in tracing_cpumask sysfs file. 
*/ static int kthread_fn(void *data) { u64 interval; - bool initmask = true; while (!kthread_should_stop()) { - move_to_next_cpu(initmask); - initmask = false; + move_to_next_cpu(); local_irq_disable(); get_sample(); @@ -366,13 +350,27 @@ static int kthread_fn(void *data) */ static int start_kthread(struct trace_array *tr) { + struct cpumask *current_mask = &save_cpumask; struct task_struct *kthread; + int next_cpu; + + /* Just pick the first CPU on first iteration */ + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + sched_setaffinity(kthread->pid, current_mask); + hwlat_kthread = kthread; wake_up_process(kthread); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ad9e53ad174..c9b5aa10fbf9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -16,13 +16,16 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> +#include <linux/rculist.h> #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 /** * Kprobe event core functions @@ -280,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, + int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -307,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = 
-EINVAL; goto error; @@ -592,12 +598,22 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * - Add kprobe: + * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: + * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -617,6 +633,7 @@ static int create_trace_kprobe(int argc, char **argv) int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; + int maxactive = 0; char *arg; unsigned long offset = 0; void *addr = NULL; @@ -635,8 +652,28 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } - if (argv[0][1] == ':') { - event = &argv[0][2]; + event = strchr(&argv[0][1], ':'); + if (event) { + event[0] = '\0'; + event++; + } + if (is_return && isdigit(argv[0][1])) { + ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (ret) { + pr_info("Failed to parse maxactive.\n"); + return ret; + } + /* kretprobes instances are iterated over via a list. The + * maximum should stay reasonable. 
+ */ + if (maxactive > KRETPROBE_MAXACTIVE_MAX) { + pr_info("Maxactive is too big (%d > %d).\n", + maxactive, KRETPROBE_MAXACTIVE_MAX); + return -E2BIG; + } + } + + if (event) { if (strchr(event, '/')) { group = event; event = strchr(group, '/') + 1; @@ -678,28 +715,21 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } - /* an address specified */ - ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); - if (ret) { - pr_info("Failed to parse address.\n"); - return ret; - } - } else { + + /* try to parse an address. if that fails, try to read the + * input as a symbol. */ + if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { - pr_info("Failed to parse symbol.\n"); + pr_info("Failed to parse either an address or a symbol.\n"); return ret; } - if (offset && is_return) { - pr_info("Return probe must be used without offset.\n"); + if (offset && is_return && + !kprobe_on_func_entry(NULL, symbol, offset)) { + pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } } @@ -714,10 +744,11 @@ static int create_trace_kprobe(int argc, char **argv) else snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); + sanitize_event_name(buf); event = buf; } - tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, - is_return); + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, + argc, is_return); if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", (int)PTR_ERR(tk)); @@ -1509,6 +1540,11 @@ static __init int kprobe_trace_self_tests_init(void) end: release_all_trace_kprobes(); + /* + * Wait for the optimizer work to finish. 
Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..bac629af2285 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -4,10 +4,11 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <[email protected]> * */ - #include <linux/module.h> #include <linux/mutex.h> #include <linux/ftrace.h> +#include <linux/sched/clock.h> +#include <linux/sched/mm.h> #include "trace_output.h" @@ -124,6 +125,44 @@ EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + +const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { @@ -162,15 +201,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, } EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * 
Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. + */ const char * -trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool concatenate) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); - + trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", + buf[i]); trace_seq_putc(p, 0); return ret; @@ -289,31 +340,41 @@ static inline const char *kretprobed(const char *name) static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; kallsyms_lookup(address, NULL, NULL, NULL, str); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; sprint_symbol(str, address); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } #ifndef CONFIG_64BIT @@ -536,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + unsigned int tgid = trace_find_tgid(entry->pid); + + if (!tgid) + trace_seq_printf(s, "(-----) "); + else + trace_seq_printf(s, "(%5d) ", tgid); + } + if 
(tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); @@ -1109,11 +1179,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld", field->seqnum, field->duration, field->outer_duration, - field->timestamp.tv_sec, + (long long)field->timestamp.tv_sec, field->timestamp.tv_nsec); if (field->nmi_count) { @@ -1143,10 +1213,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", + trace_seq_printf(s, "%llu %lld %lld %09ld %u\n", field->duration, field->outer_duration, - field->timestamp.tv_sec, + (long long)field->timestamp.tv_sec, field->timestamp.tv_nsec, field->seqnum); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8c0553d9afd3..52478f033f88 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -21,6 +21,7 @@ * Copyright (C) IBM Corporation, 2010-2011 * Author: Srikar Dronamraju */ +#define pr_fmt(fmt) "trace_probe: " fmt #include "trace_probe.h" @@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char **)) { - char *kbuf, *tmp; + char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, goto out; } kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + ret = -EINVAL; + goto out; + } + } + done += size; - if 
(tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); + /* Remove comments */ + tmp = strchr(buf, '#'); - if (tmp) - *tmp = '\0'; + if (tmp) + *tmp = '\0'; - ret = traceprobe_command(kbuf, createfn); - if (ret) - goto out; + ret = traceprobe_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); } ret = done; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 0c0ae54d44c6..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -248,7 +248,7 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -#ifdef CONFIG_KPROBE_EVENT +#ifdef CONFIG_KPROBE_EVENTS struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); @@ -278,7 +278,7 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } -#endif /* CONFIG_KPROBE_EVENT */ +#endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { struct fetch_param fetch; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4c896a0101bd..b341c02730be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -12,27 +12,38 @@ #include "trace.h" -static int sched_ref; +#define RECORD_CMDLINE 1 +#define RECORD_TGID 2 + +static int sched_cmdline_ref; +static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(prev); - tracing_record_cmdline(next); + if (!flags) + return; + 
tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(current); + if (!flags) + return; + tracing_record_taskinfo(current, flags); } static int tracing_sched_register(void) @@ -75,28 +86,61 @@ static void tracing_sched_unregister(void) unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } -static void tracing_start_sched_switch(void) +static void tracing_start_sched_switch(int ops) { + bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref++; + break; + + case RECORD_TGID: + sched_tgid_ref++; + break; + } + + if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } -static void tracing_stop_sched_switch(void) +static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref--; + break; + + case RECORD_TGID: + sched_tgid_ref--; + break; + } + + if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { - tracing_start_sched_switch(); + tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { - tracing_stop_sched_switch(); + tracing_stop_sched_switch(RECORD_CMDLINE); +} + +void tracing_start_tgid_record(void) +{ + tracing_start_sched_switch(RECORD_TGID); +} + +void tracing_stop_tgid_record(void) +{ + tracing_stop_sched_switch(RECORD_TGID); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..cb917cebae29 100644 --- a/kernel/trace/trace_selftest.c +++ 
b/kernel/trace/trace_selftest.c @@ -1,5 +1,6 @@ /* Include in trace.c */ +#include <uapi/linux/sched/types.h> #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2a1abbaca10e..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -2,6 +2,7 @@ * Copyright (C) 2008 Steven Rostedt <[email protected]> * */ +#include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> @@ -34,7 +35,7 @@ unsigned long stack_trace_max_size; arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static DEFINE_PER_CPU(int, trace_active); +DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; @@ -64,7 +65,7 @@ void stack_trace_print(void) } /* - * When arch-specific code overides this function, the following + * When arch-specific code overrides this function, the following * data should be filled up, assuming stack_trace_max_lock is held to * prevent concurrent updates. * stack_trace_index[] @@ -95,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; + /* + * There's a slight chance that we are tracing inside the + * RCU infrastructure, and rcu_irq_enter() will not work + * as expected. 
+ */ + if (unlikely(rcu_irq_enter_disabled())) + return; + local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); @@ -206,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { unsigned long stack; - int cpu; preempt_disable_notrace(); - cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ - if (per_cpu(trace_active, cpu)++ != 0) + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -220,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -252,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, long *ptr = filp->private_data; unsigned long val, flags; int ret; - int cpu; ret = kstrtoul_from_user(ubuf, count, 10, &val); if (ret) @@ -263,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, /* * In case we trace inside arch_spin_lock() or after (NMI), * we will cause circular lock, so we also need to increase - * the percpu trace_active here. + * the percpu disable_stack_tracer here. 
*/ - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); *ptr = val; arch_spin_unlock(&stack_trace_max_lock); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); local_irq_restore(flags); return count; @@ -306,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - int cpu; - local_irq_disable(); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); @@ -323,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - int cpu; - arch_spin_unlock(&stack_trace_max_lock); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); local_irq_enable(); } @@ -406,10 +406,14 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +#ifdef CONFIG_DYNAMIC_FTRACE + static int stack_trace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file); } @@ -421,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { .release = ftrace_regex_release, }; +#endif /* CONFIG_DYNAMIC_FTRACE */ + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -475,8 +481,10 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); +#ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("stack_trace_filter", 0444, d_tracer, - NULL, &stack_trace_filter_fops); + &trace_ops, &stack_trace_filter_fops); +#endif if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); diff --git a/kernel/trace/trace_uprobe.c 
b/kernel/trace/trace_uprobe.c index 0913693caf6e..a7581fec9681 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,12 +17,14 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <[email protected]> */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> #include <linux/uprobes.h> #include <linux/namei.h> #include <linux/string.h> +#include <linux/rculist.h> #include "trace_probe.h" @@ -431,7 +433,8 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - arg = strchr(argv[1], ':'); + /* Find the last occurrence, in case the path contains ':' too. */ + arg = strrchr(argv[1], ':'); if (!arg) { ret = -EINVAL; goto fail_address_parse; |