Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c                | 163
-rw-r--r--  kernel/bpf/core.c                    |   9
-rw-r--r--  kernel/bpf/helpers.c                 |   2
-rw-r--r--  kernel/bpf/inode.c                   |   4
-rw-r--r--  kernel/bpf/syscall.c                 |  66
-rw-r--r--  kernel/bpf/verifier.c                |  26
-rw-r--r--  kernel/cgroup.c                      |  53
-rw-r--r--  kernel/cpu.c                         |  66
-rw-r--r--  kernel/cpuset.c                      |   9
-rw-r--r--  kernel/events/core.c                 | 124
-rw-r--r--  kernel/events/internal.h             |  25
-rw-r--r--  kernel/fork.c                        |  24
-rw-r--r--  kernel/freezer.c                     |   2
-rw-r--r--  kernel/memremap.c                    |  14
-rw-r--r--  kernel/power/snapshot.c              |  10
-rw-r--r--  kernel/printk/printk.c               |   5
-rw-r--r--  kernel/profile.c                     | 181
-rw-r--r--  kernel/rcu/tree.c                    | 105
-rw-r--r--  kernel/sched/cputime.c               |  10
-rw-r--r--  kernel/smp.c                         |  79
-rw-r--r--  kernel/sysctl.c                      |   4
-rw-r--r--  kernel/time/hrtimer.c                |  40
-rw-r--r--  kernel/time/timer.c                  |  25
-rw-r--r--  kernel/trace/Kconfig                 |   1
-rw-r--r--  kernel/trace/bpf_trace.c             | 164
-rw-r--r--  kernel/trace/ftrace.c                | 313
-rw-r--r--  kernel/trace/trace.c                 | 358
-rw-r--r--  kernel/trace/trace.h                 |  48
-rw-r--r--  kernel/trace/trace_entries.h         |   4
-rw-r--r--  kernel/trace/trace_events.c          | 219
-rw-r--r--  kernel/trace/trace_functions.c       |   2
-rw-r--r--  kernel/trace/trace_functions_graph.c |  19
-rw-r--r--  kernel/trace/trace_kprobe.c          |   1
-rw-r--r--  kernel/trace/trace_mmiotrace.c       |  10
-rw-r--r--  kernel/trace/trace_probe.c           |  33
-rw-r--r--  kernel/trace/trace_probe.h           |  10
-rw-r--r--  kernel/workqueue.c                   | 108
37 files changed, 1287 insertions, 1049 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
}
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
+
if (IS_ERR(prog))
return prog;
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+
return prog;
}
static void prog_fd_array_put_ptr(void *ptr)
{
- struct bpf_prog *prog = ptr;
-
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(ptr);
}
/* decrement refcnt of all bpf_progs that are stored in this map */
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
}
late_initcall(register_prog_array_map);
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+ struct file *map_file)
{
- bpf_fd_array_map_clear(map);
- fd_array_map_free(map);
+ struct bpf_event_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ if (ee) {
+ ee->event = perf_file->private_data;
+ ee->perf_file = perf_file;
+ ee->map_file = map_file;
+ }
+
+ return ee;
}
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
{
- struct perf_event *event;
- const struct perf_event_attr *attr;
- struct file *file;
+ struct bpf_event_entry *ee;
- file = perf_event_get(fd);
- if (IS_ERR(file))
- return file;
+ ee = container_of(rcu, struct bpf_event_entry, rcu);
+ fput(ee->perf_file);
+ kfree(ee);
+}
- event = file->private_data;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+ call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
- attr = perf_event_attrs(event);
- if (IS_ERR(attr))
- goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ const struct perf_event_attr *attr;
+ struct bpf_event_entry *ee;
+ struct perf_event *event;
+ struct file *perf_file;
- if (attr->inherit)
- goto err;
+ perf_file = perf_event_get(fd);
+ if (IS_ERR(perf_file))
+ return perf_file;
- if (attr->type == PERF_TYPE_RAW)
- return file;
+ event = perf_file->private_data;
+ ee = ERR_PTR(-EINVAL);
- if (attr->type == PERF_TYPE_HARDWARE)
- return file;
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr) || attr->inherit)
+ goto err_out;
+
+ switch (attr->type) {
+ case PERF_TYPE_SOFTWARE:
+ if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+ goto err_out;
+ /* fall-through */
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
+ /* fall-through */
+ default:
+ break;
+ }
- if (attr->type == PERF_TYPE_SOFTWARE &&
- attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return file;
-err:
- fput(file);
- return ERR_PTR(-EINVAL);
+err_out:
+ fput(perf_file);
+ return ee;
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- fput((struct file *)ptr);
+ bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+ struct file *map_file)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_event_entry *ee;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < array->map.max_entries; i++) {
+ ee = READ_ONCE(array->ptrs[i]);
+ if (ee && ee->map_file == map_file)
+ fd_array_map_delete_elem(map, &i);
+ }
+ rcu_read_unlock();
}
static const struct bpf_map_ops perf_event_array_ops = {
.map_alloc = fd_array_map_alloc,
- .map_free = perf_event_array_map_free,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+ .map_release = perf_event_fd_array_release,
};
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
return 0;
}
late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int fd)
+{
+ return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+ /* cgroup_put() frees cgrp after an RCU grace period */
+ cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = cgroup_fd_array_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+ .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+ .ops = &cgroup_array_ops,
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+ bpf_register_map_type(&cgroup_array_type);
+ return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
-
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
prog = READ_ONCE(array->ptrs[index]);
- if (unlikely(!prog))
+ if (!prog)
goto out;
/* ARG1 at this point is guaranteed to point to CTX from
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- return NULL;
+ return -ENOTSUPP;
}
/* Always built-in helper functions. */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
- return raw_smp_processor_id();
+ return smp_processor_id();
}
const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
* version 2 as published by the Free Software Foundation.
*/
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("bpf");
-
static int __init bpf_init(void)
{
int ret;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
static int bpf_map_release(struct inode *inode, struct file *filp)
{
- bpf_map_put_with_uref(filp->private_data);
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_release)
+ map->ops->map_release(map, filp);
+
+ bpf_map_put_with_uref(map);
return 0;
}
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
- if (atomic_dec_and_test(&prog->aux->refcnt))
- call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt))
- __prog_put_common(&prog->aux->rcu);
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
return 0;
}
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&prog->aux->refcnt);
+ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ return bpf_prog_add(prog, 1);
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = __bpf_prog_get(f);
+ prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
+ if (type && prog->type != *type) {
+ prog = ERR_PTR(-EINVAL);
+ goto out;
+ }
prog = bpf_prog_inc(prog);
+out:
fdput(f);
-
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+ return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eec9f90ba030..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_XDP:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_packet_access(struct verifier_env *env, u32 regno, int off,
int size)
{
@@ -713,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
switch (env->prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
break;
default:
verbose("verifier is misconfigured\n");
@@ -805,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE) {
+ if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
verbose("cannot write into packet\n");
return -EACCES;
}
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into packet\n", value_regno);
+ return -EACCES;
+ }
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -1035,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_get_stackid)
goto error;
break;
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ if (func_id != BPF_FUNC_skb_in_cgroup)
+ goto error;
+ break;
default:
break;
}
@@ -1054,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+ case BPF_FUNC_skb_in_cgroup:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+ goto error;
+ break;
default:
break;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 33a2f63d4a10..d1c51b7f5221 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,6 +61,7 @@
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
+#include <linux/file.h>
#include <net/sock.h>
/*
@@ -2208,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
- /*
- * We know this subsystem has not yet been bound. Users in a non-init
- * user namespace may only mount hierarchies with no bound subsystems,
- * i.e. 'none,name=user1'
- */
- if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+ /* Hierarchies may only be created in the initial cgroup namespace. */
+ if (ns != &init_cgroup_ns) {
ret = -EPERM;
goto out_unlock;
}
@@ -2955,6 +2952,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -2969,6 +2967,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -4336,6 +4335,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
@@ -4364,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -6204,6 +6206,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+/**
+ * cgroup_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup2_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory. Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found.
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *cgrp;
+ struct file *f;
+
+ f = fget_raw(fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ fput(f);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ cgrp = css->cgroup;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
@@ -6304,14 +6340,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- mutex_lock(&cgroup_mutex);
+ /* It is not safe to take cgroup_mutex here */
spin_lock_irq(&css_set_lock);
-
cset = task_css_set(current);
get_css_set(cset);
-
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
new_ns = alloc_cgroup_ns();
if (IS_ERR(new_ns)) {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7b61887f7ccd..341bf80f80bd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
if (!cpu_online(cpu))
return 0;
+ /*
+ * If we are up and running, use the hotplug thread. For early calls
+ * we invoke the thread function directly.
+ */
+ if (!st->thread)
+ return cpuhp_invoke_callback(cpu, state, cb);
+
st->cb_state = state;
st->cb = cb;
/*
@@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown = NULL,
.cant_stop = true,
},
+ [CPUHP_PERF_PREPARE] = {
+ .name = "perf prepare",
+ .startup = perf_event_init_cpu,
+ .teardown = perf_event_exit_cpu,
+ },
+ [CPUHP_WORKQUEUE_PREP] = {
+ .name = "workqueue prepare",
+ .startup = workqueue_prepare_cpu,
+ .teardown = NULL,
+ },
+ [CPUHP_HRTIMERS_PREPARE] = {
+ .name = "hrtimers prepare",
+ .startup = hrtimers_prepare_cpu,
+ .teardown = hrtimers_dead_cpu,
+ },
+ [CPUHP_SMPCFD_PREPARE] = {
+ .name = "SMPCFD prepare",
+ .startup = smpcfd_prepare_cpu,
+ .teardown = smpcfd_dead_cpu,
+ },
+ [CPUHP_RCUTREE_PREP] = {
+ .name = "RCU-tree prepare",
+ .startup = rcutree_prepare_cpu,
+ .teardown = rcutree_dead_cpu,
+ },
/*
* Preparatory and dead notifiers. Will be replaced once the notifiers
* are converted to states.
@@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.skip_onerr = true,
.cant_stop = true,
},
+ /*
+ * On the tear-down path, timers_dead_cpu() must be invoked
+ * before blk_mq_queue_reinit_notify() from notify_dead(),
+ * otherwise a RCU stall occurs.
+ */
+ [CPUHP_TIMERS_DEAD] = {
+ .name = "timers dead",
+ .startup = NULL,
+ .teardown = timers_dead_cpu,
+ },
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
@@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown = NULL,
.cant_stop = true,
},
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .startup = NULL,
+ .teardown = smpcfd_dying_cpu,
+ },
/*
* Handled on control processor until the plugged processor manages
* this itself.
@@ -1227,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup = sched_cpu_starting,
.teardown = sched_cpu_dying,
},
+ [CPUHP_AP_RCUTREE_DYING] = {
+ .startup = NULL,
+ .teardown = rcutree_dying_cpu,
+ },
/*
* Low level startup/teardown notifiers. Run with interrupts
* disabled. Will be removed once the notifiers are converted to
@@ -1250,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup = smpboot_unpark_threads,
.teardown = NULL,
},
+ [CPUHP_AP_PERF_ONLINE] = {
+ .name = "perf online",
+ .startup = perf_event_init_cpu,
+ .teardown = perf_event_exit_cpu,
+ },
+ [CPUHP_AP_WORKQUEUE_ONLINE] = {
+ .name = "workqueue online",
+ .startup = workqueue_online_cpu,
+ .teardown = workqueue_offline_cpu,
+ },
+ [CPUHP_AP_RCUTREE_ONLINE] = {
+ .name = "RCU-tree online",
+ .startup = rcutree_online_cpu,
+ .teardown = rcutree_offline_cpu,
+ },
+
/*
* Online/down_prepare notifiers. Will be removed once the notifiers
* are converted to states.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return;
- if (current->flags & PF_EXITING) /* Let dying task have memory */
- return;
-
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 79dae188a987..356a6c7cb52a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5617,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- u32 raw_size = data->raw->size;
- u32 real_size = round_up(raw_size + sizeof(u32),
- sizeof(u64)) - sizeof(u32);
- u64 zero = 0;
-
- perf_output_put(handle, real_size);
- __output_copy(handle, data->raw->data, raw_size);
- if (real_size - raw_size)
- __output_copy(handle, &zero, real_size - raw_size);
+ struct perf_raw_record *raw = data->raw;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+
+ perf_output_put(handle, raw->size);
+ do {
+ if (frag->copy) {
+ __output_custom(handle, frag->copy,
+ frag->data, frag->size);
+ } else {
+ __output_copy(handle, frag->data,
+ frag->size);
+ }
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+ if (frag->pad)
+ __output_skip(handle, NULL, frag->pad);
} else {
struct {
u32 size;
@@ -5751,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
-
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+ struct perf_raw_record *raw = data->raw;
+ int size;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+ u32 sum = 0;
+
+ do {
+ sum += frag->size;
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+
+ size = round_up(sum + sizeof(u32), sizeof(u64));
+ raw->size = size - sizeof(u32);
+ frag->pad = raw->size - sum;
+ } else {
+ size = sizeof(u64);
+ }
- header->size += round_up(size, sizeof(u64));
+ header->size += size;
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -7398,7 +7422,7 @@ static struct pmu perf_swevent = {
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
- void *record = data->raw->data;
+ void *record = data->raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -7454,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_event *event;
struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
+ .frag = {
+ .size = entry_size,
+ .data = record,
+ },
};
perf_sample_data_init(&data, 0, 0);
@@ -7596,7 +7622,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
prog = event->tp_event->prog;
if (prog) {
event->tp_event->prog = NULL;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
}
}
@@ -10331,7 +10357,7 @@ static void __init perf_event_init_all_cpus(void)
}
}
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10344,6 +10370,7 @@ static void perf_event_init_cpu(int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
+ return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10375,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu)
}
srcu_read_unlock(&pmus_srcu, idx);
}
+#else
-static void perf_event_exit_cpu(int cpu)
+static void perf_event_exit_cpu_context(int cpu) { }
+
+#endif
+
+int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
+ return 0;
}
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10404,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- /*
- * This must be done before the CPU comes alive, because the
- * moment we can run tasks we can encounter (software) events.
- *
- * Specifically, someone can have inherited events on kthreadd
- * or a pre-existing worker thread that gets re-bound.
- */
- perf_event_init_cpu(cpu);
- break;
-
- case CPU_DOWN_PREPARE:
- /*
- * This must be done before the CPU dies because after that an
- * active event might want to IPI the CPU and that'll not work
- * so great for dead CPUs.
- *
- * XXX smp_call_function_single() return -ENXIO without a warn
- * so we could possibly deal with this.
- *
- * This is safe against new events arriving because
- * sys_perf_event_open() serializes against hotplug using
- * get_online_cpus().
- */
- perf_event_exit_cpu(cpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
void __init perf_event_init(void)
{
int ret;
@@ -10456,7 +10446,7 @@ void __init perf_event_init(void)
perf_pmu_register(&perf_cpu_clock, NULL, -1);
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
- perf_cpu_notifier(perf_cpu_notify);
+ perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
return rb->aux_nr_pages << PAGE_SHIFT;
}
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
-func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
- written = memcpy_func(handle->addr, buf, size); \
+ written = memcpy_func(__VA_ARGS__); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
- buf += written; \
+ if (advance_buf) \
+ buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+ const void *buf, unsigned long len)
+{
+ unsigned long orig_len = len;
+ __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+ orig_len - len, size)
+}
+
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c6c88c..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,23 +162,15 @@ void __weak arch_release_thread_stack(unsigned long *stack)
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
- THREAD_SIZE_ORDER);
-
- if (page)
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- 1 << THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_stack(unsigned long *stack)
{
- struct page *page = virt_to_page(stack);
-
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- -(1 << THREAD_SIZE_ORDER));
- __free_kmem_pages(page, THREAD_SIZE_ORDER);
+ __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;
@@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep;
static void account_kernel_stack(unsigned long *stack, int account)
{
- struct zone *zone = page_zone(virt_to_page(stack));
+ /* All stack pages are in the same zone and belong to the same memcg. */
+ struct page *first_page = virt_to_page(stack);
+
+ mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
- mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+ memcg_kmem_update_page_stat(
+ first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
void free_task(struct task_struct *tsk)
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_thread_flag(TIF_MEMDIE))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..251d16b4cb41 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
- return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
- dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
- __func__);
- return ERR_PTR(-ENXIO);
- }
-
if (!ref)
return ERR_PTR(-EINVAL);
@@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns;
}
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
{
/*
@@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d90df926b59f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable)
unsigned long size;
size = global_page_state(NR_SLAB_RECLAIMABLE)
- + global_page_state(NR_ACTIVE_ANON)
- + global_page_state(NR_INACTIVE_ANON)
- + global_page_state(NR_ACTIVE_FILE)
- + global_page_state(NR_INACTIVE_FILE)
- - global_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_ACTIVE_ANON)
+ + global_node_page_state(NR_INACTIVE_ANON)
+ + global_node_page_state(NR_ACTIVE_FILE)
+ + global_node_page_state(NR_INACTIVE_FILE)
+ - global_node_page_state(NR_FILE_MAPPED);
return saveable <= size ? 0 : saveable - size;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
+ printk("%stask: %p task.stack: %p\n",
+ log_lvl, current, task_stack_page(current));
}
#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9901c9..2dbccf2d806c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
put_cpu();
}
-static int profile_cpu_callback(struct notifier_block *info,
- unsigned long action, void *__cpu)
+static int profile_dead_cpu(unsigned int cpu)
{
- int node, cpu = (unsigned long)__cpu;
struct page *page;
+ int i;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_mem(cpu);
- per_cpu(cpu_profile_flip, cpu) = 0;
- if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- return notifier_from_errno(-ENOMEM);
- per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
- }
- if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- goto out_free;
- per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
- }
- break;
-out_free:
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- return notifier_from_errno(-ENOMEM);
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_set_cpu(cpu, prof_cpu_mask);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_clear_cpu(cpu, prof_cpu_mask);
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ if (prof_cpu_mask != NULL)
+ cpumask_clear_cpu(cpu, prof_cpu_mask);
+
+ for (i = 0; i < 2; i++) {
+ if (per_cpu(cpu_profile_hits, cpu)[i]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
+ per_cpu(cpu_profile_hits, cpu)[i] = NULL;
__free_page(page);
}
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
+ }
+ return 0;
+}
+
+static int profile_prepare_cpu(unsigned int cpu)
+{
+ int i, node = cpu_to_mem(cpu);
+ struct page *page;
+
+ per_cpu(cpu_profile_flip, cpu) = 0;
+
+ for (i = 0; i < 2; i++) {
+ if (per_cpu(cpu_profile_hits, cpu)[i])
+ continue;
+
+ page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page) {
+ profile_dead_cpu(cpu);
+ return -ENOMEM;
}
- break;
+ per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
+
}
- return NOTIFY_OK;
+ return 0;
+}
+
+static int profile_online_cpu(unsigned int cpu)
+{
+ if (prof_cpu_mask != NULL)
+ cpumask_set_cpu(cpu, prof_cpu_mask);
+
+ return 0;
}
+
#else /* !CONFIG_SMP */
#define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0)
-#define profile_cpu_callback NULL
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_SMP
-static void profile_nop(void *unused)
-{
-}
-
-static int create_hash_tables(void)
+int __ref create_proc_profile(void)
{
- int cpu;
-
- for_each_online_cpu(cpu) {
- int node = cpu_to_mem(cpu);
- struct page *page;
-
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[1]
- = (struct profile_hit *)page_address(page);
- page = __alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[0]
- = (struct profile_hit *)page_address(page);
- }
- return 0;
-out_cleanup:
- prof_on = 0;
- smp_mb();
- on_each_cpu(profile_nop, NULL, 1);
- for_each_online_cpu(cpu) {
- struct page *page;
-
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
- __free_page(page);
- }
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- }
- }
- return -1;
-}
-#else
-#define create_hash_tables() ({ 0; })
+ struct proc_dir_entry *entry;
+#ifdef CONFIG_SMP
+ enum cpuhp_state online_state;
#endif
-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
-{
- struct proc_dir_entry *entry;
int err = 0;
if (!prof_on)
return 0;
-
- cpu_notifier_register_begin();
-
- if (create_hash_tables()) {
- err = -ENOMEM;
- goto out;
- }
-
+#ifdef CONFIG_SMP
+ err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
+ profile_prepare_cpu, profile_dead_cpu);
+ if (err)
+ return err;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
+ profile_online_cpu, NULL);
+ if (err < 0)
+ goto err_state_prep;
+ online_state = err;
+ err = 0;
+#endif
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &proc_profile_operations);
if (!entry)
- goto out;
+ goto err_state_onl;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
- __hotcpu_notifier(profile_cpu_callback, 0);
-out:
- cpu_notifier_register_done();
+ return err;
+err_state_onl:
+#ifdef CONFIG_SMP
+ cpuhp_remove_state(online_state);
+err_state_prep:
+ cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
+#endif
return err;
}
subsys_initcall(create_proc_profile);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f433959e9322..5d80925e7fc8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1073,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
* offline to continue to use RCU for one jiffy after marking itself
* offline in the cpu_online_mask. This leniency is necessary given the
* non-atomic nature of the online and offline processing, for example,
- * the fact that a CPU enters the scheduler after completing the CPU_DYING
- * notifiers.
+ * the fact that a CPU enters the scheduler after completing the teardown
+ * of the CPU.
*
- * This is also why RCU internally marks CPUs online during the
- * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ * This is also why RCU internally marks CPUs online during the
+ * preparation phase and offline after the CPU has been taken down.
*
* Disable checking if in an NMI handler because we cannot safely report
* errors from NMI handlers anyway.
@@ -3806,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static void rcu_prepare_cpu(int cpu)
+int rcutree_prepare_cpu(unsigned int cpu)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
rcu_init_percpu_data(cpu, rsp);
+
+ rcu_prepare_kthreads(cpu);
+ rcu_spawn_all_nocb_kthreads(cpu);
+
+ return 0;
+}
+
+static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+
+ rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+}
+
+int rcutree_online_cpu(unsigned int cpu)
+{
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
+ return 0;
+}
+
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ rcutree_affinity_setting(cpu, cpu);
+ return 0;
+}
+
+
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
+ return 0;
+}
+
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -3851,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)
}
#endif
-/*
- * Handle CPU online/offline notification events.
- */
-int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
- struct rcu_state *rsp;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_prepare_cpu(cpu);
- rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- sync_sched_exp_online_cleanup(cpu);
- rcu_boost_kthread_setaffinity(rnp, -1);
- break;
- case CPU_DOWN_PREPARE:
- rcu_boost_kthread_setaffinity(rnp, cpu);
- break;
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- for_each_rcu_flavor(rsp)
- rcu_cleanup_dying_cpu(rsp);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp) {
- rcu_cleanup_dead_cpu(cpu, rsp);
- do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
- }
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
static int rcu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -4208,10 +4208,9 @@ void __init rcu_init(void)
* this is called early in boot, before either interrupts
* or the scheduler are operational.
*/
- cpu_notifier(rcu_cpu_notify, 0);
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
- rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ rcutree_prepare_cpu(cpu);
}
#include "tree_exp.h"
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ea0f6f31a244..1934f658c036 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -503,16 +503,6 @@ void account_process_tick(struct task_struct *p, int user_tick)
}
/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
* Account multiple ticks of idle time.
* @ticks: number of stolen ticks
*/
diff --git a/kernel/smp.c b/kernel/smp.c
index 36552beed397..3aa642d39c03 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
-static int
-hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+int smpcfd_prepare_cpu(unsigned int cpu)
{
- long cpu = (long)hcpu;
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
- cpu_to_node(cpu)))
- return notifier_from_errno(-ENOMEM);
- cfd->csd = alloc_percpu(struct call_single_data);
- if (!cfd->csd) {
- free_cpumask_var(cfd->cpumask);
- return notifier_from_errno(-ENOMEM);
- }
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- /* Fall-through to the CPU_DEAD[_FROZEN] case. */
-
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return -ENOMEM;
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
- free_percpu(cfd->csd);
- break;
+ return -ENOMEM;
+ }
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- /*
- * The IPIs for the smp-call-function callbacks queued by other
- * CPUs might arrive late, either due to hardware latencies or
- * because this CPU disabled interrupts (inside stop-machine)
- * before the IPIs were sent. So flush out any pending callbacks
- * explicitly (without waiting for the IPIs to arrive), to
- * ensure that the outgoing CPU doesn't go offline with work
- * still pending.
- */
- flush_smp_call_function_queue(false);
- break;
-#endif
- };
+ return 0;
+}
+
+int smpcfd_dead_cpu(unsigned int cpu)
+{
+ struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
- return NOTIFY_OK;
+ free_cpumask_var(cfd->cpumask);
+ free_percpu(cfd->csd);
+ return 0;
}
-static struct notifier_block hotplug_cfd_notifier = {
- .notifier_call = hotplug_cfd,
-};
+int smpcfd_dying_cpu(unsigned int cpu)
+{
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false);
+ return 0;
+}
void __init call_function_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i)
init_llist_head(&per_cpu(call_single_queue, i));
- hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
- register_cpu_notifier(&hotplug_cfd_notifier);
+ smpcfd_prepare_cpu(smp_processor_id());
}
/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d13c9aebf7a3..9ba7c820fc23 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
/*
* Functions related to boot-time initialization:
*/
-static void init_hrtimers_cpu(int cpu)
+int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-static void migrate_hrtimers(int scpu)
+int hrtimers_dead_cpu(unsigned int scpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
int i;
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
+ return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int hrtimer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- int scpu = (long)hcpu;
-
- switch (action) {
-
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- init_hrtimers_cpu(scpu);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_hrtimers(scpu);
- break;
-#endif
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block hrtimers_nb = {
- .notifier_call = hrtimer_cpu_notify,
-};
-
void __init hrtimers_init(void)
{
- hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- register_cpu_notifier(&hrtimers_nb);
+ hrtimers_prepare_cpu(smp_processor_id());
}
/**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index cb9ab401e2d9..555670a5143c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1804,7 +1804,7 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
}
}
-static void migrate_timers(int cpu)
+int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
struct timer_base *new_base;
@@ -1831,29 +1831,9 @@ static void migrate_timers(int cpu)
spin_unlock_irq(&new_base->lock);
put_cpu_ptr(&timer_bases);
}
+ return 0;
}
-static int timer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- switch (action) {
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_timers((long)hcpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static inline void timer_register_cpu_notifier(void)
-{
- cpu_notifier(timer_cpu_notify, 0);
-}
-#else
-static inline void timer_register_cpu_notifier(void) { }
#endif /* CONFIG_HOTPLUG_CPU */
static void __init init_timer_cpu(int cpu)
@@ -1881,7 +1861,6 @@ void __init init_timers(void)
{
init_timer_cpus();
init_timer_stats();
- timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803bd0..f4b86e8ca1e7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
+ select TRACING
default n
help
Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 26f603da7e26..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *src = (void *) (long) r2;
+ int size = (int) r3;
+
+ /*
+ * Ensure we're in user context which is safe for the helper to
+ * run. This helper has no business in a kthread.
+ *
+ * access_ok() should prevent writing to non-user memory, but in
+ * some situations (nommu, temporary switch, etc) access_ok() does
+ * not provide enough validation, hence the check on KERNEL_DS.
+ */
+
+ if (unlikely(in_interrupt() ||
+ current->flags & (PF_KTHREAD | PF_EXITING)))
+ return -EPERM;
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ return -EPERM;
+ if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+ return -EPERM;
+
+ return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+ .func = bpf_probe_write_user,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+ pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+ current->comm, task_pid_nr(current));
+
+ return &bpf_probe_write_user_proto;
+}
+
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -188,31 +231,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
{
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
+ u64 index = flags & BPF_F_INDEX_MASK;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (index == BPF_F_CURRENT_CPU)
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
- /* make sure event is local and doesn't have pmu::count */
- if (event->oncpu != smp_processor_id() ||
- event->pmu->count)
- return -EINVAL;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
event->attr.type != PERF_TYPE_RAW))
return -EINVAL;
+ /* make sure event is local and doesn't have pmu::count */
+ if (unlikely(event->oncpu != cpu || event->pmu->count))
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
@@ -229,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+ u64 flags, struct perf_raw_record *raw)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
- struct perf_raw_record raw = {
- .size = size,
- .data = data,
- };
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
if (index == BPF_F_CURRENT_CPU)
- index = raw_smp_processor_id();
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
- if (unlikely(event->oncpu != smp_processor_id()))
+ if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = &raw;
+ sample_data.raw = raw;
perf_event_output(event, &sample_data, regs);
return 0;
}
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *)(long) r1;
+ struct bpf_map *map = (struct bpf_map *)(long) r2;
+ void *data = (void *)(long) r4;
+ struct perf_raw_record raw = {
+ .frag = {
+ .size = size,
+ .data = data,
+ },
+ };
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
static const struct bpf_func_proto bpf_perf_event_output_proto = {
.func = bpf_perf_event_output,
.gpl_only = true,
@@ -283,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+ struct perf_raw_frag frag = {
+ .copy = ctx_copy,
+ .size = ctx_size,
+ .data = ctx,
+ };
+ struct perf_raw_record raw = {
+ .frag = {
+ {
+ .next = ctx_size ? &frag : NULL,
+ },
+ .size = meta_size,
+ .data = meta,
+ },
+ };
perf_fetch_caller_regs(regs);
- return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return (long) current;
}
-static const struct bpf_func_proto bpf_event_output_proto = {
- .func = bpf_event_output,
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+ .func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
};
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
-{
- return &bpf_event_output_proto;
-}
-
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -325,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -335,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_probe_write_user:
+ return bpf_get_probe_write_proto();
default:
return NULL;
}
@@ -356,18 +428,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
enum bpf_reg_type *reg_type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
-
- /* only read is allowed */
if (type != BPF_READ)
return false;
-
- /* disallow misaligned access */
if (off % size != 0)
return false;
-
return true;
}
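The three tests above amount to: only naturally aligned reads that land inside struct pt_regs are allowed. A small user-space model of the predicate, with a stand-in value for the structure size:

#include <stdbool.h>
#include <stdio.h>

#define PT_REGS_SIZE 168	/* stand-in for sizeof(struct pt_regs) on x86-64 */

static bool kprobe_access_ok(int off, int size, bool is_read)
{
	if (off < 0 || off >= PT_REGS_SIZE)
		return false;		/* out of bounds */
	if (!is_read)
		return false;		/* ctx is read-only for kprobe programs */
	if (off % size != 0)
		return false;		/* misaligned */
	return true;
}

int main(void)
{
	printf("%d\n", kprobe_access_ok(16, 8, true));	/* 1: aligned read */
	printf("%d\n", kprobe_access_ok(12, 8, true));	/* 0: misaligned */
	printf("%d\n", kprobe_access_ok(16, 8, false));	/* 0: write rejected */
	return 0;
}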
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1efff2..84752c8e28b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-/* List for set_ftrace_pid's pids. */
-LIST_HEAD(ftrace_pids);
-struct ftrace_pid {
- struct list_head list;
- struct pid *pid;
-};
-
-static bool ftrace_pids_enabled(void)
+static bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
- return !list_empty(&ftrace_pids);
+ struct trace_array *tr;
+
+ if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
+ return false;
+
+ tr = ops->private;
+
+ return tr->function_pids != NULL;
}
static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (!test_tsk_trace_trace(current))
+ struct trace_array *tr = op->private;
+
+ if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
return;
op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
- if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+ if (ftrace_pids_enabled(ops))
ops->func = ftrace_pid_func;
ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
static void ftrace_update_pid_func(void)
{
- bool enabled = ftrace_pids_enabled();
struct ftrace_ops *op;
/* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (op->flags & FTRACE_OPS_FL_PID) {
- op->func = enabled ? ftrace_pid_func :
- op->saved_func;
+ op->func = ftrace_pids_enabled(op) ?
+ ftrace_pid_func : op->saved_func;
ftrace_update_trampoline(op);
}
} while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
return ops->func;
}
-static void clear_ftrace_swapper(void)
+static void
+ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
{
- struct task_struct *p;
- int cpu;
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- clear_tsk_trace_trace(p);
- }
- put_online_cpus();
-}
-
-static void set_ftrace_swapper(void)
-{
- struct task_struct *p;
- int cpu;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- set_tsk_trace_trace(p);
- }
- put_online_cpus();
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, next));
}
-static void clear_ftrace_pid(struct pid *pid)
+static void clear_ftrace_pids(struct trace_array *tr)
{
- struct task_struct *p;
+ struct trace_pid_list *pid_list;
+ int cpu;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- clear_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+ if (!pid_list)
+ return;
- put_pid(pid);
-}
+ unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
-static void set_ftrace_pid(struct pid *pid)
-{
- struct task_struct *p;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- set_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
-}
+ rcu_assign_pointer(tr->function_pids, NULL);
-static void clear_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- clear_ftrace_swapper();
- else
- clear_ftrace_pid(pid);
-}
+ /* Wait till all users are no longer using pid filtering */
+ synchronize_sched();
-static void set_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- set_ftrace_swapper();
- else
- set_ftrace_pid(pid);
+ trace_free_pid_list(pid_list);
}
-static int ftrace_pid_add(int p)
+static void ftrace_pid_reset(struct trace_array *tr)
{
- struct pid *pid;
- struct ftrace_pid *fpid;
- int ret = -EINVAL;
-
mutex_lock(&ftrace_lock);
-
- if (!p)
- pid = ftrace_swapper_pid;
- else
- pid = find_get_pid(p);
-
- if (!pid)
- goto out;
-
- ret = 0;
-
- list_for_each_entry(fpid, &ftrace_pids, list)
- if (fpid->pid == pid)
- goto out_put;
-
- ret = -ENOMEM;
-
- fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
- if (!fpid)
- goto out_put;
-
- list_add(&fpid->list, &ftrace_pids);
- fpid->pid = pid;
-
- set_ftrace_pid_task(pid);
+ clear_ftrace_pids(tr);
ftrace_update_pid_func();
-
ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
- return 0;
-
-out_put:
- if (pid != ftrace_swapper_pid)
- put_pid(pid);
-
-out:
- mutex_unlock(&ftrace_lock);
- return ret;
}
-static void ftrace_pid_reset(void)
-{
- struct ftrace_pid *fpid, *safe;
-
- mutex_lock(&ftrace_lock);
- list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
- struct pid *pid = fpid->pid;
-
- clear_ftrace_pid_task(pid);
-
- list_del(&fpid->list);
- kfree(fpid);
- }
-
- ftrace_update_pid_func();
- ftrace_startup_all(0);
-
- mutex_unlock(&ftrace_lock);
-}
+/* Greater than any max PID */
+#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1)
static void *fpid_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
mutex_lock(&ftrace_lock);
+ rcu_read_lock_sched();
- if (!ftrace_pids_enabled() && (!*pos))
- return (void *) 1;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- return seq_list_start(&ftrace_pids, *pos);
+ if (!pid_list)
+ return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+ return trace_pid_start(pid_list, pos);
}
static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
{
- if (v == (void *)1)
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
+
+ if (v == FTRACE_NO_PIDS)
return NULL;
- return seq_list_next(v, &ftrace_pids, pos);
+ return trace_pid_next(pid_list, v, pos);
}
static void fpid_stop(struct seq_file *m, void *p)
+ __releases(RCU)
{
+ rcu_read_unlock_sched();
mutex_unlock(&ftrace_lock);
}
static int fpid_show(struct seq_file *m, void *v)
{
- const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
-
- if (v == (void *)1) {
+ if (v == FTRACE_NO_PIDS) {
seq_puts(m, "no pid\n");
return 0;
}
- if (fpid->pid == ftrace_swapper_pid)
- seq_puts(m, "swapper tasks\n");
- else
- seq_printf(m, "%u\n", pid_vnr(fpid->pid));
-
- return 0;
+ return trace_pid_show(m, v);
}
static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
static int
ftrace_pid_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_pid_reset();
+ ftrace_pid_reset(tr);
- if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &ftrace_pid_sops);
+ ret = seq_open(file, &ftrace_pid_sops);
+ if (ret < 0) {
+ trace_array_put(tr);
+ } else {
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = tr;
+ }
return ret;
}
+static void ignore_task_cpu(void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /*
+ * This function is called by on_each_cpu() while the
+ * ftrace_lock is held.
+ */
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ mutex_is_locked(&ftrace_lock));
+
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, current));
+}
+
static ssize_t
ftrace_pid_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[64], *tmp;
- long val;
- int ret;
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *pid_list;
+ ssize_t ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
+ if (!cnt)
+ return 0;
+
+ mutex_lock(&ftrace_lock);
+
+ filtered_pids = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
+ goto out;
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+ rcu_assign_pointer(tr->function_pids, pid_list);
- buf[cnt] = 0;
+ if (filtered_pids) {
+ synchronize_sched();
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
+ /* Register a probe to set whether to ignore the tracing of a task */
+ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+ }
/*
- * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
- * to clean the filter quietly.
+ * Ignoring of pids is done at task switch. But we have to
+ * check for those tasks that are currently running.
+ * Always do this in case a pid was appended or removed.
*/
- tmp = strstrip(buf);
- if (strlen(tmp) == 0)
- return 1;
+ on_each_cpu(ignore_task_cpu, tr, 1);
- ret = kstrtol(tmp, 10, &val);
- if (ret < 0)
- return ret;
+ ftrace_update_pid_func();
+ ftrace_startup_all(0);
+ out:
+ mutex_unlock(&ftrace_lock);
- ret = ftrace_pid_add(val);
+ if (ret > 0)
+ *ppos += ret;
- return ret ? ret : cnt;
+ return ret;
}
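From user space nothing changes except that the file is now per trace instance; a hedged sketch of driving it (tracefs path assumed, error handling minimal):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Adjust the path if tracefs is mounted elsewhere. */
	const char *path = "/sys/kernel/debug/tracing/set_ftrace_pid";
	char buf[32];
	int fd, len;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Limit function tracing to this process; opening with O_TRUNC
	 * instead clears the filter via ftrace_pid_reset(). */
	len = snprintf(buf, sizeof(buf), "%d\n", getpid());
	if (write(fd, buf, len) != len)
		perror("write");
	close(fd);
	return 0;
}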
static int
ftrace_pid_release(struct inode *inode, struct file *file)
{
- if (file->f_mode & FMODE_READ)
- seq_release(inode, file);
+ struct trace_array *tr = inode->i_private;
- return 0;
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
}
static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_tracefs(void)
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct dentry *d_tracer;
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ tr, &ftrace_pid_fops);
+}
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
+void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ /* Only the top level directory has the dyn_tracefs and profile */
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
ftrace_init_dyn_tracefs(d_tracer);
-
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
- NULL, &ftrace_pid_fops);
-
ftrace_profile_tracefs(d_tracer);
-
- return 0;
}
-fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..dade4c9559cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
return 0;
}
+void trace_free_pid_list(struct trace_pid_list *pid_list)
+{
+ vfree(pid_list->pids);
+ kfree(pid_list);
+}
+
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+ /*
+ * If pid_max changed after filtered_pids was created, we
+ * by default ignore all pids greater than the previous pid_max.
+ */
+ if (search_pid >= filtered_pids->pid_max)
+ return false;
+
+ return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ return !trace_find_filtered_pid(filtered_pids, task->pid);
+}
+
+/**
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task and @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ if (!pid_list)
+ return;
+
+ /* For forks, we only add if the forking task is listed */
+ if (self) {
+ if (!trace_find_filtered_pid(pid_list, self->pid))
+ return;
+ }
+
+ /* Sorry, but we don't support pid_max changing after setting */
+ if (task->pid >= pid_list->pid_max)
+ return;
+
+ /* "self" is set for forks, and NULL for exits */
+ if (self)
+ set_bit(task->pid, pid_list->pids);
+ else
+ clear_bit(task->pid, pid_list->pids);
+}
+
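The fork/exit rules above are easiest to see with concrete calls; a user-space model of the same logic, using a plain bool array in place of the pid bitmap (not the kernel code):

#include <stdbool.h>
#include <stdio.h>

#define PID_MAX 64
static bool pids[PID_MAX];		/* stands in for pid_list->pids */

/* self < 0 models the "exit" case where the kernel passes NULL */
static void add_remove(int self, int task)
{
	if (self >= 0 && !pids[self])
		return;			/* fork: parent not filtered, ignore child */
	if (task >= PID_MAX)
		return;
	pids[task] = (self >= 0);	/* set on fork, clear on exit */
}

int main(void)
{
	pids[10] = true;		/* the user wrote "10" into the filter */

	add_remove(10, 11);		/* fork from 10: child 11 inherited */
	add_remove(20, 21);		/* fork from 20: not filtered, ignored */
	add_remove(-1, 10);		/* exit of 10: removed */

	printf("10=%d 11=%d 21=%d\n", pids[10], pids[11], pids[21]);	/* 0 1 0 */
	return 0;
}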
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (one more than the actual pid, so zero can be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+ unsigned long pid = (unsigned long)v;
+
+ (*pos)++;
+
+ /* pid already is +1 of the actual previous bit */
+ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+ /* Return pid + 1 to allow zero to be represented */
+ if (pid < pid_list->pid_max)
+ return (void *)(pid + 1);
+
+ return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+ unsigned long pid;
+ loff_t l = 0;
+
+ pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+ if (pid >= pid_list->pid_max)
+ return NULL;
+
+ /* Return pid + 1 so that zero can be the exit value */
+ for (pid++; pid && l < *pos;
+ pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+ ;
+ return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+ unsigned long pid = (unsigned long)v - 1;
+
+ seq_printf(m, "%lu\n", pid);
+ return 0;
+}
+
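The +1 in these helpers exists only because seq_file stops iterating on NULL, which would make pid 0 (the swapper) undisplayable. A stand-alone round trip of the encoding:

#include <stdio.h>

int main(void)
{
	unsigned long pid = 0;			/* the swapper task */
	void *cursor = (void *)(pid + 1);	/* what ->start()/->next() hand out */

	/* NULL means "end of iteration", so pid 0 travels as 1 ... */
	if (cursor)
		printf("%lu\n", (unsigned long)cursor - 1);	/* ... and ->show() decodes it */
	return 0;
}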
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE 127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_parser parser;
+ unsigned long val;
+ int nr_pids = 0;
+ ssize_t read = 0;
+ ssize_t ret = 0;
+ loff_t pos;
+ pid_t pid;
+
+ if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ /*
+ * Always recreate a new array. The write is an all or nothing
+ * operation. Always create a new array when adding new pids by
+ * the user. If the operation fails, then the current list is
+ * not modified.
+ */
+ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return -ENOMEM;
+
+ pid_list->pid_max = READ_ONCE(pid_max);
+
+ /* Only truncating will shrink pid_max */
+ if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+ pid_list->pid_max = filtered_pids->pid_max;
+
+ pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+ if (!pid_list->pids) {
+ kfree(pid_list);
+ return -ENOMEM;
+ }
+
+ if (filtered_pids) {
+ /* copy the current bits to the new max */
+ for_each_set_bit(pid, filtered_pids->pids,
+ filtered_pids->pid_max) {
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+ }
+ }
+
+ while (cnt > 0) {
+
+ pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &pos);
+ if (ret < 0 || !trace_parser_loaded(&parser))
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+ if (val >= pid_list->pid_max)
+ break;
+
+ pid = (pid_t)val;
+
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ trace_free_pid_list(pid_list);
+ return ret;
+ }
+
+ if (!nr_pids) {
+ /* Cleared the list of pids */
+ trace_free_pid_list(pid_list);
+ read = ret;
+ pid_list = NULL;
+ }
+
+ *new_pid_list = pid_list;
+
+ return read;
+}
+
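The pid list is a straight bitmap with one bit per possible pid, so the vzalloc() size of (pid_max + 7) >> 3 is just a round-up to whole bytes; for the default pid_max of 32768 that is a single 4 KiB allocation:

#include <stdio.h>

int main(void)
{
	unsigned long pid_max = 32768;			/* default kernel.pid_max */
	unsigned long bytes = (pid_max + 7) >> 3;	/* one bit per pid, byte-rounded */

	printf("%lu pids -> %lu bytes\n", pid_max, bytes);	/* 32768 -> 4096 */
	return 0;
}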
static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
+ /*
+ * If regs is not set, then skip the following callers:
+ * trace_buffer_unlock_commit_regs
+ * event_trigger_unlock_commit
+ * trace_event_buffer_commit
+ * trace_event_raw_event_sched_switch
+ * Note, we can still get here via blktrace, wakeup tracer
+ * and mmiotrace, but that's ok if they lose a function or
+ * two. They are not that meaningful.
+ */
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
+ * Add two, for this function and the call to save_stack_trace()
+ * If regs is set, then these functions will not be in the way.
+ */
+ if (!regs)
+ trace.skip += 2;
+
+ /*
* Since events can happen in NMIs there's no safe way to
* use the per cpu ftrace_stacks. We reserve it and if an interrupt
* or NMI comes in, it will just have to use the default
@@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
/* created for use with alloc_percpu */
struct trace_buffer_struct {
- char buffer[TRACE_BUF_SIZE];
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
};
static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
/*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
*/
static char *get_trace_buf(void)
{
- struct trace_buffer_struct *percpu_buffer;
-
- /*
- * If we have allocated per cpu buffers, then we do not
- * need to do any locking.
- */
- if (in_nmi())
- percpu_buffer = trace_percpu_nmi_buffer;
- else if (in_irq())
- percpu_buffer = trace_percpu_irq_buffer;
- else if (in_softirq())
- percpu_buffer = trace_percpu_sirq_buffer;
- else
- percpu_buffer = trace_percpu_buffer;
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!percpu_buffer)
+ if (!buffer || buffer->nesting >= 4)
return NULL;
- return this_cpu_ptr(&percpu_buffer->buffer[0]);
+ return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+ this_cpu_dec(trace_percpu_buffer->nesting);
}
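Instead of one buffer per context type, each CPU now carries a four-deep stack of buffers indexed by a nesting counter, so a trace_printk() that interrupts another one on the same CPU simply takes the next slot. A user-space model of the get/put pairing (one fake CPU, no preemption to worry about):

#include <stdio.h>
#include <string.h>

#define TRACE_BUF_SIZE 128	/* stand-in; the kernel buffer is larger */

static struct {
	int nesting;
	char buffer[4][TRACE_BUF_SIZE];
} percpu_buf;			/* stands in for this_cpu_ptr(trace_percpu_buffer) */

static char *get_trace_buf(void)
{
	if (percpu_buf.nesting >= 4)	/* task -> softirq -> irq -> NMI is the worst case */
		return NULL;
	return percpu_buf.buffer[percpu_buf.nesting++];
}

static void put_trace_buf(void)
{
	percpu_buf.nesting--;
}

int main(void)
{
	char *outer = get_trace_buf();	/* normal context */
	char *inner = get_trace_buf();	/* e.g. an interrupt on top of it */

	strcpy(inner, "irq");		/* does not clobber the outer buffer */
	strcpy(outer, "task");
	printf("%s %s\n", outer, inner);

	put_trace_buf();		/* unwind in LIFO order */
	put_trace_buf();
	return 0;
}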
static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
- struct trace_buffer_struct *sirq_buffers;
- struct trace_buffer_struct *irq_buffers;
- struct trace_buffer_struct *nmi_buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
- if (!buffers)
- goto err_warn;
-
- sirq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!sirq_buffers)
- goto err_sirq;
-
- irq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!irq_buffers)
- goto err_irq;
-
- nmi_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!nmi_buffers)
- goto err_nmi;
+ if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;
trace_percpu_buffer = buffers;
- trace_percpu_sirq_buffer = sirq_buffers;
- trace_percpu_irq_buffer = irq_buffers;
- trace_percpu_nmi_buffer = nmi_buffers;
-
return 0;
-
- err_nmi:
- free_percpu(irq_buffers);
- err_irq:
- free_percpu(sirq_buffers);
- err_sirq:
- free_percpu(buffers);
- err_warn:
- WARN(1, "Could not allocate percpu trace_printk buffer");
- return -ENOMEM;
}
static int buffers_allocated;
@@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}
out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
- out:
+
+out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
for_each_tracing_cpu(cpu)
tracing_init_tracefs_percpu(tr, cpu);
+ ftrace_init_tracefs(tr, d_tracer);
}
static struct vfsmount *trace_automount(void *ingore)
@@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void)
return 0;
init_tracer_tracefs(&global_trace, d_tracer);
+ ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d6b7..f783df416726 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
+ filter) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter) __packed
+
#include "trace_entries.h"
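FTRACE_ENTRY_PACKED only exists so __packed can be applied to the two function-graph entries, which otherwise carry tail padding on 64-bit. A stand-alone sketch of the effect; the fields mirror the shape of trace_entry plus ftrace_graph_ent, but this is not the kernel definition:

#include <stdio.h>

struct graph_ent_padded {
	unsigned short	type;		/* trace_entry header ... */
	unsigned char	flags;
	unsigned char	preempt_count;
	int		pid;
	unsigned long	func;		/* ... ftrace_graph_ent payload */
	int		depth;		/* followed by 4 bytes of tail padding */
};

struct graph_ent_packed {
	unsigned short	type;
	unsigned char	flags;
	unsigned char	preempt_count;
	int		pid;
	unsigned long	func;
	int		depth;
} __attribute__((packed));

int main(void)
{
	/* Typically prints 24 vs 20 on 64-bit: the tail padding is what
	 * __packed drops, saving four bytes per event in the ring buffer. */
	printf("padded=%zu packed=%zu\n",
	       sizeof(struct graph_ent_padded), sizeof(struct graph_ent_packed));
	return 0;
}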
/*
@@ -156,6 +162,9 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
bool ignore_pid;
+#ifdef CONFIG_FUNCTION_TRACER
+ bool ftrace_ignore_pid;
+#endif
};
struct tracer;
@@ -247,6 +256,7 @@ struct trace_array {
int ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
+ struct trace_pid_list __rcu *function_pids;
/* function tracing enabled */
int function_enabled;
#endif
@@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
+/* PID filtering */
+
+extern int pid_max;
+
+bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
+ pid_t search_pid);
+bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct task_struct *task);
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task);
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
+int trace_pid_show(struct seq_file *m, void *v);
+void trace_free_pid_list(struct trace_pid_list *pid_list);
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt);
+
#ifdef CONFIG_TRACER_MAX_TRACE
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
@@ -821,12 +850,9 @@ extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
- if (list_empty(&ftrace_pids))
- return 1;
-
- return test_tsk_trace_trace(task);
+ return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
@@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
int using_ftrace_ops_list_func(void);
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+void ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer);
#else
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
return 1;
}
@@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
 /* ftrace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
@@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
#include "trace_entries.h"
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..5c30efcda5e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
);
/* Function return entry */
-FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892a1e..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/sort.h>
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
+ /*
+ * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, we need to subtract one to offset the increment.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ fbuffer->pc--;
fbuffer->trace_file = trace_file;
fbuffer->event =
@@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr)
mutex_unlock(&event_mutex);
}
-/* Shouldn't this be in a header? */
-extern int pid_max;
-
-/* Returns true if found in filter */
-static bool
-find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
- /*
- * If pid_max changed after filtered_pids was created, we
- * by default ignore all pids greater than the previous pid_max.
- */
- if (search_pid >= filtered_pids->pid_max)
- return false;
-
- return test_bit(search_pid, filtered_pids->pids);
-}
-
-static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
-{
- /*
- * Return false, because if filtered_pids does not exist,
- * all pids are good to trace.
- */
- if (!filtered_pids)
- return false;
-
- return !find_filtered_pid(filtered_pids, task->pid);
-}
-
-static void filter_add_remove_task(struct trace_pid_list *pid_list,
- struct task_struct *self,
- struct task_struct *task)
-{
- if (!pid_list)
- return;
-
- /* For forks, we only add if the forking task is listed */
- if (self) {
- if (!find_filtered_pid(pid_list, self->pid))
- return;
- }
-
- /* Sorry, but we don't support pid_max changing after setting */
- if (task->pid >= pid_list->pid_max)
- return;
-
- /* "self" is set for forks, and NULL for exits */
- if (self)
- set_bit(task->pid, pid_list->pids);
- else
- clear_bit(task->pid, pid_list->pids);
-}
-
static void
event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
{
@@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
struct trace_array *tr = data;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- filter_add_remove_task(pid_list, NULL, task);
+ trace_filter_add_remove_task(pid_list, NULL, task);
}
static void
@@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data,
struct trace_array *tr = data;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- filter_add_remove_task(pid_list, self, task);
+ trace_filter_add_remove_task(pid_list, self, task);
}
void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, prev) &&
- ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, prev) &&
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, task));
+ trace_ignore_this_task(pid_list, task));
}
static void
@@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
/* Set tracing if current is enabled */
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
/* Wait till all users are no longer using pid filtering */
synchronize_sched();
- vfree(pid_list->pids);
- kfree(pid_list);
+ trace_free_pid_list(pid_list);
}
static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
{
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
- unsigned long pid = (unsigned long)v;
-
- (*pos)++;
-
- /* pid already is +1 of the actual prevous bit */
- pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
- /* Return pid + 1 to allow zero to be represented */
- if (pid < pid_list->pid_max)
- return (void *)(pid + 1);
-
- return NULL;
+ return trace_pid_next(pid_list, v, pos);
}
static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
{
struct trace_pid_list *pid_list;
struct trace_array *tr = m->private;
- unsigned long pid;
- loff_t l = 0;
/*
* Grab the mutex, to keep calls to p_next() having the same
@@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
if (!pid_list)
return NULL;
- pid = find_first_bit(pid_list->pids, pid_list->pid_max);
- if (pid >= pid_list->pid_max)
- return NULL;
-
- /* Return pid + 1 so that zero can be the exit value */
- for (pid++; pid && l < *pos;
- pid = (unsigned long)p_next(m, (void *)pid, &l))
- ;
- return (void *)pid;
+ return trace_pid_start(pid_list, pos);
}
static void p_stop(struct seq_file *m, void *p)
@@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
-static int p_show(struct seq_file *m, void *v)
-{
- unsigned long pid = (unsigned long)v - 1;
-
- seq_printf(m, "%lu\n", pid);
- return 0;
-}
-
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data)
mutex_is_locked(&event_mutex));
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static ssize_t
@@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
struct trace_pid_list *filtered_pids = NULL;
struct trace_pid_list *pid_list;
struct trace_event_file *file;
- struct trace_parser parser;
- unsigned long val;
- loff_t this_pos;
- ssize_t read = 0;
- ssize_t ret = 0;
- pid_t pid;
- int nr_pids = 0;
+ ssize_t ret;
if (!cnt)
return 0;
@@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
- return -ENOMEM;
-
mutex_lock(&event_mutex);
+
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
- /*
- * Always recreate a new array. The write is an all or nothing
- * operation. Always create a new array when adding new pids by
- * the user. If the operation fails, then the current list is
- * not modified.
- */
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list) {
- read = -ENOMEM;
- goto out;
- }
- pid_list->pid_max = READ_ONCE(pid_max);
- /* Only truncating will shrink pid_max */
- if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
- pid_list->pid_max = filtered_pids->pid_max;
- pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
- if (!pid_list->pids) {
- kfree(pid_list);
- read = -ENOMEM;
- goto out;
- }
- if (filtered_pids) {
- /* copy the current bits to the new max */
- pid = find_first_bit(filtered_pids->pids,
- filtered_pids->pid_max);
- while (pid < filtered_pids->pid_max) {
- set_bit(pid, pid_list->pids);
- pid = find_next_bit(filtered_pids->pids,
- filtered_pids->pid_max,
- pid + 1);
- nr_pids++;
- }
- }
-
- while (cnt > 0) {
-
- this_pos = 0;
-
- ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
- break;
-
- read += ret;
- ubuf += ret;
- cnt -= ret;
-
- parser.buffer[parser.idx] = 0;
-
- ret = -EINVAL;
- if (kstrtoul(parser.buffer, 0, &val))
- break;
- if (val >= pid_list->pid_max)
- break;
-
- pid = (pid_t)val;
-
- set_bit(pid, pid_list->pids);
- nr_pids++;
-
- trace_parser_clear(&parser);
- ret = 0;
- }
- trace_parser_put(&parser);
-
- if (ret < 0) {
- vfree(pid_list->pids);
- kfree(pid_list);
- read = ret;
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
goto out;
- }
- if (!nr_pids) {
- /* Cleared the list of pids */
- vfree(pid_list->pids);
- kfree(pid_list);
- read = ret;
- if (!filtered_pids)
- goto out;
- pid_list = NULL;
- }
rcu_assign_pointer(tr->filtered_pids, pid_list);
list_for_each_entry(file, &tr->events, list) {
@@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
synchronize_sched();
-
- vfree(filtered_pids->pids);
- kfree(filtered_pids);
- } else {
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
/*
* Register a probe that is called before all other probes
* to set ignore_pid if next or prev do not match.
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
out:
mutex_unlock(&event_mutex);
- ret = read;
- if (read > 0)
- *ppos += read;
+ if (ret > 0)
+ *ppos += ret;
return ret;
}
@@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {
static const struct seq_operations show_set_pid_seq_ops = {
.start = p_start,
.next = p_next,
- .show = p_show,
+ .show = trace_pid_show,
.stop = p_stop,
};
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
 /* Currently only the non stack version is supported */
ops->func = function_trace_call;
- ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
tr->ops = ops;
ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..7363ccf79512 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int cpu;
int pc;
- if (!ftrace_trace_task(current))
+ if (!ftrace_trace_task(tr))
return 0;
/* trace it when it is-nested-in or is a function enabled. */
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
+ /*
+ * Stop here if tracing_threshold is set. We only write function return
+ * events to the ring buffer.
+ */
+ if (tracing_thresh)
+ return 1;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
-{
- if (tracing_thresh)
- return 1;
- else
- return trace_graph_entry(trace);
-}
-
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
set_graph_array(tr);
if (tracing_thresh)
ret = register_ftrace_graph(&trace_graph_thresh_return,
- &trace_graph_thresh_entry);
+ &trace_graph_entry);
else
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..9aedb0b06683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
* $retval : fetch return value
* $stack : fetch stack address
* $stackN : fetch Nth of stack (N:0-)
+ * $comm : fetch current task comm
* @ADDR : fetch memory at ADDR (ADDR should be in kernel)
* @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
* %REG : fetch register REG
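A hedged user-space sketch of using the new $comm fetch argument; the probe point and tracefs path are illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *ctl = "/sys/kernel/debug/tracing/kprobe_events";
	/* $comm defaults to the "string" type, so no explicit :string is needed */
	const char *def = "p:myopen do_sys_open comm=$comm\n";
	int fd = open(ctl, O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write");
	close(fd);

	/* enable with: echo 1 > events/kprobes/myopen/enable */
	return 0;
}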
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
dev->bus->number, dev->devfn,
dev->vendor, dev->device, dev->irq);
- /*
- * XXX: is pci_resource_to_user() appropriate, since we are
- * supposed to interpret the __ioremap() phys_addr argument based on
- * these printed values?
- */
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
+ end = dev->resource[i].end;
trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..74e80a582c28 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
kfree(data);
}
+void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ long ret;
+
+ if (!maxlen)
+ return;
+
+ ret = strlcpy(dst, current->comm, maxlen);
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
+
+void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ *(u32 *)dest = strlen(current->comm) + 1;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
+
static const struct fetch_type *find_fetch_type(const char *type,
const struct fetch_type *ftbl)
{
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
ret = -EINVAL;
+ } else if (strcmp(arg, "comm") == 0) {
+ if (strcmp(t->name, "string") != 0 &&
+ strcmp(t->name, "string_size") != 0)
+ return -EINVAL;
+ f->fn = t->fetch[FETCH_MTD_comm];
} else
ret = -EINVAL;
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
arg[t - parg->comm] = '\0';
t++;
}
+ /*
+ * The default type of $comm should be "string", and it can't be
+ * dereferenced.
+ */
+ if (!t && strcmp(arg, "$comm") == 0)
+ t = "string";
parg->type = find_fetch_type(t, ftbl);
if (!parg->type) {
pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..45400ca5ded1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
FETCH_MTD_reg = 0,
FETCH_MTD_stack,
FETCH_MTD_retval,
+ FETCH_MTD_comm,
FETCH_MTD_memory,
FETCH_MTD_symbol,
FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
+/* comm only makes sense as a string */
+#define fetch_comm_u8 NULL
+#define fetch_comm_u16 NULL
+#define fetch_comm_u32 NULL
+#define fetch_comm_u64 NULL
+DECLARE_FETCH_FUNC(comm, string);
+DECLARE_FETCH_FUNC(comm, string_size);
+
/*
* Define macro for basic types - we don't need to define s* types, because
* we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
ASSIGN_FETCH_FUNC(reg, ftype), \
ASSIGN_FETCH_FUNC(stack, ftype), \
ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(comm, ftype), \
ASSIGN_FETCH_FUNC(memory, ftype), \
ASSIGN_FETCH_FUNC(symbol, ftype), \
ASSIGN_FETCH_FUNC(deref, ftype), \
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d12bd958077e..ef071ca73fc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4607,84 +4607,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int workqueue_cpu_up_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+int workqueue_prepare_cpu(unsigned int cpu)
+{
+ struct worker_pool *pool;
+
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (pool->nr_workers)
+ continue;
+ if (!create_worker(pool))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int workqueue_online_cpu(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct worker_pool *pool;
struct workqueue_struct *wq;
int pi;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- for_each_cpu_worker_pool(pool, cpu) {
- if (pool->nr_workers)
- continue;
- if (!create_worker(pool))
- return NOTIFY_BAD;
- }
- break;
-
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- mutex_lock(&wq_pool_mutex);
+ mutex_lock(&wq_pool_mutex);
- for_each_pool(pool, pi) {
- mutex_lock(&pool->attach_mutex);
+ for_each_pool(pool, pi) {
+ mutex_lock(&pool->attach_mutex);
- if (pool->cpu == cpu)
- rebind_workers(pool);
- else if (pool->cpu < 0)
- restore_unbound_workers_cpumask(pool, cpu);
+ if (pool->cpu == cpu)
+ rebind_workers(pool);
+ else if (pool->cpu < 0)
+ restore_unbound_workers_cpumask(pool, cpu);
- mutex_unlock(&pool->attach_mutex);
- }
+ mutex_unlock(&pool->attach_mutex);
+ }
- /* update NUMA affinity of unbound workqueues */
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, true);
+ /* update NUMA affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, true);
- mutex_unlock(&wq_pool_mutex);
- break;
- }
- return NOTIFY_OK;
+ mutex_unlock(&wq_pool_mutex);
+ return 0;
}
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int workqueue_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+int workqueue_offline_cpu(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct work_struct unbind_work;
struct workqueue_struct *wq;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- /* unbinding per-cpu workers should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
- queue_work_on(cpu, system_highpri_wq, &unbind_work);
-
- /* update NUMA affinity of unbound workqueues */
- mutex_lock(&wq_pool_mutex);
- list_for_each_entry(wq, &workqueues, list)
- wq_update_unbound_numa(wq, cpu, false);
- mutex_unlock(&wq_pool_mutex);
-
- /* wait for per-cpu unbinding to finish */
- flush_work(&unbind_work);
- destroy_work_on_stack(&unbind_work);
- break;
- }
- return NOTIFY_OK;
+ /* unbinding per-cpu workers should happen on the local CPU */
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+ queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+ /* update NUMA affinity of unbound workqueues */
+ mutex_lock(&wq_pool_mutex);
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, false);
+ mutex_unlock(&wq_pool_mutex);
+
+ /* wait for per-cpu unbinding to finish */
+ flush_work(&unbind_work);
+ destroy_work_on_stack(&unbind_work);
+ return 0;
}
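These three functions follow the cpu-hotplug state machine contract (plain int return, 0 or -errno) rather than the old notifier protocol. In this series they are wired up from the hotplug core itself; for comparison, a hedged sketch of how an ordinary module would register an equivalent online/offline pair dynamically (names are illustrative):

#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state my_state;

static int my_online(unsigned int cpu)
{
	pr_info("cpu %u coming online\n", cpu);
	return 0;			/* 0 or -errno, no NOTIFY_* values */
}

static int my_offline(unsigned int cpu)
{
	pr_info("cpu %u going down\n", cpu);
	return 0;
}

static int __init my_init(void)
{
	int ret;

	/* CPUHP_AP_ONLINE_DYN grabs a free dynamic slot in the online section */
	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mydrv:online",
					my_online, my_offline);
	if (ret < 0)
		return ret;
	my_state = ret;
	return 0;
}

static void __exit my_exit(void)
{
	cpuhp_remove_state_nocalls(my_state);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");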
#ifdef CONFIG_SMP
@@ -5486,9 +5467,6 @@ static int __init init_workqueues(void)
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
- hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
-
wq_numa_init();
/* initialize CPU pools */