diff options
Diffstat (limited to 'kernel')
83 files changed, 3062 insertions, 1732 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 76d5a794e426..633a650d7aeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ -static int fd_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; @@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - new_ptr = map->ops->map_fd_get_ptr(map, ufd); + new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); @@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) } } -static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *prog_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) return prog; @@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + return prog; } static void prog_fd_array_put_ptr(void *ptr) { - struct bpf_prog *prog = ptr; - - bpf_prog_put_rcu(prog); + bpf_prog_put(ptr); } /* decrement refcnt of all bpf_progs that are stored in this map */ @@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, @@ -425,59 +425,105 @@ static int __init register_prog_array_map(void) } late_initcall(register_prog_array_map); -static void perf_event_array_map_free(struct bpf_map *map) +static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, + struct file *map_file) { - bpf_fd_array_map_clear(map); - fd_array_map_free(map); + struct bpf_event_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_ATOMIC); + if (ee) { + ee->event = perf_file->private_data; + ee->perf_file = perf_file; + ee->map_file = map_file; + } + + return ee; } -static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +static void __bpf_event_entry_free(struct rcu_head *rcu) { - struct perf_event *event; - const struct perf_event_attr *attr; - struct file *file; + struct bpf_event_entry *ee; - file = perf_event_get(fd); - if (IS_ERR(file)) - return file; + ee = container_of(rcu, struct bpf_event_entry, rcu); + fput(ee->perf_file); + kfree(ee); +} - event = file->private_data; +static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) +{ + call_rcu(&ee->rcu, __bpf_event_entry_free); +} - attr = perf_event_attrs(event); - if (IS_ERR(attr)) - goto err; +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) +{ + const struct perf_event_attr *attr; + struct bpf_event_entry *ee; + struct perf_event *event; + struct file *perf_file; - if (attr->inherit) - goto err; + perf_file = perf_event_get(fd); + if (IS_ERR(perf_file)) + return perf_file; - if (attr->type == PERF_TYPE_RAW) - return file; + event = perf_file->private_data; + ee = ERR_PTR(-EINVAL); - if (attr->type == PERF_TYPE_HARDWARE) - return file; + attr = perf_event_attrs(event); + if (IS_ERR(attr) || attr->inherit) + goto err_out; + + switch (attr->type) { + case PERF_TYPE_SOFTWARE: + if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) + goto err_out; + /* fall-through */ + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); + /* fall-through */ + default: + break; + } - if (attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return file; -err: - fput(file); - return ERR_PTR(-EINVAL); +err_out: + fput(perf_file); + return ee; } static void perf_event_fd_array_put_ptr(void *ptr) { - fput((struct file *)ptr); + bpf_event_entry_free_rcu(ptr); +} + +static void perf_event_fd_array_release(struct bpf_map *map, + struct file *map_file) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; + int i; + + rcu_read_lock(); + for (i = 0; i < array->map.max_entries; i++) { + ee = READ_ONCE(array->ptrs[i]); + if (ee && ee->map_file == map_file) + fd_array_map_delete_elem(map, &i); + } + rcu_read_unlock(); } static const struct bpf_map_ops perf_event_array_ops = { .map_alloc = fd_array_map_alloc, - .map_free = perf_event_array_map_free, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, + .map_release = perf_event_fd_array_release, }; static struct bpf_map_type_list perf_event_array_type __read_mostly = { @@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void) return 0; } late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int fd) +{ + return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ + /* cgroup_put free cgrp after a rcu grace period */ + cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = cgroup_fd_array_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = cgroup_fd_array_get_ptr, + .map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { + .ops = &cgroup_array_ops, + .type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ + bpf_register_map_type(&cgroup_array_type); + return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b94a36550591..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -719,14 +719,13 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); - if (unlikely(!prog)) + if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from @@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) return NULL; } -const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - return NULL; + return -ENOTSUPP; } /* Always built-in helper functions. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ad7a0573f71b..1ea3afba1a4f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { - return raw_smp_processor_id(); + return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 318858edb1cd..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -11,7 +11,7 @@ * version 2 as published by the Free Software Foundation. */ -#include <linux/module.h> +#include <linux/init.h> #include <linux/magic.h> #include <linux/major.h> #include <linux/mount.h> @@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = { .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("bpf"); - static int __init bpf_init(void) { int ret; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 080a2dfb5800..bf4495fcd25d 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (err) goto free_smap; - err = get_callchain_buffers(); + err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 46ecce4b79ed..228f962447a5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map) static int bpf_map_release(struct inode *inode, struct file *filp) { - bpf_map_put_with_uref(filp->private_data); + struct bpf_map *map = filp->private_data; + + if (map->ops->map_release) + map->ops->map_release(map, filp); + + bpf_map_put_with_uref(map); return 0; } @@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || + map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } -static void __prog_put_common(struct rcu_head *rcu) +static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -/* version of bpf_prog_put() that is called after a grace period */ -void bpf_prog_put_rcu(struct bpf_prog *prog) -{ - if (atomic_dec_and_test(&prog->aux->refcnt)) - call_rcu(&prog->aux->rcu, __prog_put_common); -} - void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) - __prog_put_common(&prog->aux->rcu); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); return 0; } @@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *__bpf_prog_get(struct fd f) +static struct bpf_prog *____bpf_prog_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); @@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&prog->aux->refcnt); + if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_sub(i, &prog->aux->refcnt); return ERR_PTR(-EBUSY); } return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_add); -/* called by sockets/tracing/seccomp before attaching program to an event - * pairs with bpf_prog_put() - */ -struct bpf_prog *bpf_prog_get(u32 ufd) +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = __bpf_prog_get(f); + prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; + if (type && prog->type != *type) { + prog = ERR_PTR(-EINVAL); + goto out; + } prog = bpf_prog_inc(prog); +out: fdput(f); - return prog; } -EXPORT_SYMBOL_GPL(bpf_prog_get); + +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + return __bpf_prog_get(ufd, NULL); +} + +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +{ + return __bpf_prog_get(ufd, &type); +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD kern_version diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eec9f90ba030..f72f23b8fdab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff +static bool may_write_pkt_data(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_XDP: + return true; + default: + return false; + } +} + static int check_packet_access(struct verifier_env *env, u32 regno, int off, int size) { @@ -713,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, switch (env->prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: break; default: verbose("verifier is misconfigured\n"); @@ -805,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE) { + if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { verbose("cannot write into packet\n"); return -EACCES; } + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into packet\n", value_regno); + return -EACCES; + } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -1035,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_get_stackid) goto error; break; + case BPF_MAP_TYPE_CGROUP_ARRAY: + if (func_id != BPF_FUNC_skb_in_cgroup) + goto error; + break; default: break; } @@ -1054,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_skb_in_cgroup: + if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) + goto error; + break; default: break; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..9624db80dc4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -61,7 +61,7 @@ #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> -#include <linux/proc_ns.h> +#include <linux/file.h> #include <net/sock.h> /* @@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarchy ID should already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - idr_destroy(&root->cgroup_idr); kfree(root); } @@ -5146,6 +5140,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); + if (!css) + css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; @@ -6172,7 +6168,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return id > 0 ? idr_find(&ss->css_idr, id) : NULL; + return idr_find(&ss->css_idr, id); } /** @@ -6209,6 +6205,40 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 303097b37429..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -49,6 +49,12 @@ struct pids_cgroup { */ atomic64_t counter; int64_t limit; + + /* Handle for "pids.events" */ + struct cgroup_file events_file; + + /* Number of times fork failed because limit was hit. */ + atomic64_t events_limit; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; + int err; css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - return pids_try_charge(pids, 1); + err = pids_try_charge(pids, 1); + if (err) { + /* Only log the first time events_limit is incremented. */ + if (atomic64_inc_return(&pids->events_limit) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont("\n"); + } + cgroup_file_notify(&pids->events_file); + } + return err; } static void pids_cancel_fork(struct task_struct *task) @@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, return atomic64_read(&pids->counter); } +static int pids_events_show(struct seq_file *sf, void *v) +{ + struct pids_cgroup *pids = css_pids(seq_css(sf)); + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); + return 0; +} + static struct cftype pids_files[] = { { .name = "max", @@ -300,6 +326,12 @@ static struct cftype pids_files[] = { .read_s64 = pids_current_read, .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return; - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return; - task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -104,7 +104,7 @@ fail: return -ENOMEM; } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack) { int err = 0; int count; @@ -121,6 +121,15 @@ int get_callchain_buffers(void) /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) + err = -EOVERFLOW; goto exit; } @@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index f3ef1c29a7c9..356a6c7cb52a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -335,6 +335,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -3686,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static bool is_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return false; + + if (event->attach_state & PERF_ATTACH_TASK) + return false; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + return true; + return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3749,6 +3790,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -5574,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5708,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -5875,11 +5942,11 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, void *data, bool all) { struct perf_event *event; @@ -5896,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } -static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { - rcu_read_lock(); - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data, false); - preempt_enable(); - rcu_read_unlock(); + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + output(event, data); + } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */ static void -perf_event_aux(perf_event_aux_output_cb output, void *data, +perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; + rcu_read_lock(); + preempt_disable(); + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; + perf_iterate_ctx(task_ctx, output, data, false); + goto done; } - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data, false); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; + perf_iterate_sb_cpu(output, data); + + for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data, false); -next: - put_cpu_ptr(pmu->pmu_cpu_context); + perf_iterate_ctx(ctx, output, data, false); } +done: + preempt_enable(); rcu_read_unlock(); } @@ -5990,7 +6060,7 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctxn); - perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); } rcu_read_unlock(); @@ -6034,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info) }; rcu_read_lock(); - perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) - perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); @@ -6165,7 +6235,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -6244,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6475,7 +6545,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -6558,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) if (!ctx) continue; - perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); } rcu_read_unlock(); } @@ -6745,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -7352,7 +7422,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7408,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; perf_sample_data_init(&data, 0, 0); @@ -7550,7 +7622,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); } } @@ -8667,6 +8739,28 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ +static void account_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -8747,6 +8841,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -8895,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } @@ -9217,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -9290,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } @@ -10252,6 +10351,9 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); } } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ { \ unsigned long size, written; \ \ do { \ size = min(handle->size, len); \ - written = memcpy_func(handle->addr, buf, size); \ + written = memcpy_func(__VA_ARGS__); \ written = size - written; \ \ len -= written; \ handle->addr += written; \ - buf += written; \ + if (advance_buf) \ + buf += written; \ handle->size -= written; \ if (!handle->size) { \ struct ring_buffer *rb = handle->rb; \ @@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, + const void *buf, unsigned long len) +{ + unsigned long orig_len = len; + __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, + orig_len - len, size) +} + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..84ae830234f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -211,6 +211,82 @@ repeat: } /* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ + struct sighand_struct *sighand; + struct task_struct *task; + + /* + * We need to verify that release_task() was not called and thus + * delayed_put_task_struct() can't run and drop the last reference + * before rcu_read_unlock(). We check task->sighand != NULL, + * but we can read the already freed and reused memory. + */ +retry: + task = rcu_dereference(*ptask); + if (!task) + return NULL; + + probe_kernel_address(&task->sighand, sighand); + + /* + * Pairs with atomic_dec_and_test() in put_task_struct(). If this task + * was already freed we can not miss the preceding update of this + * pointer. + */ + smp_rmb(); + if (unlikely(task != READ_ONCE(*ptask))) + goto retry; + + /* + * We've re-checked that "task == *ptask", now we have two different + * cases: + * + * 1. This is actually the same task/task_struct. In this case + * sighand != NULL tells us it is still alive. + * + * 2. This is another task which got the same memory for task_struct. + * We can't know this of course, and we can not trust + * sighand != NULL. + * + * In this case we actually return a random value, but this is + * correct. + * + * If we return NULL - we can pretend that we actually noticed that + * *ptask was updated when the previous task has exited. Or pretend + * that probe_slab_address(&sighand) reads NULL. + * + * If we return the new task (because sighand is not NULL for any + * reason) - this is fine too. This (new) task can't go away before + * another gp pass. + * + * And note: We could even eliminate the false positive if re-read + * task->sighand once again to avoid the falsely NULL. But this case + * is very unlikely so we don't care. + */ + if (!sighand) + return NULL; + + return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ + struct task_struct *task; + + rcu_read_lock(); + task = task_rcu_dereference(ptask); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + +/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are @@ -700,10 +776,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * Ensure that all new tsk->pi_lock acquisitions must observe + * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). */ smp_mb(); + /* + * Ensure that we must observe the pi_state in exit_mm() -> + * mm_release() -> exit_pi_state_list(). + */ raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) { diff --git a/kernel/fork.c b/kernel/fork.c index 4a7ec0c6c88c..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -162,23 +162,15 @@ void __weak arch_release_thread_stack(unsigned long *stack) static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, - THREAD_SIZE_ORDER); - - if (page) - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - 1 << THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(stack); - - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - -(1 << THREAD_SIZE_ORDER)); - __free_kmem_pages(page, THREAD_SIZE_ORDER); + __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(stack)); + /* All stack pages are in the same zone and belong to the same memcg. */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - mod_zone_page_state(zone, NR_KERNEL_STACK, account); + memcg_kmem_update_page_stat( + first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } void free_task(struct task_struct *tsk) diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_thread_flag(TIF_MEMDIE)) + if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_SMP) += affinity.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..f68959341c0f --- /dev/null +++ b/kernel/irq/affinity.c @@ -0,0 +1,61 @@ + +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/cpu.h> + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_sibling_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + return cpu; +} + +/* + * Take a map of online CPUs and the number of available interrupt vectors + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors + * so that they are distributed as good as possible around the CPUs. If + * more vectors than CPUs are available we'll map one to each CPU, + * otherwise we map one to the first sibling of each socket. + * + * If there are more vectors than CPUs we will still only have one bit + * set per CPU, but interrupt code will keep on assigning the vectors from + * the start of the bitmap until we run out of vectors. + */ +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ + struct cpumask *affinity_mask; + unsigned int max_vecs = *nr_vecs; + + if (max_vecs == 1) + return NULL; + + affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!affinity_mask) { + *nr_vecs = 1; + return NULL; + } + + if (max_vecs >= num_online_cpus()) { + cpumask_copy(affinity_mask, cpu_online_mask); + *nr_vecs = num_online_cpus(); + } else { + unsigned int vecs = 0, cpu; + + for_each_online_cpu(cpu) { + if (cpu == get_first_sibling(cpu)) { + cpumask_set_cpu(cpu, affinity_mask); + vecs++; + } + + if (--max_vecs == 0) + break; + } + *nr_vecs = vecs; + } + + return affinity_mask; +} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..b4c1bc7c9ca2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -426,6 +426,49 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/** + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Untracked interrupts are sent from a demultiplexing interrupt + * handler when the demultiplexer does not know which device it its + * multiplexed irq domain generated the interrupt. IRQ's handled + * through here are not subjected to stats tracking, randomness, or + * spurious interrupt detection. + * + * Note: Like handle_simple_irq, the caller is expected to handle + * the ack, clear, mask and unmask issues if necessary. + */ +void handle_untracked_irq(struct irq_desc *desc) +{ + unsigned int flags = 0; + + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + goto out_unlock; + } + + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + + __handle_irq_event_percpu(desc, &flags); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + +out_unlock: + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_untracked_irq); + /* * Called unconditionally from handle_level_irq() and only for oneshot * interrupts from handle_fasteoi_irq() @@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } + +/** + * irq_chip_pm_get - Enable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Enable the power to the IRQ chip referenced by the interrupt data + * structure. + */ +int irq_chip_pm_get(struct irq_data *data) +{ + int retval; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { + retval = pm_runtime_get_sync(data->chip->parent_device); + if (retval < 0) { + pm_runtime_put_noidle(data->chip->parent_device); + return retval; + } + } + + return 0; +} + +/** + * irq_chip_pm_put - Disable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Disable the power to the IRQ chip referenced by the interrupt data + * structure, belongs. Note that power will only be disabled, once this + * function has been called for all IRQs that have called irq_chip_pm_get(). + */ +int irq_chip_pm_put(struct irq_data *data) +{ + int retval = 0; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) + retval = pm_runtime_put(data->chip->parent_device); + + return (retval < 0) ? retval : 0; +} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) { irqreturn_t retval = IRQ_NONE; - unsigned int flags = 0, irq = desc->irq_data.irq; + unsigned int irq = desc->irq_data.irq; struct irqaction *action; for_each_action_of_desc(desc, action) { @@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) /* Fall through to add to randomness */ case IRQ_HANDLED: - flags |= action->flags; + *flags |= action->flags; break; default: @@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; } - add_interrupt_randomness(irq, flags); + return retval; +} + +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +{ + irqreturn_t retval; + unsigned int flags = 0; + + retval = __handle_irq_event_percpu(desc, &flags); + + add_interrupt_randomness(desc->irq_data.irq, flags); if (!noirqdebug) note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..bc226e783bd2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -7,6 +7,7 @@ */ #include <linux/irqdesc.h> #include <linux/kernel_stat.h> +#include <linux/pm_runtime.h> #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain, } } - virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc descs\n"); return -ENOMEM; } virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, - (void *) dest, true); + (void *) dest, true, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) return 0; } -static void desc_smp_init(struct irq_desc *desc, int node) +static void desc_smp_init(struct irq_desc *desc, int node, + const struct cpumask *affinity) { - cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); + if (!affinity) + affinity = irq_default_affinity; + cpumask_copy(desc->irq_common_data.affinity, affinity); + #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node) #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } -static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline void +desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { int cpu; @@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; - desc_smp_init(desc, node); + desc_smp_init(desc, node, affinity); } int nr_irqs = NR_IRQS; @@ -158,7 +163,9 @@ void irq_unlock_sparse(void) mutex_unlock(&sparse_irq_lock); } -static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) +static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, + const struct cpumask *affinity, + struct module *owner) { struct irq_desc *desc; gfp_t gfp = GFP_KERNEL; @@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_rcu_head(&desc->rcu); - desc_set_defaults(irq, desc, node, owner); + desc_set_defaults(irq, desc, node, affinity, owner); + irqd_set(&desc->irq_data, flags); return desc; @@ -223,13 +231,32 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { + const struct cpumask *mask = NULL; struct irq_desc *desc; - int i; + unsigned int flags; + int i, cpu = -1; + + if (affinity && cpumask_empty(affinity)) + return -EINVAL; + + flags = affinity ? IRQD_AFFINITY_MANAGED : 0; for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node, owner); + if (affinity) { + cpu = cpumask_next(cpu, affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(affinity); + node = cpu_to_node(cpu); + + /* + * For single allocations we use the caller provided + * mask otherwise we use the mask of the target cpu + */ + mask = cnt == 1 ? affinity : cpumask_of(cpu); + } + desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); @@ -277,7 +304,7 @@ int __init early_irq_init(void) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node, NULL); + desc = alloc_desc(i, node, 0, NULL, NULL); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } @@ -311,7 +338,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], GFP_KERNEL, node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node, NULL); + desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); } @@ -328,11 +355,12 @@ static void free_desc(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + const struct cpumask *affinity, struct module *owner) { u32 i; @@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask which hints where the + * irq descriptors should be allocated and which default + * affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner) + struct module *owner, const struct cpumask *affinity) { int start, ret; @@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, owner); + return alloc_descs(start, cnt, node, affinity, owner); err: mutex_unlock(&sparse_irq_lock); @@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); */ unsigned int irq_alloc_hwirqs(int cnt, int node) { - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); if (irq < 0) return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; + struct irq_data *irq_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; @@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; - if (irq_domain_is_hierarchy(domain)) { + /* + * WARN if the irqchip returns a type with bits + * outside the sense mask set and clear these bits. + */ + if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) + type &= IRQ_TYPE_SENSE_MASK; + + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) { + /* + * If the trigger type is not specified or matches the + * current trigger type then we are done so return the + * interrupt number. + */ + if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) + return virq; + /* - * If we've already configured this interrupt, - * don't do it again, or hell will break loose. + * If the trigger type has not been set yet, then set + * it now and return the interrupt number. */ - virq = irq_find_mapping(domain, hwirq); - if (virq) + if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { + irq_data = irq_get_irq_data(virq); + if (!irq_data) + return 0; + + irqd_set_trigger_type(irq_data, type); return virq; + } + pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", + hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); + return 0; + } + + if (irq_domain_is_hierarchy(domain)) { virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; @@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) return virq; } - /* Set type if specified and different than the current one */ - if (type != IRQ_TYPE_NONE && - type != irq_get_trigger_type(virq)) - irq_set_irq_type(virq, type); + irq_data = irq_get_irq_data(virq); + if (!irq_data) { + if (irq_domain_is_hierarchy(domain)) + irq_domain_free_irqs(virq, 1); + else + irq_dispose_mapping(virq); + return 0; + } + + /* Store trigger type */ + irqd_set_trigger_type(irq_data, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); @@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - irq_domain_disassociate(domain, virq); - irq_free_desc(virq); + if (irq_domain_is_hierarchy(domain)) { + irq_domain_free_irqs(virq, 1); + } else { + irq_domain_disassociate(domain, virq); + irq_free_desc(virq); + } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node) + int node, const struct cpumask *affinity) { unsigned int hint; if (virq >= 0) { - virq = irq_alloc_descs(virq, virq, cnt, node); + virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, + affinity); } else { hint = hwirq % nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_descs_from(hint, cnt, node); - if (virq <= 0 && hint > 1) - virq = irq_alloc_descs_from(1, cnt, node); + virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, + affinity); + if (virq <= 0 && hint > 1) { + virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, + affinity); + } } return virq; @@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, if (recursive) ret = irq_domain_alloc_irqs_recursive(parent, irq_base, nr_irqs, arg); - if (ret >= 0) - ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); if (ret < 0 && recursive) irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); @@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. @@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc) + bool realloc, const struct cpumask *affinity) { int i, ret, virq; @@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, if (realloc && irq_base >= 0) { virq = irq_base; } else { - virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, + affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..73a2b786b5e9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) } /** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. + */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + +/** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed * @@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve an userspace affinity setup, but make sure that - * one of the targets is online. + * Preserve the managed affinity setting and an userspace affinity + * setup, but make sure that one of the targets is online. */ - if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (irqd_affinity_is_managed(&desc->irq_data) || + irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; @@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; /* + * If the trigger type is not specified by the caller, + * then use the default for this interrupt. + */ + if (!(new->flags & IRQF_TRIGGER_MASK)) + new->flags |= irqd_get_trigger_type(&desc->irq_data); + + /* * Check whether the interrupt nests into another interrupt * thread. */ @@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; @@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } @@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (!desc) return; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. + */ type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + if (type != IRQ_TYPE_NONE) { int ret; @@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ unregister_handler_proc(irq, action); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; @@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } @@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->percpu_dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); - if (retval) + if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action); + } return retval; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..54999350162c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, - dev_to_node(dev), &arg, false); + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, + dev_to_node(dev), &arg, false, + desc->affinity); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) @@ -356,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..feaa813b84a9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) @@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) !name_unique(irq, action)) return; - memset(name, 0, MAX_NAMELEN); snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ @@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) if (desc->dir) goto out_unlock; - memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ @@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #endif remove_proc_entry("spurious", desc->dir); - memset(name, 0, MAX_NAMELEN); sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } @@ -421,12 +418,8 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for_each_irq_desc(irq, desc) { - if (!desc) - continue; - + for_each_irq_desc(irq, desc) register_irq_proc(irq, desc); - } } #ifdef CONFIG_GENERIC_IRQ_SHOW diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4b353e0be121..0dbea887d625 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -452,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, return notifier_from_errno(ret); } -struct notifier_block jump_label_module_nb = { +static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -46,6 +46,7 @@ #include <linux/gfp.h> #include <linux/kmemcheck.h> #include <linux/random.h> +#include <linux/jhash.h> #include <asm/sections.h> @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; * It's a 64-bit hash, because it's important for the keys to be * unique. */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ + u32 k0 = key, k1 = key >> 32; + + __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ + + return k0 | (u64)k1 << 32; +} void lockdep_off(void) { diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index d06ae3bb46c5..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #define spin_lock_mutex(lock, flags) \ diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index a68bae5e852a..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -17,14 +17,20 @@ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */ static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #else static inline void mutex_set_owner(struct mutex *lock) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); rspin_until_writer_unlock(lock, cnts); /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); * therefore increment the cpu number by one. */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) return tail; } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail) { int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -268,6 +268,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #endif /* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + * CPU0 CPU1 + * + * global_lock(); local_lock(i) + * spin_lock(&G) spin_lock(&L[i]) + * for (i) if (!spin_is_locked(&G)) { + * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); + * return; + * } + * // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. + * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + * CPU0 CPU1 + * + * flag = set; + * smp_mb(); spin_lock(&l) + * spin_unlock_wait(&l); if (!flag) + * // add to lockless list + * spin_unlock(&l); + * // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + +/* * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before * issuing an _unordered_ store to set _Q_LOCKED_VAL. * @@ -322,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock) cpu_relax(); done: - smp_rmb(); /* CTRL + RMB -> ACQUIRE */ + smp_acquire__after_ctrl_dep(); } EXPORT_SYMBOL(queued_spin_unlock_wait); #endif @@ -418,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -455,6 +512,8 @@ queue: * pending stuff. * * p,*,* -> n,*,* + * + * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -465,6 +524,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* + * The above xchg_tail() is also a load of @lock which generates, + * through decode_tail(), a pointer. + * + * The address dependency matches the RELEASE of xchg_tail() + * such that the access to @prev must happen after. + */ + smp_read_barrier_depends(); + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); @@ -494,7 +562,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_acquire() call. As the next PV queue head hasn't been + * smp_cond_load_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -505,7 +573,7 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* @@ -525,9 +593,9 @@ locked: break; } /* - * The smp_cond_acquire() call above has provided the necessary - * acquire semantics required for locking. At most two - * iterations of this loop may be ran. + * The smp_cond_load_acquire() call above has provided the + * necessary acquire semantics required for locking. At most + * two iterations of this loop may be ran. */ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) @@ -551,7 +619,7 @@ release: /* * release the node */ - this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) { - atomic_set_mask(_Q_PENDING_VAL, &lock->val); + atomic_or(_Q_PENDING_VAL, &lock->val); } static __always_inline void clear_pending(struct qspinlock *lock) { - atomic_clear_mask(_Q_PENDING_VAL, &lock->val); + atomic_andnot(_Q_PENDING_VAL, &lock->val); } static __always_inline int trylock_clear_pending(struct qspinlock *lock) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->count = RWSEM_UNLOCKED_VALUE; + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -114,12 +114,16 @@ enum rwsem_wake_type { * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); + } goto out; } @@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) + /* + * If the count is still less than RWSEM_WAITING_BIAS + * after removing the adjustment, it is assumed that + * a writer has stolen the lock. We have to undo our + * reader grant. + */ + if (atomic_long_add_return(-adjustment, &sem->count) < + RWSEM_WAITING_BIAS) goto out; /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } + /* + * It is not really necessary to set it to reader-owned here, + * but it gives the spinners an early indication that the + * readers now have the lock. + */ + rwsem_set_reader_owned(sem); } /* Grant an infinite number of read locks to the readers at the front @@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) adjustment -= RWSEM_WAITING_BIAS; if (adjustment) - rwsem_atomic_add(adjustment, sem); + atomic_long_add(adjustment, &sem->count); next = sem->wait_list.next; loop = woken; @@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + + wake_q_add(wake_q, tsk); /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). + * Ensure that the last operation is setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); + smp_store_release(&waiter->task, NULL); } while (--loop); sem->wait_list.next = next; @@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; + WAKE_Q(wake_q); /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + count = atomic_long_add_return(adjustment, &sem->count); /* If there are no active locks, wake the front queued process(es). * @@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); /* wait to be given the lock */ while (true) { @@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { /* - * Try acquiring the write lock. Check count first in order - * to reduce unnecessary expensive cmpxchg() operations. + * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. */ - if (count == RWSEM_WAITING_BIAS && - cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + if (count != RWSEM_WAITING_BIAS) + return false; + + /* + * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there + * are other tasks on the wait list, we need to add on WAITING_BIAS. + */ + count = list_is_singular(&sem->wait_list) ? + RWSEM_ACTIVE_WRITE_BIAS : + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + + if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) + == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } @@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = READ_ONCE(sem->count); + long old, count = atomic_long_read(&sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg_acquire(&sem->count, count, + old = atomic_long_cmpxchg_acquire(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); @@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!owner) { - long count = READ_ONCE(sem->count); + if (!rwsem_owner_is_writer(owner)) { /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath with the lock being active, then there is a possibility - * reader(s) may have the lock. To be safe, bail spinning in these - * situations. + * Don't spin if the rwsem is readers owned. */ - if (count & RWSEM_ACTIVE_MASK) - ret = false; + ret = !rwsem_owner_is_reader(owner); goto done; } @@ -325,10 +350,15 @@ done: return ret; } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { - long count; + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!rwsem_owner_is_writer(owner)) + goto out; rcu_read_lock(); while (sem->owner == owner) { @@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) cpu_relax_lowlatency(); } rcu_read_unlock(); - - if (READ_ONCE(sem->owner)) - return true; /* new owner, continue spinning */ - +out: /* - * When the owner is not set, the lock could be free or - * held by readers. Check the counter to verify the - * state. + * If there is a new owner or the owner is not set, we continue + * spinning. */ - count = READ_ONCE(sem->count); - return (count == 0 || count == RWSEM_WAITING_BIAS); + return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { - struct task_struct *owner; bool taken = false; preempt_disable(); @@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (!osq_lock(&sem->osq)) goto done; - while (true) { - owner = READ_ONCE(sem->owner); - if (owner && !rwsem_spin_on_owner(sem, owner)) - break; - - /* wait_lock will be acquired if write_lock is obtained */ + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { + /* + * Try to acquire the lock + */ if (rwsem_try_write_lock_unqueued(sem)) { taken = true; break; @@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(current))) + if (!sem->owner && (need_resched() || rt_task(current))) break; /* @@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; + WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ - count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) @@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are * no active writers, the lock must be read owned; so we try to * wake any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + if (count > RWSEM_WAITING_BIAS) { + WAKE_Q(wake_q); + + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + } } else - count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); /* wait until we successfully acquire the lock */ set_current_state(state); @@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); set_current_state(state); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -507,10 +548,11 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); else - __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); return ERR_PTR(-EINTR); } @@ -537,6 +579,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -573,9 +616,10 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } @@ -590,14 +634,16 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_reader_owned(sem); + } return ret; } @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ - rwsem_clear_owner(sem); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,14 +1,58 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. + * + * In essence, the owner field now has the following 3 states: + * 1) 0 + * - lock is free or the owner hasn't set the field yet + * 2) RWSEM_READER_OWNED + * - lock is currently or previously owned by readers (lock is free + * or not set by owner yet) + * 3) Other non-zero value + * - a writer owns the lock + */ +#define RWSEM_READER_OWNED ((struct task_struct *)1UL) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - sem->owner = current; + WRITE_ONCE(sem->owner, current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - sem->owner = NULL; + WRITE_ONCE(sem->owner, NULL); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + /* + * We check the owner value first to make sure that we will only + * do a write to the rwsem cacheline when it is really necessary + * to minimize cacheline contention. + */ + if (sem->owner != RWSEM_READER_OWNED) + WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ + return owner && owner != RWSEM_READER_OWNED; } +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ + return owner == RWSEM_READER_OWNED; +} #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +} #endif diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..251d16b4cb41 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) -{ - return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); -} -EXPORT_SYMBOL(phys_to_pfn_t); - #ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); @@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { - dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", - __func__); - return ERR_PTR(-ENXIO); - } - if (!ref) return ERR_PTR(-EINVAL); @@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) { /* @@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..eb4f717705ba 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +KASAN_SANITIZE_snapshot.o := n + obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o diff --git a/kernel/power/console.c b/kernel/power/console.c index aba9c545a0e3..0e781798b0b3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -126,17 +126,17 @@ out: return ret; } -int pm_prepare_console(void) +void pm_prepare_console(void) { if (!pm_vt_switch()) - return 0; + return; orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) - return 1; + return; orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); - return 0; + return; } void pm_restore_console(void) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..a881c6a7ba74 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -52,6 +52,7 @@ enum { #ifdef CONFIG_SUSPEND HIBERNATION_SUSPEND, #endif + HIBERNATION_TEST_RESUME, /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode) goto Close; } +int __weak hibernate_resume_nonboot_cpu_disable(void) +{ + return disable_nonboot_cpus(); +} + /** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode) if (error) goto Cleanup; - error = disable_nonboot_cpus(); + error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; @@ -642,12 +648,39 @@ static void power_down(void) cpu_relax(); } +static int load_image_and_restore(void) +{ + int error; + unsigned int flags; + + pr_debug("PM: Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) + goto Unlock; + + error = swsusp_read(&flags); + swsusp_close(FMODE_READ); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Unlock: + unlock_device_hotplug(); + + return error; +} + /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { - int error; + int error, nr_calls = 0; + bool snapshot_test = false; if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); @@ -662,9 +695,11 @@ int hibernate(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Exit; + } printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); @@ -697,8 +732,12 @@ int hibernate(void) pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); - if (!error) - power_down(); + if (!error) { + if (hibernation_mode == HIBERNATION_TEST_RESUME) + snapshot_test = true; + else + power_down(); + } in_suspend = 0; pm_restore_gfp_mask(); } else { @@ -709,12 +748,18 @@ int hibernate(void) free_basic_memory_bitmaps(); Thaw: unlock_device_hotplug(); + if (snapshot_test) { + pr_debug("PM: Checking hibernation image\n"); + error = swsusp_check(); + if (!error) + error = load_image_and_restore(); + } thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: @@ -740,8 +785,7 @@ int hibernate(void) */ static int software_resume(void) { - int error; - unsigned int flags; + int error, nr_calls = 0; /* * If the user said "noresume".. bail out early. @@ -827,35 +871,20 @@ static int software_resume(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Close_Finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; - - pr_debug("PM: Loading hibernation image.\n"); - - lock_device_hotplug(); - error = create_basic_memory_bitmaps(); - if (error) - goto Thaw; - - error = swsusp_read(&flags); - swsusp_close(FMODE_READ); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); - swsusp_free(); - free_basic_memory_bitmaps(); - Thaw: - unlock_device_hotplug(); + error = load_image_and_restore(); thaw_processes(); Finish: - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ @@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = { #ifdef CONFIG_SUSPEND [HIBERNATION_SUSPEND] = "suspend", #endif + [HIBERNATION_TEST_RESUME] = "test_resume", }; /* @@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: @@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str) static int __init hibernate_setup(char *str) { - if (!strncmp(str, "noresume", 8)) + if (!strncmp(str, "noresume", 8)) { noresume = 1; - else if (!strncmp(str, "nocompress", 10)) + } else if (!strncmp(str, "nocompress", 10)) { nocompress = 1; - else if (!strncmp(str, "no", 2)) { + } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; + } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + && !strncmp(str, "protect_image", 13)) { + enable_restore_image_protection(); } return 1; } @@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init kaslr_nohibernate_setup(char *str) -{ - return nohibernate_setup(str); -} - static int __init page_poison_nohibernate_setup(char *str) { #ifdef CONFIG_PAGE_POISONING_ZERO @@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("kaslr", kaslr_nohibernate_setup); __setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 27946975eff0..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) { - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + int ret; + + ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, + nr_to_call, nr_calls); return notifier_to_errno(ret); } +int pm_notifier_call_chain(unsigned long val) +{ + return __pm_notifier_call_chain(val, -1, NULL); +} /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; diff --git a/kernel/power/power.h b/kernel/power/power.h index efe1b3b17c88..242d8b827dd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ +extern int hibernate_resume_nonboot_cpu_disable(void); + /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] @@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); +#ifdef CONFIG_DEBUG_RODATA +/* kernel/power/snapshot.c */ +extern void enable_restore_image_protection(void); +#else +static inline void enable_restore_image_protection(void) {} +#endif /* CONFIG_DEBUG_RODATA */ + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} @@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ +extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, + int *nr_calls); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 0c2ee9761d57..8f27d5a8adf6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); + if (wq_busy) + show_workqueue_state(); + if (!wakeup) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,6 +38,43 @@ #include "power.h" +#ifdef CONFIG_DEBUG_RODATA +static bool hibernate_restore_protection; +static bool hibernate_restore_protection_active; + +void enable_restore_image_protection(void) +{ + hibernate_restore_protection = true; +} + +static inline void hibernate_restore_protection_begin(void) +{ + hibernate_restore_protection_active = hibernate_restore_protection; +} + +static inline void hibernate_restore_protection_end(void) +{ + hibernate_restore_protection_active = false; +} + +static inline void hibernate_restore_protect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_ro((unsigned long)page_address, 1); +} + +static inline void hibernate_restore_unprotect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_rw((unsigned long)page_address, 1); +} +#else +static inline void hibernate_restore_protection_begin(void) {} +static inline void hibernate_restore_protection_end(void) {} +static inline void hibernate_restore_protect_page(void *page_address) {} +static inline void hibernate_restore_unprotect_page(void *page_address) {} +#endif /* CONFIG_DEBUG_RODATA */ + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void) image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; } -/* List of PBEs needed for restoring the pages that were allocated before +/* + * List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written * directly to their "original" page frames. */ struct pbe *restore_pblist; -/* Pointer to an auxiliary buffer (1 page) */ -static void *buffer; +/* struct linked_page is used to build chains of pages */ -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. The unsafe pages have PageNosaveFree set - * and we count them using unsafe_pages. - * - * Each allocated image page is marked as PageNosave and PageNosaveFree - * so that swsusp_free() can release it. +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __packed; + +/* + * List of "safe" pages (ie. pages that were not used by the image kernel + * before hibernation) that may be used as temporary storage for image kernel + * memory contents. */ +static struct linked_page *safe_pages_list; + +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer; #define PG_ANY 0 #define PG_SAFE 1 @@ -94,6 +138,19 @@ static void *buffer; static unsigned int allocated_unsafe_pages; +/** + * get_image_page - Allocate a page for a hibernation image. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages that were not used before hibernation (restore only) + * + * During image restoration, for storing the PBE list and the image data, we can + * only use memory pages that do not conflict with the pages used before + * hibernation. The "unsafe" pages have PageNosaveFree set and we count them + * using allocated_unsafe_pages. + * + * Each allocated image page is marked as PageNosave and PageNosaveFree so that + * swsusp_free() can release it. + */ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) return res; } +static void *__get_safe_page(gfp_t gfp_mask) +{ + if (safe_pages_list) { + void *ret = safe_pages_list; + + safe_pages_list = safe_pages_list->next; + memset(ret, 0, PAGE_SIZE); + return ret; + } + return get_image_page(gfp_mask, PG_SAFE); +} + unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)get_image_page(gfp_mask, PG_SAFE); + return (unsigned long)__get_safe_page(gfp_mask); } static struct page *alloc_image_page(gfp_t gfp_mask) @@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask) return page; } +static void recycle_safe_page(void *page_address) +{ + struct linked_page *lp = page_address; + + lp->next = safe_pages_list; + safe_pages_list = lp; +} + /** - * free_image_page - free page represented by @addr, allocated with - * get_image_page (page flags set by it must be cleared) + * free_image_page - Free a page allocated for hibernation image. + * @addr: Address of the page to free. + * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. + * + * The page to free should have been allocated by get_image_page() (page flags + * set by it are affected). */ - static inline void free_image_page(void *addr, int clear_nosave_free) { struct page *page; @@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free) __free_page(page); } -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) - -struct linked_page { - struct linked_page *next; - char data[LINKED_PAGE_DATA_SIZE]; -} __packed; - -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) +static inline void free_list_of_pages(struct linked_page *list, + int clear_page_nosave) { while (list) { struct linked_page *lp = list->next; @@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave) } } -/** - * struct chain_allocator is used for allocating small objects out of - * a linked list of pages called 'the chain'. - * - * The chain grows each time when there is no room for a new object in - * the current page. The allocated objects cannot be freed individually. - * It is only possible to free them all at once, by freeing the entire - * chain. - * - * NOTE: The chain allocator may be inefficient if the allocated objects - * are not much smaller than PAGE_SIZE. - */ - +/* + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ struct chain_allocator { struct linked_page *chain; /* the chain */ unsigned int used_space; /* total size of objects allocated out - * of the current page - */ + of the current page */ gfp_t gfp_mask; /* mask for allocating pages */ int safe_needed; /* if set, only "safe" pages are allocated */ }; -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, + int safe_needed) { ca->chain = NULL; ca->used_space = LINKED_PAGE_DATA_SIZE; @@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = get_image_page(ca->gfp_mask, ca->safe_needed); + lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : + get_image_page(ca->gfp_mask, PG_ANY); if (!lp) return NULL; @@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) } /** - * Data types related to memory bitmaps. + * Data types related to memory bitmaps. * - * Memory bitmap is a structure consiting of many linked lists of - * objects. The main list's elements are of type struct zone_bitmap - * and each of them corresonds to one zone. For each zone bitmap - * object there is a list of objects of type struct bm_block that - * represent each blocks of bitmap in which information is stored. + * Memory bitmap is a structure consiting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresonds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bitmap in which information is stored. * - * struct memory_bitmap contains a pointer to the main list of zone - * bitmap objects, a struct bm_position used for browsing the bitmap, - * and a pointer to the list of pages used for allocating all of the - * zone bitmap objects and bitmap block objects. + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. * - * NOTE: It has to be possible to lay out the bitmap in memory - * using only allocations of order 0. Additionally, the bitmap is - * designed to work with arbitrary number of zones (this is over the - * top for now, but let's avoid making unnecessary assumptions ;-). + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). * - * struct zone_bitmap contains a pointer to a list of bitmap block - * objects and a pointer to the bitmap block object that has been - * most recently used for setting bits. Additionally, it contains the - * pfns that correspond to the start and end of the represented zone. + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * PFNs that correspond to the start and end of the represented zone. * - * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bitmap) - * It also contains the pfns that correspond to the start and end of - * the represented memory area. + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. * - * The memory bitmap is organized as a radix tree to guarantee fast random - * access to the bits. There is one radix tree for each zone (as returned - * from create_mem_extents). + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). * - * One radix tree is represented by one struct mem_zone_bm_rtree. There are - * two linked lists for the nodes of the tree, one for the inner nodes and - * one for the leave nodes. The linked leave nodes are used for fast linear - * access of the memory bitmap. + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. * - * The struct rtree_node represents one node of the radix tree. + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) @@ -305,9 +375,8 @@ struct bm_position { struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone - * bitmap objects and bitmap block - * objects - */ + bitmap objects and bitmap block + objects */ struct bm_position cur; /* most recently used bit position */ }; @@ -321,12 +390,12 @@ struct memory_bitmap { #endif #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) -/* - * alloc_rtree_node - Allocate a new node and add it to the radix tree. +/** + * alloc_rtree_node - Allocate a new node and add it to the radix tree. * - * This function is used to allocate inner nodes as well as the - * leave nodes of the radix tree. It also adds the node to the - * corresponding linked list passed in by the *list parameter. + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. */ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, struct chain_allocator *ca, @@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, return node; } -/* - * add_rtree_block - Add a new leave node to the radix tree +/** + * add_rtree_block - Add a new leave node to the radix tree. * - * The leave nodes need to be allocated in order to keep the leaves - * linked list in order. This is guaranteed by the zone->blocks - * counter. + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, int safe_needed, struct chain_allocator *ca) @@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, int clear_nosave_free); -/* - * create_zone_bm_rtree - create a radix tree for one zone +/** + * create_zone_bm_rtree - Create a radix tree for one zone. * - * Allocated the mem_zone_bm_rtree structure and initializes it. - * This function also allocated and builds the radix tree for the - * zone. + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. */ -static struct mem_zone_bm_rtree * -create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, - struct chain_allocator *ca, - unsigned long start, unsigned long end) +static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, + int safe_needed, + struct chain_allocator *ca, + unsigned long start, + unsigned long end) { struct mem_zone_bm_rtree *zone; unsigned int i, nr_blocks; @@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, return zone; } -/* - * free_zone_bm_rtree - Free the memory of the radix tree +/** + * free_zone_bm_rtree - Free the memory of the radix tree. * - * Free all node pages of the radix tree. The mem_zone_bm_rtree - * structure itself is not freed here nor are the rtree_node - * structs. + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. */ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, int clear_nosave_free) @@ -492,8 +562,8 @@ struct mem_extent { }; /** - * free_mem_extents - free a list of memory extents - * @list - list of extents to empty + * free_mem_extents - Free a list of memory extents. + * @list: List of extents to free. */ static void free_mem_extents(struct list_head *list) { @@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list) } /** - * create_mem_extents - create a list of memory extents representing - * contiguous ranges of PFNs - * @list - list to put the extents into - * @gfp_mask - mask to use for memory allocations + * create_mem_extents - Create a list of memory extents. + * @list: List to put the extents into. + * @gfp_mask: Mask to use for memory allocations. + * + * The extents represent contiguous ranges of PFNs. */ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { @@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) } /** - * memory_bm_create - allocate memory for a memory bitmap - */ -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + * memory_bm_create - Allocate memory for a memory bitmap. + */ +static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, + int safe_needed) { struct chain_allocator ca; struct list_head mem_extents; @@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } /** - * memory_bm_free - free memory occupied by the memory bitmap @bm - */ + * memory_bm_free - Free memory occupied by the memory bitmap. + * @bm: Memory bitmap. + */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { struct mem_zone_bm_rtree *zone; @@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) } /** - * memory_bm_find_bit - Find the bit for pfn in the memory - * bitmap + * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. * - * Find the bit in the bitmap @bm that corresponds to given pfn. - * The cur.zone, cur.block and cur.node_pfn member of @bm are - * updated. - * It walks the radix tree to find the page which contains the bit for - * pfn and returns the bit position in **addr and *bit_nr. + * Find the bit in memory bitmap @bm that corresponds to the given PFN. + * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. + * + * Walk the radix tree to find the page containing the bit that represents @pfn + * and return the position of the bit in @addr and @bit_nr. */ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) @@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, zone_found: /* - * We have a zone. Now walk the radix tree to find the leave - * node for our pfn. + * We have found the zone. Now walk the radix tree to find the leaf node + * for our PFN. */ - node = bm->cur.node; if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; @@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) } /* - * rtree_next_node - Jumps to the next leave node + * rtree_next_node - Jump to the next leaf node. * - * Sets the position to the beginning of the next node in the - * memory bitmap. This is either the next node in the current - * zone's radix tree or the first node in the radix tree of the - * next zone. + * Set the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. * - * Returns true if there is a next node, false otherwise. + * Return true if there is a next node, false otherwise. */ static bool rtree_next_node(struct memory_bitmap *bm) { @@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * @bm: Memory bitmap. * - * Starting from the last returned position this function searches - * for the next set bit in the memory bitmap and returns its - * number. If no more bit is set BM_END_OF_MAP is returned. + * Starting from the last returned position this function searches for the next + * set bit in @bm and returns the PFN represented by it. If no more bits are + * set, BM_END_OF_MAP is returned. * - * It is required to run memory_bm_position_reset() before the - * first call to this function. + * It is required to run memory_bm_position_reset() before the first call to + * this function for the given memory bitmap. */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { @@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) return BM_END_OF_MAP; } -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. +/* + * This structure represents a range of page frames the contents of which + * should not be saved during hibernation. */ - struct nosave_region { struct list_head list; unsigned long start_pfn; @@ -832,15 +902,42 @@ struct nosave_region { static LIST_HEAD(nosave_regions); +static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + recycle_safe_page(node->data); + + list_for_each_entry(node, &zone->leaves, list) + recycle_safe_page(node->data); +} + +static void memory_bm_recycle(struct memory_bitmap *bm) +{ + struct mem_zone_bm_rtree *zone; + struct linked_page *p_list; + + list_for_each_entry(zone, &bm->zones, list) + recycle_zone_bm_rtree(zone); + + p_list = bm->p_list; + while (p_list) { + struct linked_page *lp = p_list; + + p_list = lp->next; + recycle_safe_page(lp); + } +} + /** - * register_nosave_region - register a range of page frames the contents - * of which should not be saved during the suspend (to be used in the early - * initialization code) + * register_nosave_region - Register a region of unsaveable memory. + * + * Register a range of page frames the contents of which should not be saved + * during hibernation (to be used in the early initialization code). */ - -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, - int use_kmalloc) +void __init __register_nosave_region(unsigned long start_pfn, + unsigned long end_pfn, int use_kmalloc) { struct nosave_region *region; @@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, } } if (use_kmalloc) { - /* during init, this shouldn't fail */ + /* During init, this shouldn't fail */ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); BUG_ON(!region); - } else + } else { /* This allocation cannot fail */ region = memblock_virt_alloc(sizeof(struct nosave_region), 0); + } region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page) } /** - * mark_nosave_pages - set bits corresponding to the page frames the - * contents of which should not be saved in a given bitmap. + * mark_nosave_pages - Mark pages that should not be saved. + * @bm: Memory bitmap. + * + * Set the bits in @bm that correspond to the page frames the contents of which + * should not be saved. */ - static void mark_nosave_pages(struct memory_bitmap *bm) { struct nosave_region *region; @@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } /** - * create_basic_memory_bitmaps - create bitmaps needed for marking page - * frames that should not be saved and free page frames. The pointers - * forbidden_pages_map and free_pages_map are only modified if everything - * goes well, because we don't want the bits to be used before both bitmaps - * are set up. + * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. + * + * Create bitmaps needed for marking page frames that should not be saved and + * free page frames. The forbidden_pages_map and free_pages_map pointers are + * only modified if everything goes well, because we don't want the bits to be + * touched before both bitmaps are set up. */ - int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; @@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void) } /** - * free_basic_memory_bitmaps - free memory bitmaps allocated by - * create_basic_memory_bitmaps(). The auxiliary pointers are necessary - * so that the bitmaps themselves are not referred to while they are being - * freed. + * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. + * + * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The + * auxiliary pointers are necessary so that the bitmaps themselves are not + * referred to while they are being freed. */ - void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; @@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void) } /** - * snapshot_additional_pages - estimate the number of additional pages - * be needed for setting up the suspend image data structures for given - * zone (usually the returned value is greater than the exact number) + * snapshot_additional_pages - Estimate the number of extra pages needed. + * @zone: Memory zone to carry out the computation for. + * + * Estimate the number of additional pages needed for setting up a hibernation + * image data structures for @zone (usually, the returned value is greater than + * the exact number). */ - unsigned int snapshot_additional_pages(struct zone *zone) { unsigned int rtree, nodes; @@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone) #ifdef CONFIG_HIGHMEM /** - * count_free_highmem_pages - compute the total number of free highmem - * pages, system-wide. + * count_free_highmem_pages - Compute the total number of free highmem pages. + * + * The returned number is system-wide. */ - static unsigned int count_free_highmem_pages(void) { struct zone *zone; @@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void) } /** - * saveable_highmem_page - Determine whether a highmem page should be - * included in the suspend image. + * saveable_highmem_page - Check if a highmem page is saveable. * - * We should save the page if it isn't Nosave or NosaveFree, or Reserved, - * and it isn't a part of a free chunk of pages. + * Determine whether a highmem page should be included in a hibernation image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't part of a free chunk of pages. */ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { @@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) } /** - * count_highmem_pages - compute the total number of saveable highmem - * pages. + * count_highmem_pages - Compute the total number of saveable highmem pages. */ - static unsigned int count_highmem_pages(void) { struct zone *zone; @@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) #endif /* CONFIG_HIGHMEM */ /** - * saveable_page - Determine whether a non-highmem page should be included - * in the suspend image. + * saveable_page - Check if the given page is saveable. * - * We should save the page if it isn't Nosave, and is not in the range - * of pages statically defined as 'unsaveable', and it isn't a part of - * a free chunk of pages. + * Determine whether a non-highmem page should be included in a hibernation + * image. + * + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't part of + * a free chunk of pages. */ static struct page *saveable_page(struct zone *zone, unsigned long pfn) { @@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) } /** - * count_data_pages - compute the total number of saveable non-highmem - * pages. + * count_data_pages - Compute the total number of saveable non-highmem pages. */ - static unsigned int count_data_pages(void) { struct zone *zone; @@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void) return n; } -/* This is needed, because copy_page and memcpy are not usable for copying +/* + * This is needed, because copy_page and memcpy are not usable for copying * task structs. */ static inline void do_copy_page(long *dst, long *src) @@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } - /** - * safe_copy_page - check if the page we are going to copy is marked as - * present in the kernel page tables (this always is the case if - * CONFIG_DEBUG_PAGEALLOC is not set and in that case - * kernel_page_present() always returns 'true'). + * safe_copy_page - Copy a page in a safe way. + * + * Check if the page we are going to copy is marked as present in the kernel + * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set + * and in that case kernel_page_present() always returns 'true'). */ static void safe_copy_page(void *dst, struct page *s_page) { @@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page) } } - #ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) +static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); @@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src); } else { if (PageHighMem(d_page)) { - /* Page pointed to by src may contain some kernel + /* + * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); @@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } #endif /* CONFIG_HIGHMEM */ -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) +static void copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm) { struct zone *zone; unsigned long pfn; @@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm; static struct memory_bitmap copy_bm; /** - * swsusp_free - free pages allocated for the suspend. + * swsusp_free - Free pages allocated for hibernation image. * - * Suspend pages are alocated before the atomic copy is made, so we - * need to release them after the resume. + * Image pages are alocated before snapshot creation, so they need to be + * released after resume. */ - void swsusp_free(void) { unsigned long fb_pfn, fr_pfn; @@ -1351,6 +1451,7 @@ loop: memory_bm_clear_current(forbidden_pages_map); memory_bm_clear_current(free_pages_map); + hibernate_restore_unprotect_page(page_address(page)); __free_page(page); goto loop; } @@ -1362,6 +1463,7 @@ out: buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernate_restore_protection_end(); } /* Helper functions used for the shrinking of memory. */ @@ -1369,7 +1471,7 @@ out: #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) /** - * preallocate_image_pages - Allocate a number of pages for hibernation image + * preallocate_image_pages - Allocate a number of pages for hibernation image. * @nr_pages: Number of page frames to allocate. * @mask: GFP flags to use for the allocation. * @@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) } /** - * __fraction - Compute (an approximation of) x * (multiplier / base) + * __fraction - Compute (an approximation of) x * (multiplier / base). */ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) { @@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) } static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { unsigned long alloc = __fraction(nr_pages, highmem, total); @@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) } static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * free_unnecessary_pages - Release preallocated pages not needed for the image + * free_unnecessary_pages - Release preallocated pages not needed for the image. */ static unsigned long free_unnecessary_pages(void) { @@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void) } /** - * minimum_image_size - Estimate the minimum acceptable size of an image + * minimum_image_size - Estimate the minimum acceptable size of an image. * @saveable: Number of saveable pages in the system. * * We want to avoid attempting to free too much memory too hard, so estimate the @@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable) unsigned long size; size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_FILE_MAPPED); return saveable <= size ? 0 : saveable - size; } /** - * hibernate_preallocate_memory - Preallocate memory for hibernation image + * hibernate_preallocate_memory - Preallocate memory for hibernation image. * * To create a hibernation image it is necessary to make a copy of every page * frame in use. We also need a number of page frames to be free during @@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void) #ifdef CONFIG_HIGHMEM /** - * count_pages_for_highmem - compute the number of non-highmem pages - * that will be necessary for creating copies of highmem pages. - */ - + * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. + * + * Compute the number of non-highmem pages that will be necessary for creating + * copies of highmem pages. + */ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; @@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) return nr_highmem; } #else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * enough_free_mem - Make sure we have enough free memory for the - * snapshot image. + * enough_free_mem - Check if there is enough free memory for the image. */ - static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; @@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) #ifdef CONFIG_HIGHMEM /** - * get_highmem_buffer - if there are some highmem pages in the suspend - * image, we may need the buffer to copy them and/or load their data. + * get_highmem_buffer - Allocate a buffer for highmem pages. + * + * If there are some highmem pages in the hibernation image, we may need a + * buffer to copy them and/or load their data. */ - static inline int get_highmem_buffer(int safe_needed) { buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); @@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - allocate some highmem pages for the image. - * Try to allocate as many pages as needed, but if the number of free - * highmem pages is lesser than that, allocate them all. + * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * + * Try to allocate as many pages as needed, but if the number of free highmem + * pages is less than that, allocate them all. */ - -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) #else static inline int get_highmem_buffer(int safe_needed) { return 0; } -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * swsusp_alloc - allocate memory for the suspend image + * swsusp_alloc - Allocate memory for hibernation image. * - * We first try to allocate as many highmem pages as there are - * saveable highmem pages in the system. If that fails, we allocate - * non-highmem pages for the copies of the remaining highmem ones. + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. * - * In this approach it is likely that the copies of highmem pages will - * also be located in the high memory, because of the way in which - * copy_data_pages() works. + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. */ - -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages, unsigned int nr_highmem) +static int swsusp_alloc(struct memory_bitmap *orig_bm, + struct memory_bitmap *copy_bm, + unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { if (get_highmem_buffer(PG_ANY)) @@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void) return -ENOMEM; } - /* During allocating of suspend pagedir, new cold pages may appear. + /* + * During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(NULL); @@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info) } /** - * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - * are stored in the array @buf[] (1 page at a time) + * pack_pfns - Prepare PFNs for saving. + * @bm: Memory bitmap. + * @buf: Memory buffer to store the PFNs in. + * + * PFNs corresponding to set bits in @bm are stored in the area of memory + * pointed to by @buf (1 page at a time). */ - -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) } /** - * snapshot_read_next - used for reading the system memory snapshot. + * snapshot_read_next - Get the address to read the next image page from. + * @handle: Snapshot handle to be used for the reading. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the end of data stream condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the end of the data stream condition, + * and negative numbers are returned on errors. If that happens, the structure + * pointed to by @handle is not updated and should not be used any more. */ - int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) @@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle) page = pfn_to_page(memory_bm_next_pfn(©_bm)); if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, + /* + * Highmem pages are copied to the buffer, * because we can't return with a kmapped * highmem page (we may not be called again). */ @@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle) return PAGE_SIZE; } -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static int mark_unsafe_pages(struct memory_bitmap *bm) +static void duplicate_memory_bitmap(struct memory_bitmap *dst, + struct memory_bitmap *src) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long pfn; - /* Clear page flags */ - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) - swsusp_unset_page_free(pfn_to_page(pfn)); + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } - - /* Mark pages that correspond to the "original" pfns as "unsafe" */ - memory_bm_position_reset(bm); - do { - pfn = memory_bm_next_pfn(bm); - if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) - swsusp_set_page_free(pfn_to_page(pfn)); - else - return -EFAULT; - } - } while (pfn != BM_END_OF_MAP); - - allocated_unsafe_pages = 0; - - return 0; } -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) +/** + * mark_unsafe_pages - Mark pages that were used before hibernation. + * + * Mark the pages that cannot be used for storing the image during restoration, + * because they conflict with the pages that had been used before hibernation. + */ +static void mark_unsafe_pages(struct memory_bitmap *bm) { unsigned long pfn; - memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); + /* Clear the "free"/"unsafe" bit for all PFNs */ + memory_bm_position_reset(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); + memory_bm_clear_current(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); } + + /* Mark pages that correspond to the "original" PFNs as "unsafe" */ + duplicate_memory_bitmap(free_pages_map, bm); + + allocated_unsafe_pages = 0; } static int check_header(struct swsusp_info *info) @@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info) } /** - * load header - check the image header and copy data from it + * load header - Check the image header and copy the data from it. */ - -static int -load_header(struct swsusp_info *info) +static int load_header(struct swsusp_info *info) { int error; @@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info) } /** - * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - * the corresponding bit in the memory bitmap @bm + * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. + * @bm: Memory bitmap. + * @buf: Area of memory containing the PFNs. + * + * For each element of the array pointed to by @buf (1 page at a time), set the + * corresponding bit in @bm. */ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { @@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) /* Extract and buffer page key for data page (s390 only). */ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, buf[j])) + if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else return -EFAULT; @@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) return 0; } -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; - #ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that +/* + * struct highmem_pbe is used for creating the list of highmem pages that * should be restored atomically during the resume from disk, because the page * frames they have occupied before the suspend are in use. */ @@ -2120,7 +2209,8 @@ struct highmem_pbe { struct highmem_pbe *next; }; -/* List of highmem PBEs needed for restoring the highmem pages that were +/* + * List of highmem PBEs needed for restoring the highmem pages that were * allocated before the suspend and included in the suspend image, but have * also been allocated by the "resume" kernel, so their contents cannot be * written directly to their "original" page frames. @@ -2128,11 +2218,11 @@ struct highmem_pbe { static struct highmem_pbe *highmem_pblist; /** - * count_highmem_image_pages - compute the number of highmem pages in the - * suspend image. The bits in the memory bitmap @bm that correspond to the - * image pages are assumed to be set. + * count_highmem_image_pages - Compute the number of highmem pages in the image. + * @bm: Memory bitmap. + * + * The bits in @bm that correspond to image pages are assumed to be set. */ - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { unsigned long pfn; @@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) return cnt; } -/** - * prepare_highmem_image - try to allocate as many highmem pages as - * there are highmem image pages (@nr_highmem_p points to the variable - * containing the number of highmem image pages). The pages that are - * "safe" (ie. will not be overwritten when the suspend image is - * restored) have the corresponding bits set in @bm (it must be - * unitialized). - * - * NOTE: This function should not be called if there are no highmem - * image pages. - */ - static unsigned int safe_highmem_pages; static struct memory_bitmap *safe_highmem_bm; -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +/** + * prepare_highmem_image - Allocate memory for loading highmem data from image. + * @bm: Pointer to an uninitialized memory bitmap structure. + * @nr_highmem_p: Pointer to the number of highmem image pages. + * + * Try to allocate as many highmem pages as there are highmem image pages + * (@nr_highmem_p points to the variable containing the number of highmem image + * pages). The pages that are "safe" (ie. will not be overwritten when the + * hibernation image is restored entirely) have the corresponding bits set in + * @bm (it must be unitialized). + * + * NOTE: This function should not be called if there are no highmem image pages. + */ +static int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { unsigned int to_alloc; @@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) return 0; } +static struct page *last_highmem_page; + /** - * get_highmem_page_buffer - for given highmem image page find the buffer - * that suspend_write_next() should set for its caller to write to. + * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. * - * If the page is to be saved to its "original" page frame or a copy of - * the page is to be made in the highmem, @buffer is returned. Otherwise, - * the copy of the page is to be made in normal memory, so the address of - * the copy is returned. + * For a given highmem image page get a buffer that suspend_write_next() should + * return to its caller to write to. * - * If @buffer is returned, the caller of suspend_write_next() will write - * the page's contents to @buffer, so they will have to be copied to the - * right location on the next call to suspend_write_next() and it is done - * with the help of copy_last_highmem_page(). For this purpose, if - * @buffer is returned, @last_highmem page is set to the page to which - * the data will have to be copied from @buffer. + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem_page is set to the page to which + * the data will have to be copied from @buffer. */ - -static struct page *last_highmem_page; - -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { struct highmem_pbe *pbe; void *kaddr; if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ last_highmem_page = page; return buffer; } - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); @@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) } /** - * copy_last_highmem_page - copy the contents of a highmem image from - * @buffer, where the caller of snapshot_write_next() has place them, - * to the right location represented by @last_highmem_page . + * copy_last_highmem_page - Copy most the most recent highmem image page. + * + * Copy the contents of a highmem image from @buffer, where the caller of + * snapshot_write_next() has stored them, to the right location represented by + * @last_highmem_page . */ - static void copy_last_highmem_page(void) { if (last_highmem_page) { @@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - return 0; -} +static inline int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { return 0; } -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static inline void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { return ERR_PTR(-EINVAL); } @@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; } static inline void free_highmem_data(void) {} #endif /* CONFIG_HIGHMEM */ +#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) + /** - * prepare_image - use the memory bitmap @bm to mark the pages that will - * be overwritten in the process of restoring the system memory state - * from the suspend image ("unsafe" pages) and allocate memory for the - * image. + * prepare_image - Make room for loading hibernation image. + * @new_bm: Unitialized memory bitmap structure. + * @bm: Memory bitmap with unsafe pages marked. + * + * Use @bm to mark the pages that will be overwritten in the process of + * restoring the system memory state from the suspend image ("unsafe" pages) + * and allocate memory for the image. * - * The idea is to allocate a new memory bitmap first and then allocate - * as many pages as needed for the image data, but not to assign these - * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a lists of "safe" pages that will be used - * later. On systems with high memory a list of "safe" highmem pages is - * also created. + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for image data, but without specifying what those + * pages will be used for just yet. Instead, we mark them all as allocated and + * create a lists of "safe" pages to be used later. On systems with high + * memory a list of "safe" highmem pages is created too. */ - -#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { unsigned int nr_pages, nr_highmem; - struct linked_page *sp_list, *lp; + struct linked_page *lp; int error; /* If there is no highmem, the buffer will not be necessary */ @@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) buffer = NULL; nr_highmem = count_highmem_image_pages(bm); - error = mark_unsafe_pages(bm); - if (error) - goto Free; + mark_unsafe_pages(bm); error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); if (error) @@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) if (error) goto Free; } - /* Reserve some safe pages for potential later use. + /* + * Reserve some safe pages for potential later use. * * NOTE: This way we make sure there will be enough safe pages for the * chain_alloc() in get_buffer(). It is a bit wasteful, but * nr_copy_pages cannot be greater than 50% of the memory anyway. + * + * nr_copy_pages cannot be less than allocated_unsafe_pages too. */ - sp_list = NULL; - /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { @@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) error = -ENOMEM; goto Free; } - lp->next = sp_list; - sp_list = lp; + lp->next = safe_pages_list; + safe_pages_list = lp; nr_pages--; } /* Preallocate memory for the image */ - safe_pages_list = NULL; nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); @@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } - /* Free the reserved safe pages so that chain_alloc() can use them */ - while (sp_list) { - lp = sp_list->next; - free_image_page(sp_list, PG_UNSAFE_CLEAR); - sp_list = lp; - } return 0; Free: @@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /** - * get_buffer - compute the address that snapshot_write_next() should - * set for its caller to write to. + * get_buffer - Get the address to store the next image data page. + * + * Get the address that snapshot_write_next() should return to its caller to + * write to. */ - static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; @@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return get_highmem_page_buffer(page, ca); if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ return page_address(page); - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct pbe)); @@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) } /** - * snapshot_write_next - used for writing the system memory snapshot. + * snapshot_write_next - Get the address to store the next image page. + * @handle: Snapshot handle structure to guide the writing. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the "end of file" condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the "end of file" condition. Negative + * numbers are returned on errors, in which cases the structure pointed to by + * @handle is not updated and should not be used any more. */ - int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; @@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + safe_pages_list = NULL; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); if (error) return error; @@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); if (error) @@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle) copy_last_highmem_page(); /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); + hibernate_restore_protect_page(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle) } /** - * snapshot_write_finalize - must be called after the last call to - * snapshot_write_next() in case the last page in the image happens - * to be a highmem page and its contents should be stored in the - * highmem. Additionally, it releases the memory that will not be - * used any more. + * snapshot_write_finalize - Complete the loading of a hibernation image. + * + * Must be called after the last call to snapshot_write_next() in case the last + * page in the image happens to be a highmem page and its contents should be + * stored in highmem. Additionally, it recycles bitmap memory that's not + * necessary any more. */ - void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); page_key_free(); - /* Free only if we have loaded the image entirely */ + hibernate_restore_protect_page(handle->buffer); + /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + memory_bm_recycle(&orig_bm); free_highmem_data(); } } @@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle) #ifdef CONFIG_HIGHMEM /* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) +static inline void swap_two_pages_data(struct page *p1, struct page *p2, + void *buf) { void *kaddr1, *kaddr2; @@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) } /** - * restore_highmem - for each highmem page that was allocated before - * the suspend and included in the suspend image, and also has been - * allocated by the "resume" kernel swap its current (ie. "before - * resume") contents with the previous (ie. "before suspend") one. + * restore_highmem - Put highmem image pages into their original locations. + * + * For each highmem page that was in use before hibernation and is included in + * the image, and also has been allocated by the "restore" kernel, swap its + * current contents with the previous (ie. "before hibernation") ones. * - * If the resume eventually fails, we can call this function once - * again and restore the "before resume" highmem state. + * If the restore eventually fails, we can call this function once again and + * restore the highmem state as seen by the restore kernel. */ - int restore_highmem(void) { struct highmem_pbe *pbe = highmem_pblist; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5b70d64b871e..0acab9d7f96f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -266,16 +266,18 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error; + int error, nr_calls = 0; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Finish; + } trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); + __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); pm_restore_console(); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 160e1006640d..a3b1e617bcdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(int rw, pgoff_t page_off, void *addr, +static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); @@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; + bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", @@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio->bi_end_io = hib_end_io; bio->bi_private = hb; atomic_inc(&hb->count); - submit_bio(rw, bio); + submit_bio(bio); } else { - error = submit_bio_wait(rw, bio); + error = submit_bio_wait(bio); bio_put(bio); } @@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); error = -ENODEV; @@ -348,6 +350,12 @@ static int swsusp_swap_check(void) if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); + /* + * Update the resume device to the one actually used, + * so the test_resume mode can use it in case it is + * invoked from hibernate() to test the snapshot. + */ + swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } @@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, + tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1525,7 +1534,8 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(READ_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_READ, READ_SYNC, + swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1533,7 +1543,8 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -1577,10 +1588,12 @@ int swsusp_unmark(void) { int error; - hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error; + int error, nr_calls = 0; if (!hibernation_available()) return -EPERM; @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; data->free_bitmaps = false; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); } else { /* * Resuming. We may need to wait for the image device to @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } + } else + nr_calls--; + if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) atomic_inc(&snapshot_device_available); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d400434af6b2..6d86ab6ec2c9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -253,7 +253,6 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && (rnp == rnp_root || ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && mutex_trylock(&rsp->exp_mutex)) goto fastpath; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105f529a..5c883fe8e440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1937,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * chain to provide order. Instead we do: * * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_acquire(!X->on_cpu) + * 2) smp_cond_load_acquire(!X->on_cpu) * * Example: * @@ -1948,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * sched-out X * smp_store_release(X->on_cpu, 0); * - * smp_cond_acquire(!X->on_cpu); + * smp_cond_load_acquire(&X->on_cpu, !VAL); * X->state = WAKING * set_task_cpu(X,2) * @@ -1974,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * This means that any means of doing remote wakeups must order the CPU doing * the wakeup against the CPU the task is going to end up running on. This, * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). * */ @@ -2047,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. */ - smp_cond_acquire(!p->on_cpu); + smp_cond_load_acquire(&p->on_cpu, !VAL); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + init_entity_runnable_average(&p->se); /* * The child is not yet in the pid-hash so no cgroup attach races, @@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); @@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -4752,7 +4762,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -5394,13 +5405,15 @@ void idle_task_exit(void) /* * Since this CPU is going 'away' for a while, fold any nr_active delta * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. + * nr_active count is stable. We need to take the teardown thread which + * is calling this into account, so we hand in adjust = 1 to the load + * calculation. * * Also see the comment "Global load-average calculations". */ static void calc_load_migrate(struct rq *rq) { - long delta = calc_load_fold_active(rq); + long delta = calc_load_fold_active(rq, 1); if (delta) atomic_long_add(delta, &calc_load_tasks); } @@ -7231,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - account_reset_rq(rq); update_max_interval(); } @@ -7711,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ @@ -7739,27 +7753,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(tsk, &rf); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -7772,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8204,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8223,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; -enum cpuacct_usage_index { - CPUACCT_USAGE_USER, /* ... user mode */ - CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ - - CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", }; struct cpuacct_usage { - u64 usages[CPUACCT_USAGE_NRUSAGE]; + u64 usages[CPUACCT_STAT_NSTATS]; }; /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; /* - * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of suages. */ - BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + BUG_ON(index > CPUACCT_STAT_NSTATS); #ifndef CONFIG_64BIT /* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_USAGE_NRUSAGE) { + if (index == CPUACCT_STAT_NSTATS) { int i = 0; data = 0; - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) data += cpuusage->usages[i]; } else { data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* return total cpu usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_USER); + return __cpuusage_read(css, CPUACCT_STAT_USER); } static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); } static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, } static int __cpuacct_percpu_seq_show(struct seq_file *m, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; @@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); } static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); } static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; + memset(val, 0, sizeof(val)); for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } @@ -302,6 +330,10 @@ static struct cftype files[] = { .seq_show = cpuacct_percpu_sys_seq_show, }, { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -316,11 +348,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_USAGE_SYSTEM; + int index = CPUACCT_STAT_SYSTEM; struct pt_regs *regs = task_pt_regs(tsk); if (regs && user_mode(regs)) - index = CPUACCT_USAGE_USER; + index = CPUACCT_STAT_USER; rcu_read_lock(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 14c4aa25cc45..a84641b222c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,6 +47,8 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cached_raw_freq; + /* The fields below are only needed when sharing a policy. */ unsigned long util; unsigned long max; @@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @policy: cpufreq policy object to compute the new frequency for. + * @sg_cpu: schedutil cpu object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq = C * curr_freq * util_raw / max * * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct cpufreq_policy *policy, - unsigned long util, unsigned long max) +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - return (freq + (freq >> 2)) * util / max; + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); } static void sugov_update_single(struct update_util_data *hook, u64 time, @@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, return; next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : - get_next_freq(policy, util, max); + get_next_freq(sg_cpu, util, max); sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, unsigned long util, unsigned long max) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; @@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, } } - return get_next_freq(policy, util, max); + return get_next_freq(sg_cpu, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_policy, util, max); + next_f = sugov_next_freq_shared(sg_cpu, util, max); sugov_update_commit(sg_policy, time, next_f); } @@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy) return ret; } -static int sugov_exit(struct cpufreq_policy *policy) +static void sugov_exit(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; struct sugov_tunables *tunables = sg_policy->tunables; @@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); - return 0; } static int sugov_start(struct cpufreq_policy *policy) @@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->util = ULONG_MAX; sg_cpu->max = 0; sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, sugov_update_shared); } else { @@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy) return 0; } -static int sugov_stop(struct cpufreq_policy *policy) +static void sugov_stop(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; @@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy) irq_work_sync(&sg_policy->irq_work); cancel_work_sync(&sg_policy->work); - return 0; } -static int sugov_limits(struct cpufreq_policy *policy) +static void sugov_limits(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; if (!policy->fast_switch_enabled) { mutex_lock(&sg_policy->work_lock); - - if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, - CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, - CPUFREQ_RELATION_L); - + cpufreq_policy_apply_limits(policy); mutex_unlock(&sg_policy->work_lock); } sg_policy->need_freq_update = true; - return 0; -} - -int sugov_governor(struct cpufreq_policy *policy, unsigned int event) -{ - if (event == CPUFREQ_GOV_POLICY_INIT) { - return sugov_init(policy); - } else if (policy->governor_data) { - switch (event) { - case CPUFREQ_GOV_POLICY_EXIT: - return sugov_exit(policy); - case CPUFREQ_GOV_START: - return sugov_start(policy); - case CPUFREQ_GOV_STOP: - return sugov_stop(policy); - case CPUFREQ_GOV_LIMITS: - return sugov_limits(policy); - } - } - return -EINVAL; } static struct cpufreq_governor schedutil_gov = { .name = "schedutil", - .governor = sugov_governor, .owner = THIS_MODULE, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, }; static int __init sugov_module_init(void) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..1934f658c036 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); */ void irqtime_account_irq(struct task_struct *curr) { - unsigned long flags; s64 delta; int cpu; if (!sched_clock_irqtime) return; - local_irq_save(flags); - cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t irq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; + irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + cpustat[CPUTIME_IRQ]; + irq_cputime = min(irq_cputime, maxtime); + cpustat[CPUTIME_IRQ] += irq_cputime; local_irq_restore(flags); - return ret; + return irq_cputime; } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t softirq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; + softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + cpustat[CPUTIME_SOFTIRQ]; + softirq_cputime = min(softirq_cputime, maxtime); + cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; local_irq_restore(flags); - return ret; + return softirq_cputime; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ + return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ + return 0; +} + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ static inline void task_group_account_field(struct task_struct *p, int index, @@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { + cputime_t steal_cputime; u64 steal; - unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - /* - * steal is in nsecs but our caller is expecting steal - * time in jiffies. Lets cast the result to jiffies - * granularity and account the rest on the next rounds. - */ - steal_jiffies = nsecs_to_jiffies(steal); - this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); + steal_cputime = min(nsecs_to_cputime(steal), maxtime); + account_steal_time(steal_cputime); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); - account_steal_time(jiffies_to_cputime(steal_jiffies)); - return steal_jiffies; + return steal_cputime; } #endif - return false; + return 0; +} + +/* + * Account how much elapsed time was spent in steal, irq, or softirq time. + */ +static inline cputime_t account_other_time(cputime_t max) +{ + cputime_t accounted; + + accounted = steal_account_process_time(max); + + if (accounted < max) + accounted += irqtime_account_hi_update(max - accounted); + + if (accounted < max) + accounted += irqtime_account_si_update(max - accounted); + + return accounted; } /* @@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); - u64 cputime = (__force u64) cputime_one_jiffy; - u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 cputime = (__force u64) cputime_one_jiffy * ticks; + cputime_t scaled, other; - if (steal_account_process_tick()) + /* + * When returning from idle, many ticks can get accounted at + * once, including some ticks of steal, irq, and softirq time. + * Subtract those ticks from the amount of time accounted to + * idle, or potentially user or system time. Due to rounding, + * other time can exceed ticks occasionally. + */ + other = account_other_time(cputime); + if (other >= cputime) return; + cputime -= other; + scaled = cputime_to_scaled(cputime); - cputime *= ticks; - scaled *= ticks; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += cputime; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += cputime; - } else if (this_cpu_ksoftirqd() == p) { + if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. @@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev) } #endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_common_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { - if (!in_interrupt()) { - /* - * If we interrupted user, context_tracking_in_user() - * is 1 because the context tracking don't hook - * on irq entry/exit. This way we know if - * we need to flush user time on kernel entry. - */ - if (context_tracking_in_user()) { - vtime_account_user(tsk); - return; - } - - if (is_idle_task(tsk)) { - vtime_account_idle(tsk); - return; - } - } - vtime_account_system(tsk); + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; @@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t cputime, scaled, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + cputime = cputime_one_jiffy; + steal = steal_account_process_time(cputime); + + if (steal >= cputime) return; + cputime -= steal; + scaled = cputime_to_scaled(cputime); + if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); + account_idle_time(cputime); } /* @@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + cputime_t delta, other; + delta = jiffies_to_cputime(now - tsk->vtime_snap); + other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return delta - other; } static void __vtime_account_system(struct task_struct *tsk) @@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_gen_account_irq_exit(struct task_struct *tsk) -{ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); - if (context_tracking_in_user()) - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); -} - void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c8c5d2d48424..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se) } sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. - */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1477,7 +1492,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1503,16 +1517,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { @@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) update_tg_load_avg(cfs_rq, 0); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -2998,6 +3029,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); @@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } @@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, *cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -4476,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); - - raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); - raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8576,6 +8632,23 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + sync_throttle(tg, i); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c5aeedf4e93a..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -201,6 +201,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -long calc_load_fold_active(struct rq *this_rq) +long calc_load_fold_active(struct rq *this_rq, long adjust) { long nr_active, delta = 0; - nr_active = this_rq->nr_running; + nr_active = this_rq->nr_running - adjust; nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { @@ -188,7 +188,7 @@ void calc_load_enter_idle(void) * We're going into NOHZ mode, if there's any pending delta, fold it * into the pending idle delta. */ - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq) if (time_before(jiffies, this_rq->calc_load_update)) return; - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7cbeb92a1cb9..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,7 +28,7 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); -extern long calc_load_fold_active(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_SMP extern void cpu_load_update_active(struct rq *this_rq); @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -437,7 +438,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the smp_cond_acquire() in try_to_wake_up(). + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1246,8 +1247,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; @@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -} diff --git a/kernel/smp.c b/kernel/smp.c index 7180491c9678..3aa642d39c03 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -92,7 +92,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35f0dcb1cb4f..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -108,7 +108,6 @@ void task_work_run(void) * fail, but it can play with *work and other entries. */ raw_spin_unlock_wait(&task->pi_lock); - smp_mb(); do { next = work->next; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) #endif #ifdef CONFIG_SYSFS -struct bus_type clockevents_subsys = { +static struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); - if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; + now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a196e08324e7..3b65746c7f15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2188,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void) return now; } +EXPORT_SYMBOL(get_monotonic_coarse64); /* * Must hold jiffies_lock diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -542,6 +542,7 @@ config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP + select TRACING default n help Hist triggers allow one or more arbitrary trace event fields diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9aef8654e90d..fb345cd11883 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk) static void trace_note_time(struct blk_trace *bt) { - struct timespec now; + struct timespec64 now; unsigned long flags; u32 words[2]; - getnstimeofday(&now); - words[0] = now.tv_sec; + /* need to check user space to see if this breaks in y2038 or y2106 */ + ktime_get_real_ts64(&now); + words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); @@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; #define BLK_TC_RAHEAD BLK_TC_AHEAD +#define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ @@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) + int op, int op_flags, u32 what, int error, int pdu_len, + void *pdu_data) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, RAHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - what |= MASK_TC_BIT(rw, FLUSH); - what |= MASK_TC_BIT(rw, FUA); + what |= ddir_act[op_is_write(op) ? WRITE : READ]; + what |= MASK_TC_BIT(op_flags, SYNC); + what |= MASK_TC_BIT(op_flags, RAHEAD); + what |= MASK_TC_BIT(op_flags, META); + what |= MASK_TC_BIT(op_flags, PREFLUSH); + what |= MASK_TC_BIT(op_flags, FUA); + if (op == REQ_OP_DISCARD) + what |= BLK_TC_ACT(BLK_TC_DISCARD); + if (op == REQ_OP_FLUSH) + what |= BLK_TC_ACT(BLK_TC_FLUSH); pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, what, error, 0, NULL); + bio_op(bio), bio->bi_rw, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, + NULL); } } @@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL); } } @@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore, __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - bio->bi_error, sizeof(rpdu), &rpdu); + bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, + BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + &rpdu); } } @@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r); } @@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, sizeof(r), &r); } @@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q, return; if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) { int i = 0; - if (rw & REQ_FLUSH) + if (rw & REQ_PREFLUSH) rwbs[i++] = 'F'; - if (rw & WRITE) + switch (op) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; - else if (rw & REQ_DISCARD) + break; + case REQ_OP_DISCARD: + rwbs[i++] = 'D'; + break; + case REQ_OP_SECURE_ERASE: rwbs[i++] = 'D'; - else if (bytes) + rwbs[i++] = 'E'; + break; + case REQ_OP_FLUSH: + rwbs[i++] = 'F'; + break; + case REQ_OP_READ: rwbs[i++] = 'R'; - else + break; + default: rwbs[i++] = 'N'; + } if (rw & REQ_FUA) rwbs[i++] = 'F'; @@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i++] = 'S'; if (rw & REQ_META) rwbs[i++] = 'M'; - if (rw & REQ_SECURE) - rwbs[i++] = 'E'; rwbs[i] = '\0'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 26f603da7e26..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *unsafe_ptr = (void *) (long) r1; + void *src = (void *) (long) r2; + int size = (int) r3; + + /* + * Ensure we're in user context which is safe for the helper to + * run. This helper has no business in a kthread. + * + * access_ok() should prevent writing to non-user memory, but in + * some situations (nommu, temporary switch, etc) access_ok() does + * not provide enough validation, hence the check on KERNEL_DS. + */ + + if (unlikely(in_interrupt() || + current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EPERM; + if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + return -EPERM; + + return probe_kernel_write(unsafe_ptr, src, size); +} + +static const struct bpf_func_proto bpf_probe_write_user_proto = { + .func = bpf_probe_write_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *bpf_get_probe_write_proto(void) +{ + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", + current->comm, task_pid_nr(current)); + + return &bpf_probe_write_user_proto; +} + /* * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed @@ -188,31 +231,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (!ee) return -ENOENT; - event = file->private_data; - - /* make sure event is local and doesn't have pmu::count */ - if (event->oncpu != smp_processor_id() || - event->pmu->count) - return -EINVAL; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && event->attr.type != PERF_TYPE_RAW)) return -EINVAL; + /* make sure event is local and doesn't have pmu::count */ + if (unlikely(event->oncpu != cpu || event->pmu->count)) + return -EINVAL; + /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -229,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_raw_record *raw) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - void *data = (void *) (long) r4; struct perf_sample_data sample_data; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; - struct perf_raw_record raw = { - .size = size, - .data = data, - }; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; if (index == BPF_F_CURRENT_CPU) - index = raw_smp_processor_id(); + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (!ee) return -ENOENT; - event = file->private_data; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; - if (unlikely(event->oncpu != smp_processor_id())) + if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = &raw; + sample_data.raw = raw; perf_event_output(event, &sample_data, regs); return 0; } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + void *data = (void *)(long) r4; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, @@ -283,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + { + .next = ctx_size ? &frag : NULL, + }, + .size = meta_size, + .data = meta, + }, + }; perf_fetch_caller_regs(regs); - return bpf_perf_event_output((long)regs, r2, flags, r4, size); + return __bpf_perf_event_output(regs, map, flags, &raw); +} + +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return (long) current; } -static const struct bpf_func_proto bpf_event_output_proto = { - .func = bpf_event_output, +static const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, }; -const struct bpf_func_proto *bpf_get_event_output_proto(void) -{ - return &bpf_event_output_proto; -} - static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -325,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: @@ -335,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + case BPF_FUNC_probe_write_user: + return bpf_get_probe_write_proto(); default: return NULL; } @@ -356,18 +428,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) return false; - - /* only read is allowed */ if (type != BPF_READ) return false; - - /* disallow misaligned access */ if (off % size != 0) return false; - return true; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ static struct ftrace_ops *set_function_trace_op; -/* List for set_ftrace_pid's pids. */ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { - struct list_head list; - struct pid *pid; -}; - -static bool ftrace_pids_enabled(void) +static bool ftrace_pids_enabled(struct ftrace_ops *ops) { - return !list_empty(&ftrace_pids); + struct trace_array *tr; + + if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) + return false; + + tr = ops->private; + + return tr->function_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (!test_tsk_trace_trace(current)) + struct trace_array *tr = op->private; + + if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) return; op->saved_func(ip, parent_ip, op, regs); @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; - if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + if (ftrace_pids_enabled(ops)) ops->func = ftrace_pid_func; ftrace_update_trampoline(ops); @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) static void ftrace_update_pid_func(void) { - bool enabled = ftrace_pids_enabled(); struct ftrace_ops *op; /* Only do something if we are tracing something */ @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) do_for_each_ftrace_op(op, ftrace_ops_list) { if (op->flags & FTRACE_OPS_FL_PID) { - op->func = enabled ? ftrace_pid_func : - op->saved_func; + op->func = ftrace_pids_enabled(op) ? + ftrace_pid_func : op->saved_func; ftrace_update_trampoline(op); } } while_for_each_ftrace_op(op); @@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) return ops->func; } -static void clear_ftrace_swapper(void) +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) { - struct task_struct *p; - int cpu; + struct trace_array *tr = data; + struct trace_pid_list *pid_list; - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - clear_tsk_trace_trace(p); - } - put_online_cpus(); -} - -static void set_ftrace_swapper(void) -{ - struct task_struct *p; - int cpu; + pid_list = rcu_dereference_sched(tr->function_pids); - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - set_tsk_trace_trace(p); - } - put_online_cpus(); + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, next)); } -static void clear_ftrace_pid(struct pid *pid) +static void clear_ftrace_pids(struct trace_array *tr) { - struct task_struct *p; + struct trace_pid_list *pid_list; + int cpu; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - clear_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); + pid_list = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + if (!pid_list) + return; - put_pid(pid); -} + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); -static void set_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - set_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); -} + rcu_assign_pointer(tr->function_pids, NULL); -static void clear_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - clear_ftrace_swapper(); - else - clear_ftrace_pid(pid); -} + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); -static void set_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - set_ftrace_swapper(); - else - set_ftrace_pid(pid); + trace_free_pid_list(pid_list); } -static int ftrace_pid_add(int p) +static void ftrace_pid_reset(struct trace_array *tr) { - struct pid *pid; - struct ftrace_pid *fpid; - int ret = -EINVAL; - mutex_lock(&ftrace_lock); - - if (!p) - pid = ftrace_swapper_pid; - else - pid = find_get_pid(p); - - if (!pid) - goto out; - - ret = 0; - - list_for_each_entry(fpid, &ftrace_pids, list) - if (fpid->pid == pid) - goto out_put; - - ret = -ENOMEM; - - fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); - if (!fpid) - goto out_put; - - list_add(&fpid->list, &ftrace_pids); - fpid->pid = pid; - - set_ftrace_pid_task(pid); + clear_ftrace_pids(tr); ftrace_update_pid_func(); - ftrace_startup_all(0); mutex_unlock(&ftrace_lock); - return 0; - -out_put: - if (pid != ftrace_swapper_pid) - put_pid(pid); - -out: - mutex_unlock(&ftrace_lock); - return ret; } -static void ftrace_pid_reset(void) -{ - struct ftrace_pid *fpid, *safe; - - mutex_lock(&ftrace_lock); - list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { - struct pid *pid = fpid->pid; - - clear_ftrace_pid_task(pid); - - list_del(&fpid->list); - kfree(fpid); - } - - ftrace_update_pid_func(); - ftrace_startup_all(0); - - mutex_unlock(&ftrace_lock); -} +/* Greater than any max PID */ +#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) static void *fpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); - if (!ftrace_pids_enabled() && (!*pos)) - return (void *) 1; + pid_list = rcu_dereference_sched(tr->function_pids); - return seq_list_start(&ftrace_pids, *pos); + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); } static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) { - if (v == (void *)1) + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + + if (v == FTRACE_NO_PIDS) return NULL; - return seq_list_next(v, &ftrace_pids, pos); + return trace_pid_next(pid_list, v, pos); } static void fpid_stop(struct seq_file *m, void *p) + __releases(RCU) { + rcu_read_unlock_sched(); mutex_unlock(&ftrace_lock); } static int fpid_show(struct seq_file *m, void *v) { - const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - - if (v == (void *)1) { + if (v == FTRACE_NO_PIDS) { seq_puts(m, "no pid\n"); return 0; } - if (fpid->pid == ftrace_swapper_pid) - seq_puts(m, "swapper tasks\n"); - else - seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - - return 0; + return trace_pid_show(m, v); } static const struct seq_operations ftrace_pid_sops = { @@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { static int ftrace_pid_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(); + ftrace_pid_reset(tr); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, &ftrace_pid_sops); + if (ret < 0) { + trace_array_put(tr); + } else { + m = file->private_data; + /* copy tr over to seq ops */ + m->private = tr; + } return ret; } +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->function_pids, + mutex_is_locked(&ftrace_lock)); + + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, current)); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64], *tmp; - long val; - int ret; + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list; + ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; + if (!cnt) + return 0; + + mutex_lock(&ftrace_lock); + + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + rcu_assign_pointer(tr->function_pids, pid_list); - buf[cnt] = 0; + if (filtered_pids) { + synchronize_sched(); + trace_free_pid_list(filtered_pids); + } else if (pid_list) { + /* Register a probe to set whether to ignore the tracing of a task */ + register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + } /* - * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" - * to clean the filter quietly. + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. */ - tmp = strstrip(buf); - if (strlen(tmp) == 0) - return 1; + on_each_cpu(ignore_task_cpu, tr, 1); - ret = kstrtol(tmp, 10, &val); - if (ret < 0) - return ret; + ftrace_update_pid_func(); + ftrace_startup_all(0); + out: + mutex_unlock(&ftrace_lock); - ret = ftrace_pid_add(val); + if (ret > 0) + *ppos += ret; - return ret ? ret : cnt; + return ret; } static int ftrace_pid_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) - seq_release(inode, file); + struct trace_array *tr = inode->i_private; - return 0; + trace_array_put(tr); + + return seq_release(inode, file); } static const struct file_operations ftrace_pid_fops = { @@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_tracefs(void) +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct dentry *d_tracer; + trace_create_file("set_ftrace_pid", 0644, d_tracer, + tr, &ftrace_pid_fops); +} - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer) +{ + /* Only the top level directory has the dyn_tracefs and profile */ + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); ftrace_init_dyn_tracefs(d_tracer); - - trace_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - ftrace_profile_tracefs(d_tracer); - - return 0; } -fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..dade4c9559cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -25,7 +25,7 @@ #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> +#include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> @@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } +void trace_free_pid_list(struct trace_pid_list *pid_list) +{ + vfree(pid_list->pids); + kfree(pid_list); +} + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (search_pid >= filtered_pids->pid_max) + return false; + + return test_bit(search_pid, filtered_pids->pids); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + return !trace_find_filtered_pid(filtered_pids, task->pid); +} + +/** + * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + loff_t l = 0; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) + return NULL; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret = 0; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return -ENOMEM; + + pid_list->pid_max = READ_ONCE(pid_max); + + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + for_each_set_bit(pid, filtered_pids->pids, + filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + nr_pids++; + } + } + + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val >= pid_list->pid_max) + break; + + pid = (pid_t)val; + + set_bit(pid, pid_list->pids); + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + trace_free_pid_list(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_free_pid_list(pid_list); + read = ret; + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); + /* + * If regs is not set, then skip the following callers: + * trace_buffer_unlock_commit_regs + * event_trigger_unlock_commit + * trace_event_buffer_commit + * trace_event_raw_event_sched_switch + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are that meaningful. + */ + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* + * Add two, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. + */ + if (!regs) + trace.skip += 2; + + /* * Since events can happen in NMIs there's no safe way to * use the per cpu ftrace_stacks. We reserve it and if an interrupt * or NMI comes in, it will just have to use the default @@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * Thise allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; - - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. - */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!percpu_buffer) + if (!buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ + this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; - - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; - - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; - - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) } out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } - out: + +out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); + ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(void *ingore) @@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void) return 0; init_tracer_tracefs(&global_trace, d_tracer); + ftrace_init_tracefs_toplevel(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -80,6 +80,12 @@ enum trace_type { FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ + filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) __packed + #include "trace_entries.h" /* @@ -156,6 +162,9 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; bool ignore_pid; +#ifdef CONFIG_FUNCTION_TRACER + bool ftrace_ignore_pid; +#endif }; struct tracer; @@ -247,6 +256,7 @@ struct trace_array { int ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; + struct trace_pid_list __rcu *function_pids; /* function tracing enabled */ int function_enabled; #endif @@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +/* PID filtering */ + +extern int pid_max; + +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, + pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); +void trace_free_pid_list(struct trace_pid_list *pid_list); +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt); + #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, @@ -821,12 +850,9 @@ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { - if (list_empty(&ftrace_pids)) - return 1; - - return test_tsk_trace_trace(task); + return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, @@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer); #else -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } @@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ @@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ -FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,6 @@ #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> @@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + fbuffer->pc--; fbuffer->trace_file = trace_file; fbuffer->event = @@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -/* Shouldn't this be in a header? */ -extern int pid_max; - -/* Returns true if found in filter */ -static bool -find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); -} - -static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) -{ - /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. - */ - if (!filtered_pids) - return false; - - return !find_filtered_pid(filtered_pids, task->pid); -} - -static void filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) -{ - if (!pid_list) - return; - - /* For forks, we only add if the forking task is listed */ - if (self) { - if (!find_filtered_pid(pid_list, self->pid)) - return; - } - - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - - /* "self" is set for forks, and NULL for exits */ - if (self) - set_bit(task->pid, pid_list->pids); - else - clear_bit(task->pid, pid_list->pids); -} - static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { @@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, NULL, task); + trace_filter_add_remove_task(pid_list, NULL, task); } static void @@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); - filter_add_remove_task(pid_list, self, task); + trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) @@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, prev) && - ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, prev) && + trace_ignore_this_task(pid_list, next)); } static void @@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, next)); + trace_ignore_this_task(pid_list, next)); } static void @@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, task)); + trace_ignore_this_task(pid_list, task)); } static void @@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - vfree(pid_list->pids); - kfree(pid_list); + trace_free_pid_list(pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr) @@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - unsigned long pid = (unsigned long)v; - - (*pos)++; - - /* pid already is +1 of the actual prevous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); - /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); - - return NULL; + return trace_pid_next(pid_list, v, pos); } static void *p_start(struct seq_file *m, loff_t *pos) @@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; - unsigned long pid; - loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) if (!pid_list) return NULL; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) - return NULL; - - /* Return pid + 1 so that zero can be the exit value */ - for (pid++; pid && l < *pos; - pid = (unsigned long)p_next(m, (void *)pid, &l)) - ; - return (void *)pid; + return trace_pid_start(pid_list, pos); } static void p_stop(struct seq_file *m, void *p) @@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int p_show(struct seq_file *m, void *v) -{ - unsigned long pid = (unsigned long)v - 1; - - seq_printf(m, "%lu\n", pid); - return 0; -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - ignore_this_task(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static ssize_t @@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; - struct trace_parser parser; - unsigned long val; - loff_t this_pos; - ssize_t read = 0; - ssize_t ret = 0; - pid_t pid; - int nr_pids = 0; + ssize_t ret; if (!cnt) return 0; @@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) - return -ENOMEM; - mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); - /* - * Always recreate a new array. The write is an all or nothing - * operation. Always create a new array when adding new pids by - * the user. If the operation fails, then the current list is - * not modified. - */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) { - read = -ENOMEM; - goto out; - } - pid_list->pid_max = READ_ONCE(pid_max); - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - read = -ENOMEM; - goto out; - } - if (filtered_pids) { - /* copy the current bits to the new max */ - pid = find_first_bit(filtered_pids->pids, - filtered_pids->pid_max); - while (pid < filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); - pid = find_next_bit(filtered_pids->pids, - filtered_pids->pid_max, - pid + 1); - nr_pids++; - } - } - - while (cnt > 0) { - - this_pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &this_pos); - if (ret < 0 || !trace_parser_loaded(&parser)) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - parser.buffer[parser.idx] = 0; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - if (val >= pid_list->pid_max) - break; - - pid = (pid_t)val; - - set_bit(pid, pid_list->pids); - nr_pids++; - - trace_parser_clear(&parser); - ret = 0; - } - trace_parser_put(&parser); - - if (ret < 0) { - vfree(pid_list->pids); - kfree(pid_list); - read = ret; + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) goto out; - } - if (!nr_pids) { - /* Cleared the list of pids */ - vfree(pid_list->pids); - kfree(pid_list); - read = ret; - if (!filtered_pids) - goto out; - pid_list = NULL; - } rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - - vfree(filtered_pids->pids); - kfree(filtered_pids); - } else { + trace_free_pid_list(filtered_pids); + } else if (pid_list) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. @@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&event_mutex); - ret = read; - if (read > 0) - *ppos += read; + if (ret > 0) + *ppos += ret; return ret; } @@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = { static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, - .show = p_show, + .show = trace_pid_show, .stop = p_stop, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack verision is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (!ftrace_trace_task(current)) + if (!ftrace_trace_task(tr)) return 0; /* trace it when it is-nested-in or is a function enabled. */ @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1; + /* + * Stop here if tracing_threshold is set. We only write function return + * events to the ring buffer. + */ + if (tracing_thresh) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ - if (tracing_thresh) - return 1; - else - return trace_graph_entry(trace); -} - static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long flags, int pc) @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) set_graph_array(tr); if (tracing_thresh) ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_thresh_entry); + &trace_graph_entry); else ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) + * $comm : fetch current task comm * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", dev->bus->number, dev->devfn, dev->vendor, dev->device, dev->irq); - /* - * XXX: is pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; + end = dev->resource[i].end; trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } +void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, + void *data, void *dest) +{ + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + long ret; + + if (!maxlen) + return; + + ret = strlcpy(dst, current->comm, maxlen); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); + +void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + *(u32 *)dest = strlen(current->comm) + 1; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); + static const struct fetch_type *find_fetch_type(const char *type, const struct fetch_type *ftbl) { @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else ret = -EINVAL; + } else if (strcmp(arg, "comm") == 0) { + if (strcmp(t->name, "string") != 0 && + strcmp(t->name, "string_size") != 0) + return -EINVAL; + f->fn = t->fetch[FETCH_MTD_comm]; } else ret = -EINVAL; @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } + /* + * The default type of $comm should be "string", and it can't be + * dereferenced. + */ + if (!t && strcmp(arg, "$comm") == 0) + t = "string"; parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -102,6 +102,7 @@ enum { FETCH_MTD_reg = 0, FETCH_MTD_stack, FETCH_MTD_retval, + FETCH_MTD_comm, FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL +/* comm only makes sense as a string */ +#define fetch_comm_u8 NULL +#define fetch_comm_u16 NULL +#define fetch_comm_u32 NULL +#define fetch_comm_u64 NULL +DECLARE_FETCH_FUNC(comm, string); +DECLARE_FETCH_FUNC(comm, string_size); + /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. @@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) ASSIGN_FETCH_FUNC(reg, ftype), \ ASSIGN_FETCH_FUNC(stack, ftype), \ ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(comm, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c9dd5fbdbf33..ef071ca73fc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq) /** * show_workqueue_state - dump workqueue state * - * Called from a sysrq handler and prints out all busy workqueues and - * pools. + * Called from a sysrq handler or try_to_freeze_tasks() and prints out + * all busy workqueues and pools. */ void show_workqueue_state(void) { @@ -4600,15 +4600,11 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; - /* is @cpu the only online CPU? */ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); - if (cpumask_weight(&cpumask) != 1) - return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } int workqueue_prepare_cpu(unsigned int cpu) |