Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c  163
-rw-r--r--  kernel/bpf/core.c  9
-rw-r--r--  kernel/bpf/helpers.c  2
-rw-r--r--  kernel/bpf/inode.c  4
-rw-r--r--  kernel/bpf/stackmap.c  2
-rw-r--r--  kernel/bpf/syscall.c  66
-rw-r--r--  kernel/bpf/verifier.c  26
-rw-r--r--  kernel/cgroup.c  48
-rw-r--r--  kernel/cgroup_pids.c  34
-rw-r--r--  kernel/cpuset.c  9
-rw-r--r--  kernel/events/callchain.c  14
-rw-r--r--  kernel/events/core.c  226
-rw-r--r--  kernel/events/internal.h  25
-rw-r--r--  kernel/exit.c  84
-rw-r--r--  kernel/fork.c  24
-rw-r--r--  kernel/freezer.c  2
-rw-r--r--  kernel/gcov/gcc_4_7.c  2
-rw-r--r--  kernel/irq/Makefile  1
-rw-r--r--  kernel/irq/affinity.c  61
-rw-r--r--  kernel/irq/chip.c  83
-rw-r--r--  kernel/irq/handle.c  18
-rw-r--r--  kernel/irq/internals.h  4
-rw-r--r--  kernel/irq/ipi.c  4
-rw-r--r--  kernel/irq/irqdesc.c  63
-rw-r--r--  kernel/irq/irqdomain.c  94
-rw-r--r--  kernel/irq/manage.c  73
-rw-r--r--  kernel/irq/msi.c  12
-rw-r--r--  kernel/irq/proc.c  11
-rw-r--r--  kernel/jump_label.c  2
-rw-r--r--  kernel/locking/lockdep.c  13
-rw-r--r--  kernel/locking/mutex-debug.h  4
-rw-r--r--  kernel/locking/mutex.h  10
-rw-r--r--  kernel/locking/qrwlock.c  2
-rw-r--r--  kernel/locking/qspinlock.c  88
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  4
-rw-r--r--  kernel/locking/rtmutex.c  2
-rw-r--r--  kernel/locking/rwsem-xadd.c  194
-rw-r--r--  kernel/locking/rwsem.c  8
-rw-r--r--  kernel/locking/rwsem.h  52
-rw-r--r--  kernel/memremap.c  14
-rw-r--r--  kernel/power/Makefile  2
-rw-r--r--  kernel/power/console.c  8
-rw-r--r--  kernel/power/hibernate.c  107
-rw-r--r--  kernel/power/main.c  11
-rw-r--r--  kernel/power/power.h  11
-rw-r--r--  kernel/power/process.c  3
-rw-r--r--  kernel/power/snapshot.c  950
-rw-r--r--  kernel/power/suspend.c  10
-rw-r--r--  kernel/power/swap.c  39
-rw-r--r--  kernel/power/user.c  14
-rw-r--r--  kernel/printk/printk.c  5
-rw-r--r--  kernel/rcu/tree_exp.h  1
-rw-r--r--  kernel/sched/core.c  128
-rw-r--r--  kernel/sched/cpuacct.c  114
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  74
-rw-r--r--  kernel/sched/cputime.c  181
-rw-r--r--  kernel/sched/debug.c  2
-rw-r--r--  kernel/sched/fair.c  251
-rw-r--r--  kernel/sched/idle.c  4
-rw-r--r--  kernel/sched/loadavg.c  8
-rw-r--r--  kernel/sched/sched.h  25
-rw-r--r--  kernel/smp.c  2
-rw-r--r--  kernel/sysctl.c  4
-rw-r--r--  kernel/task_work.c  1
-rw-r--r--  kernel/time/clockevents.c  2
-rw-r--r--  kernel/time/posix-cpu-timers.c  1
-rw-r--r--  kernel/time/tick-sched.c  3
-rw-r--r--  kernel/time/timekeeping.c  1
-rw-r--r--  kernel/trace/Kconfig  1
-rw-r--r--  kernel/trace/blktrace.c  83
-rw-r--r--  kernel/trace/bpf_trace.c  164
-rw-r--r--  kernel/trace/ftrace.c  313
-rw-r--r--  kernel/trace/trace.c  358
-rw-r--r--  kernel/trace/trace.h  48
-rw-r--r--  kernel/trace/trace_entries.h  4
-rw-r--r--  kernel/trace/trace_events.c  219
-rw-r--r--  kernel/trace/trace_functions.c  2
-rw-r--r--  kernel/trace/trace_functions_graph.c  19
-rw-r--r--  kernel/trace/trace_kprobe.c  1
-rw-r--r--  kernel/trace/trace_mmiotrace.c  10
-rw-r--r--  kernel/trace/trace_probe.c  33
-rw-r--r--  kernel/trace/trace_probe.h  10
-rw-r--r--  kernel/workqueue.c  10
83 files changed, 3062 insertions(+), 1732 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
}
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
+
if (IS_ERR(prog))
return prog;
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+
return prog;
}
static void prog_fd_array_put_ptr(void *ptr)
{
- struct bpf_prog *prog = ptr;
-
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(ptr);
}
/* decrement refcnt of all bpf_progs that are stored in this map */
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
}
late_initcall(register_prog_array_map);
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+ struct file *map_file)
{
- bpf_fd_array_map_clear(map);
- fd_array_map_free(map);
+ struct bpf_event_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ if (ee) {
+ ee->event = perf_file->private_data;
+ ee->perf_file = perf_file;
+ ee->map_file = map_file;
+ }
+
+ return ee;
}
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
{
- struct perf_event *event;
- const struct perf_event_attr *attr;
- struct file *file;
+ struct bpf_event_entry *ee;
- file = perf_event_get(fd);
- if (IS_ERR(file))
- return file;
+ ee = container_of(rcu, struct bpf_event_entry, rcu);
+ fput(ee->perf_file);
+ kfree(ee);
+}
- event = file->private_data;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+ call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
- attr = perf_event_attrs(event);
- if (IS_ERR(attr))
- goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ const struct perf_event_attr *attr;
+ struct bpf_event_entry *ee;
+ struct perf_event *event;
+ struct file *perf_file;
- if (attr->inherit)
- goto err;
+ perf_file = perf_event_get(fd);
+ if (IS_ERR(perf_file))
+ return perf_file;
- if (attr->type == PERF_TYPE_RAW)
- return file;
+ event = perf_file->private_data;
+ ee = ERR_PTR(-EINVAL);
- if (attr->type == PERF_TYPE_HARDWARE)
- return file;
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr) || attr->inherit)
+ goto err_out;
+
+ switch (attr->type) {
+ case PERF_TYPE_SOFTWARE:
+ if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+ goto err_out;
+ /* fall-through */
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
+ /* fall-through */
+ default:
+ break;
+ }
- if (attr->type == PERF_TYPE_SOFTWARE &&
- attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return file;
-err:
- fput(file);
- return ERR_PTR(-EINVAL);
+err_out:
+ fput(perf_file);
+ return ee;
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- fput((struct file *)ptr);
+ bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+ struct file *map_file)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_event_entry *ee;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < array->map.max_entries; i++) {
+ ee = READ_ONCE(array->ptrs[i]);
+ if (ee && ee->map_file == map_file)
+ fd_array_map_delete_elem(map, &i);
+ }
+ rcu_read_unlock();
}
static const struct bpf_map_ops perf_event_array_ops = {
.map_alloc = fd_array_map_alloc,
- .map_free = perf_event_array_map_free,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+ .map_release = perf_event_fd_array_release,
};
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
return 0;
}
late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int fd)
+{
+ return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+	/* cgroup_put() frees cgrp after an RCU grace period */
+ cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = cgroup_fd_array_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+ .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+ .ops = &cgroup_array_ops,
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+ bpf_register_map_type(&cgroup_array_type);
+ return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
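
For context on how these fd-array maps are driven from user space, a minimal sketch (not part of the patch) of storing a perf event fd into a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot via bpf(2); the map fd, key and perf fd are assumed to exist already, and the helper name is made up:

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Store perf_fd in slot 'key' of a perf event array map. */
	static int example_set_slot(int map_fd, __u32 key, int perf_fd)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key    = (__u64)(unsigned long)&key;
		attr.value  = (__u64)(unsigned long)&perf_fd;	/* value is the perf event fd */
		attr.flags  = BPF_ANY;

		/* With this patch the slot holds a bpf_event_entry pinning the perf file. */
		return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
	}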
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
-
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
prog = READ_ONCE(array->ptrs[index]);
- if (unlikely(!prog))
+ if (!prog)
goto out;
/* ARG1 at this point is guaranteed to point to CTX from
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- return NULL;
+ return -ENOTSUPP;
}
/* Always built-in helper functions. */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
- return raw_smp_processor_id();
+ return smp_processor_id();
}
const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
* version 2 as published by the Free Software Foundation.
*/
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("bpf");
-
static int __init bpf_init(void)
{
int ret;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2dfb5800..bf4495fcd25d 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (err)
goto free_smap;
- err = get_callchain_buffers();
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
goto free_smap;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
static int bpf_map_release(struct inode *inode, struct file *filp)
{
- bpf_map_put_with_uref(filp->private_data);
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_release)
+ map->ops->map_release(map, filp);
+
+ bpf_map_put_with_uref(map);
return 0;
}
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
- if (atomic_dec_and_test(&prog->aux->refcnt))
- call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt))
- __prog_put_common(&prog->aux->rcu);
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
return 0;
}
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&prog->aux->refcnt);
+ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ return bpf_prog_add(prog, 1);
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = __bpf_prog_get(f);
+ prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
+ if (type && prog->type != *type) {
+ prog = ERR_PTR(-EINVAL);
+ goto out;
+ }
prog = bpf_prog_inc(prog);
+out:
fdput(f);
-
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+ return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
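
A brief sketch (not part of the patch) of how a kernel-side attach path could use the new bpf_prog_get_type() to reject programs of the wrong type; the wrapper function is hypothetical and assumes <linux/bpf.h>:

	static struct bpf_prog *example_get_xdp_prog(u32 ufd)
	{
		struct bpf_prog *prog;

		/* Takes a reference on success, returns -EINVAL on a type mismatch. */
		prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_XDP);
		if (IS_ERR(prog))
			return prog;

		/* ... install prog somewhere; release later with bpf_prog_put(prog) ... */
		return prog;
	}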
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eec9f90ba030..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff
+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_XDP:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_packet_access(struct verifier_env *env, u32 regno, int off,
int size)
{
@@ -713,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
switch (env->prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
break;
default:
verbose("verifier is misconfigured\n");
@@ -805,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
- if (t == BPF_WRITE) {
+ if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
verbose("cannot write into packet\n");
return -EACCES;
}
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into packet\n", value_regno);
+ return -EACCES;
+ }
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -1035,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_get_stackid)
goto error;
break;
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ if (func_id != BPF_FUNC_skb_in_cgroup)
+ goto error;
+ break;
default:
break;
}
@@ -1054,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+ case BPF_FUNC_skb_in_cgroup:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+ goto error;
+ break;
default:
break;
}
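
To illustrate the new may_write_pkt_data() rule, a sketch (not part of the patch) of an XDP program that is now allowed to store into packet data after the usual bounds check; SEC() and the build setup are assumed from the usual BPF toolchain:

	#include <linux/bpf.h>

	SEC("xdp")
	int xdp_zero_first_byte(struct xdp_md *ctx)
	{
		void *data     = (void *)(long)ctx->data;
		void *data_end = (void *)(long)ctx->data_end;
		unsigned char *p = data;

		/* The verifier still requires a bounds check before any access. */
		if (p + 1 > (unsigned char *)data_end)
			return XDP_PASS;

		p[0] = 0;	/* a packet write, permitted for BPF_PROG_TYPE_XDP */
		return XDP_PASS;
	}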
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..9624db80dc4e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
-#include <linux/proc_ns.h>
+#include <linux/file.h>
#include <net/sock.h>
/*
@@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
{
lockdep_assert_held(&cgroup_mutex);
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
static void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
- /* hierarchy ID should already have been released */
- WARN_ON_ONCE(root->hierarchy_id);
-
idr_destroy(&root->cgroup_idr);
kfree(root);
}
@@ -5146,6 +5140,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(parent_css);
+ if (!css)
+ css = ERR_PTR(-ENOMEM);
if (IS_ERR(css))
return css;
@@ -6172,7 +6168,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+ return idr_find(&ss->css_idr, id);
}
/**
@@ -6209,6 +6205,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+/**
+ * cgroup_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup2_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory. Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found.
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *cgrp;
+ struct file *f;
+
+ f = fget_raw(fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ fput(f);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ cgrp = css->cgroup;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
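
A short sketch (not part of the patch) of a hypothetical in-kernel user of the new cgroup_get_from_fd(); the function name is made up:

	static int example_resolve_cgroup(int fd)
	{
		struct cgroup *cgrp;

		cgrp = cgroup_get_from_fd(fd);	/* v2 only; a non-default-hierarchy cgroup yields -EBADF */
		if (IS_ERR(cgrp))
			return PTR_ERR(cgrp);

		/* ... use cgrp ... */

		cgroup_put(cgrp);		/* drop the reference taken above */
		return 0;
	}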
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37429..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
*/
atomic64_t counter;
int64_t limit;
+
+ /* Handle for "pids.events" */
+ struct cgroup_file events_file;
+
+ /* Number of times fork failed because limit was hit. */
+ atomic64_t events_limit;
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
+ int err;
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- return pids_try_charge(pids, 1);
+ err = pids_try_charge(pids, 1);
+ if (err) {
+ /* Only log the first time events_limit is incremented. */
+ if (atomic64_inc_return(&pids->events_limit) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+ pr_cont("\n");
+ }
+ cgroup_file_notify(&pids->events_file);
+ }
+ return err;
}
static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ struct pids_cgroup *pids = css_pids(seq_css(sf));
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+ return 0;
+}
+
static struct cftype pids_files[] = {
{
.name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
.read_s64 = pids_current_read,
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
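
The new pids.events file is read like any other cgroup control file; a small user-space sketch (not part of the patch), with a made-up cgroup path:

	#include <stdio.h>

	static long read_pids_events(void)
	{
		FILE *f = fopen("/sys/fs/cgroup/mygroup/pids.events", "r");
		long rejected = -1;

		if (!f)
			return -1;
		/* The file currently contains a single "max <count>" line. */
		if (fscanf(f, "max %ld", &rejected) != 1)
			rejected = -1;
		fclose(f);
		return rejected;	/* number of forks rejected by the pids limit */
	}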
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return;
- if (current->flags & PF_EXITING) /* Let dying task have memory */
- return;
-
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef4640964..e9fdb5203de5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@ fail:
return -ENOMEM;
}
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
{
int err = 0;
int count;
@@ -121,6 +121,15 @@ int get_callchain_buffers(void)
/* If the allocation failed, give up */
if (!callchain_cpus_entries)
err = -ENOMEM;
+ /*
+ * If requesting per event more than the global cap,
+ * return a different error to help userspace figure
+ * this out.
+ *
+ * And also do it here so that we have &callchain_mutex held.
+ */
+ if (event_max_stack > sysctl_perf_event_max_stack)
+ err = -EOVERFLOW;
goto exit;
}
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ const u32 max_stack = event->attr.sample_max_stack;
if (!kernel && !user)
return NULL;
- return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
+ return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
}
struct perf_callchain_entry *
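
A user-space sketch (not part of the patch) of the new per-event stack-depth cap; opening the event fails with EOVERFLOW if sample_max_stack exceeds the perf_event_max_stack sysctl, and a value of 0 falls back to that sysctl default:

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_counter_with_capped_callchain(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.type             = PERF_TYPE_SOFTWARE;
		attr.config           = PERF_COUNT_SW_CPU_CLOCK;
		attr.size             = sizeof(attr);
		attr.sample_type      = PERF_SAMPLE_CALLCHAIN;
		attr.sample_period    = 100000;
		attr.sample_max_stack = 32;	/* per-event cap on callchain depth */

		/* current task, any CPU, no group fd, no flags */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}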
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f3ef1c29a7c9..356a6c7cb52a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ /*
+ * If throttling is disabled don't allow the write:
+ */
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0)
+ return -EINVAL;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -3686,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head)
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
+static void detach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_del_rcu(&event->sb_list);
+ raw_spin_unlock(&pel->lock);
+}
+
+static bool is_sb_event(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (event->parent)
+ return false;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return false;
+
+ if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+ attr->comm || attr->comm_exec ||
+ attr->task ||
+ attr->context_switch)
+ return true;
+ return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ detach_sb_event(event);
+}
+
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -3749,6 +3790,8 @@ static void unaccount_event(struct perf_event *event)
}
unaccount_event_cpu(event, event->cpu);
+
+ unaccount_pmu_sb_event(event);
}
static void perf_sched_delayed(struct work_struct *work)
@@ -5574,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- u32 raw_size = data->raw->size;
- u32 real_size = round_up(raw_size + sizeof(u32),
- sizeof(u64)) - sizeof(u32);
- u64 zero = 0;
-
- perf_output_put(handle, real_size);
- __output_copy(handle, data->raw->data, raw_size);
- if (real_size - raw_size)
- __output_copy(handle, &zero, real_size - raw_size);
+ struct perf_raw_record *raw = data->raw;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+
+ perf_output_put(handle, raw->size);
+ do {
+ if (frag->copy) {
+ __output_custom(handle, frag->copy,
+ frag->data, frag->size);
+ } else {
+ __output_copy(handle, frag->data,
+ frag->size);
+ }
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+ if (frag->pad)
+ __output_skip(handle, NULL, frag->pad);
} else {
struct {
u32 size;
@@ -5708,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
-
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+ struct perf_raw_record *raw = data->raw;
+ int size;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+ u32 sum = 0;
+
+ do {
+ sum += frag->size;
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+
+ size = round_up(sum + sizeof(u32), sizeof(u64));
+ raw->size = size - sizeof(u32);
+ frag->pad = raw->size - sum;
+ } else {
+ size = sizeof(u64);
+ }
- header->size += round_up(size, sizeof(u64));
+ header->size += size;
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5875,11 +5942,11 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
- perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+ perf_iterate_f output,
void *data, bool all)
{
struct perf_event *event;
@@ -5896,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
}
}
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
- struct perf_event_context *task_ctx)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
- rcu_read_lock();
- preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data, false);
- preempt_enable();
- rcu_read_unlock();
+ struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
}
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers: ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
- struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
int ctxn;
+ rcu_read_lock();
+ preempt_disable();
+
/*
- * If we have task_ctx != NULL we only notify
- * the task context itself. The task_ctx is set
- * only for EXIT events before releasing task
+ * If we have task_ctx != NULL we only notify the task context itself.
+ * The task_ctx is set only for EXIT events before releasing task
* context.
*/
if (task_ctx) {
- perf_event_aux_task_ctx(output, data, task_ctx);
- return;
+ perf_iterate_ctx(task_ctx, output, data, false);
+ goto done;
}
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ perf_iterate_sb_cpu(output, data);
+
+ for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data, false);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
+ perf_iterate_ctx(ctx, output, data, false);
}
+done:
+ preempt_enable();
rcu_read_unlock();
}
@@ -5990,7 +6060,7 @@ void perf_event_exec(void)
perf_event_enable_on_exec(ctxn);
- perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
true);
}
rcu_read_unlock();
@@ -6034,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info)
};
rcu_read_lock();
- perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+ perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
if (cpuctx->task_ctx)
- perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
&ro, false);
rcu_read_unlock();
@@ -6165,7 +6235,7 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_aux(perf_event_task_output,
+ perf_iterate_sb(perf_event_task_output,
&task_event,
task_ctx);
}
@@ -6244,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- perf_event_aux(perf_event_comm_output,
+ perf_iterate_sb(perf_event_comm_output,
comm_event,
NULL);
}
@@ -6475,7 +6545,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- perf_event_aux(perf_event_mmap_output,
+ perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
@@ -6558,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
if (!ctx)
continue;
- perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
}
rcu_read_unlock();
}
@@ -6745,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task,
},
};
- perf_event_aux(perf_event_switch_output,
+ perf_iterate_sb(perf_event_switch_output,
&switch_event,
NULL);
}
@@ -7352,7 +7422,7 @@ static struct pmu perf_swevent = {
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
- void *record = data->raw->data;
+ void *record = data->raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -7408,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_event *event;
struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
+ .frag = {
+ .size = entry_size,
+ .data = record,
+ },
};
perf_sample_data_init(&data, 0, 0);
@@ -7550,7 +7622,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
prog = event->tp_event->prog;
if (prog) {
event->tp_event->prog = NULL;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
}
}
@@ -8667,6 +8739,28 @@ unlock:
return pmu;
}
+static void attach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_add_rcu(&event->sb_list, &pel->list);
+ raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ attach_sb_event(event);
+}
+
static void account_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -8747,6 +8841,8 @@ static void account_event(struct perf_event *event)
enabled:
account_event_cpu(event, event->cpu);
+
+ account_pmu_sb_event(event);
}
/*
@@ -8895,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- err = get_callchain_buffers();
+ err = get_callchain_buffers(attr->sample_max_stack);
if (err)
goto err_addr_filters;
}
@@ -9217,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ if (!attr.sample_max_stack)
+ attr.sample_max_stack = sysctl_perf_event_max_stack;
+
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9290,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open,
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
- err = -ENOTSUPP;
+ err = -EOPNOTSUPP;
goto err_alloc;
}
}
@@ -10252,6 +10351,9 @@ static void __init perf_event_init_all_cpus(void)
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+ raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
}
}
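
For the PERF_SAMPLE_RAW change above, a sketch (not part of the patch) of how a caller could describe a raw sample as two chained fragments; only the frag fields visible in this diff (data, size, next, copy, pad) are relied on, and the function and its arguments are made up:

	/* Emit a two-fragment raw sample from a hypothetical caller. */
	static void example_emit_raw(struct perf_event *event, struct pt_regs *regs,
				     void *hdr, u32 hdr_len, void *payload, u32 payload_len)
	{
		struct perf_sample_data data;
		struct perf_raw_frag tail = {
			.size = payload_len,
			.data = payload,	/* second, final fragment */
		};
		struct perf_raw_record raw = {
			.frag = {
				.next = &tail,	/* chain to the next fragment */
				.size = hdr_len,
				.data = hdr,	/* first fragment */
			},
		};

		perf_sample_data_init(&data, 0, 0);
		data.raw = &raw;
		/* perf_prepare_sample()/perf_output_sample() compute raw.size and the pad. */
		perf_event_output(event, &data, regs);
	}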
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
return rb->aux_nr_pages << PAGE_SHIFT;
}
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
-func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
- written = memcpy_func(handle->addr, buf, size); \
+ written = memcpy_func(__VA_ARGS__); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
- buf += written; \
+ if (advance_buf) \
+ buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+ const void *buf, unsigned long len)
+{
+ unsigned long orig_len = len;
+ __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+ orig_len - len, size)
+}
+
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e1356e6bb..84ae830234f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
}
/*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+{
+ struct sighand_struct *sighand;
+ struct task_struct *task;
+
+ /*
+ * We need to verify that release_task() was not called and thus
+ * delayed_put_task_struct() can't run and drop the last reference
+ * before rcu_read_unlock(). We check task->sighand != NULL,
+ * but we can read the already freed and reused memory.
+ */
+retry:
+ task = rcu_dereference(*ptask);
+ if (!task)
+ return NULL;
+
+ probe_kernel_address(&task->sighand, sighand);
+
+ /*
+ * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+ * was already freed we can not miss the preceding update of this
+ * pointer.
+ */
+ smp_rmb();
+ if (unlikely(task != READ_ONCE(*ptask)))
+ goto retry;
+
+ /*
+ * We've re-checked that "task == *ptask", now we have two different
+ * cases:
+ *
+ * 1. This is actually the same task/task_struct. In this case
+ * sighand != NULL tells us it is still alive.
+ *
+ * 2. This is another task which got the same memory for task_struct.
+ * We can't know this of course, and we can not trust
+ * sighand != NULL.
+ *
+ * In this case we actually return a random value, but this is
+ * correct.
+ *
+ * If we return NULL - we can pretend that we actually noticed that
+ * *ptask was updated when the previous task has exited. Or pretend
+ * that probe_kernel_address(&task->sighand, sighand) reads NULL.
+ *
+ * If we return the new task (because sighand is not NULL for any
+ * reason) - this is fine too. This (new) task can't go away before
+ * another gp pass.
+ *
+ * And note: We could even eliminate the false positive if re-read
+ * task->sighand once again to avoid the falsely NULL. But this case
+ * is very unlikely so we don't care.
+ */
+ if (!sighand)
+ return NULL;
+
+ return task;
+}
+
+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = task_rcu_dereference(ptask);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+}
+
+/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
@@ -700,10 +776,14 @@ void do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
/*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
+ * Ensure that all new tsk->pi_lock acquisitions must observe
+ * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
*/
smp_mb();
+ /*
+ * Ensure that we must observe the pi_state in exit_mm() ->
+ * mm_release() -> exit_pi_state_list().
+ */
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic())) {
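
To show the intended use of the new try_get_task_struct()/task_rcu_dereference() pair, a minimal sketch (not part of the patch); the holder structure and function are made up:

	struct owner_holder {
		struct task_struct *owner;	/* may be cleared or freed concurrently */
	};

	static void example_report_owner(struct owner_holder *h)
	{
		struct task_struct *task;

		task = try_get_task_struct(&h->owner);	/* NULL if the task is already gone */
		if (!task)
			return;

		pr_info("owner is pid %d\n", task_pid_nr(task));
		put_task_struct(task);			/* drop the reference we took */
	}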
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c6c88c..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,23 +162,15 @@ void __weak arch_release_thread_stack(unsigned long *stack)
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
- THREAD_SIZE_ORDER);
-
- if (page)
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- 1 << THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_stack(unsigned long *stack)
{
- struct page *page = virt_to_page(stack);
-
- memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- -(1 << THREAD_SIZE_ORDER));
- __free_kmem_pages(page, THREAD_SIZE_ORDER);
+ __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;
@@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep;
static void account_kernel_stack(unsigned long *stack, int account)
{
- struct zone *zone = page_zone(virt_to_page(stack));
+ /* All stack pages are in the same zone and belong to the same memcg. */
+ struct page *first_page = virt_to_page(stack);
+
+ mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
- mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+ memcg_kmem_update_page_stat(
+ first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
void free_task(struct task_struct *tsk)
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_thread_flag(TIF_MEMDIE))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE))
return false;
if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index e25e92fb44fa..6a5c239c7669 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,7 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e95a3ce..1d3ee3169202 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000000..f68959341c0f
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,61 @@
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+
+static int get_first_sibling(unsigned int cpu)
+{
+ unsigned int ret;
+
+ ret = cpumask_first(topology_sibling_cpumask(cpu));
+ if (ret < nr_cpu_ids)
+ return ret;
+ return cpu;
+}
+
+/*
+ * Take a map of online CPUs and the number of available interrupt vectors
+ * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
+ * so that they are distributed as evenly as possible across the CPUs. If
+ * more vectors than CPUs are available we'll map one to each CPU,
+ * otherwise we map one to the first sibling of each socket.
+ *
+ * If there are more vectors than CPUs we will still only have one bit
+ * set per CPU, but interrupt code will keep on assigning the vectors from
+ * the start of the bitmap until we run out of vectors.
+ */
+struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
+{
+ struct cpumask *affinity_mask;
+ unsigned int max_vecs = *nr_vecs;
+
+ if (max_vecs == 1)
+ return NULL;
+
+ affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!affinity_mask) {
+ *nr_vecs = 1;
+ return NULL;
+ }
+
+ if (max_vecs >= num_online_cpus()) {
+ cpumask_copy(affinity_mask, cpu_online_mask);
+ *nr_vecs = num_online_cpus();
+ } else {
+ unsigned int vecs = 0, cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu == get_first_sibling(cpu)) {
+ cpumask_set_cpu(cpu, affinity_mask);
+ vecs++;
+ }
+
+ if (--max_vecs == 0)
+ break;
+ }
+ *nr_vecs = vecs;
+ }
+
+ return affinity_mask;
+}
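
A sketch (not part of the patch) of how a driver-facing caller could use irq_create_affinity_mask(); the wrapper function is made up, and the returned mask must be freed by the caller:

	static struct cpumask *example_spread_vectors(unsigned int *nr_vecs)
	{
		struct cpumask *mask;

		*nr_vecs = 8;				/* vectors the device can support */
		mask = irq_create_affinity_mask(nr_vecs);
		/*
		 * NULL is returned either when only a single vector is requested or
		 * when the allocation failed (in which case *nr_vecs is forced to 1).
		 */
		return mask;				/* caller kfree()s the mask when done */
	}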
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2f9f2b0e79f2..b4c1bc7c9ca2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -426,6 +426,49 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
+/**
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Untracked interrupts are sent from a demultiplexing interrupt
+ * handler when the demultiplexer does not know which device in its
+ * multiplexed irq domain generated the interrupt. IRQs handled
+ * through here are not subjected to stats tracking, randomness, or
+ * spurious interrupt detection.
+ *
+ * Note: Like handle_simple_irq, the caller is expected to handle
+ * the ack, clear, mask and unmask issues if necessary.
+ */
+void handle_untracked_irq(struct irq_desc *desc)
+{
+ unsigned int flags = 0;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ goto out_unlock;
+ }
+
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock(&desc->lock);
+
+ __handle_irq_event_percpu(desc, &flags);
+
+ raw_spin_lock(&desc->lock);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+out_unlock:
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_untracked_irq);
+
/*
* Called unconditionally from handle_level_irq() and only for oneshot
* interrupts from handle_fasteoi_irq()
@@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+
+/**
+ * irq_chip_pm_get - Enable power for an IRQ chip
+ * @data: Pointer to interrupt specific data
+ *
+ * Enable the power to the IRQ chip referenced by the interrupt data
+ * structure.
+ */
+int irq_chip_pm_get(struct irq_data *data)
+{
+ int retval;
+
+ if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
+ retval = pm_runtime_get_sync(data->chip->parent_device);
+ if (retval < 0) {
+ pm_runtime_put_noidle(data->chip->parent_device);
+ return retval;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * irq_chip_pm_put - Disable power for an IRQ chip
+ * @data: Pointer to interrupt specific data
+ *
+ * Disable the power to the IRQ chip referenced by the interrupt data
+ * structure. Note that power will only be disabled once this
+ * function has been called for all IRQs that have called irq_chip_pm_get().
+ */
+int irq_chip_pm_put(struct irq_data *data)
+{
+ int retval = 0;
+
+ if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
+ retval = pm_runtime_put(data->chip->parent_device);
+
+ return (retval < 0) ? retval : 0;
+}
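
A sketch (not part of the patch) showing how the get/put pair is meant to bracket an interrupt's in-use lifetime; the wrapper below is a hypothetical caller, not the actual call sites in the irq core:

	static int example_power_and_request(unsigned int irq, irq_handler_t handler,
					     void *dev_id)
	{
		struct irq_data *d = irq_get_irq_data(irq);
		int ret;

		ret = irq_chip_pm_get(d);		/* runtime-resume the chip's parent device, if any */
		if (ret)
			return ret;

		ret = request_irq(irq, handler, 0, "example", dev_id);
		if (ret)
			irq_chip_pm_put(d);		/* balance the get on failure */
		return ret;
	}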
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b5485b446..d3f24905852c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
{
irqreturn_t retval = IRQ_NONE;
- unsigned int flags = 0, irq = desc->irq_data.irq;
+ unsigned int irq = desc->irq_data.irq;
struct irqaction *action;
for_each_action_of_desc(desc, action) {
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
/* Fall through to add to randomness */
case IRQ_HANDLED:
- flags |= action->flags;
+ *flags |= action->flags;
break;
default:
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
retval |= res;
}
- add_interrupt_randomness(irq, flags);
+ return retval;
+}
+
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+{
+ irqreturn_t retval;
+ unsigned int flags = 0;
+
+ retval = __handle_irq_event_percpu(desc, &flags);
+
+ add_interrupt_randomness(desc->irq_data.irq, flags);
if (!noirqdebug)
note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903c6d..bc226e783bd2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -7,6 +7,7 @@
*/
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
+#include <linux/pm_runtime.h>
#ifdef CONFIG_SPARSE_IRQ
# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
@@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+extern bool irq_can_set_affinity_usr(unsigned int irq);
+
extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 89b49f6773f0..1a9abc1c8ea0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain,
}
}
- virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+ virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc descs\n");
return -ENOMEM;
}
virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
- (void *) dest, true);
+ (void *) dest, true, NULL);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8731e1c5d1e7..a623b44f2d4b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
return 0;
}
-static void desc_smp_init(struct irq_desc *desc, int node)
+static void desc_smp_init(struct irq_desc *desc, int node,
+ const struct cpumask *affinity)
{
- cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
+ if (!affinity)
+ affinity = irq_default_affinity;
+ cpumask_copy(desc->irq_common_data.affinity, affinity);
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
@@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)
#else
static inline int
alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
-static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline void
+desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
int cpu;
@@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->owner = owner;
for_each_possible_cpu(cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
- desc_smp_init(desc, node);
+ desc_smp_init(desc, node, affinity);
}
int nr_irqs = NR_IRQS;
@@ -158,7 +163,9 @@ void irq_unlock_sparse(void)
mutex_unlock(&sparse_irq_lock);
}
-static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
+static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
{
struct irq_desc *desc;
gfp_t gfp = GFP_KERNEL;
@@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
init_rcu_head(&desc->rcu);
- desc_set_defaults(irq, desc, node, owner);
+ desc_set_defaults(irq, desc, node, affinity, owner);
+ irqd_set(&desc->irq_data, flags);
return desc;
@@ -223,13 +231,32 @@ static void free_desc(unsigned int irq)
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
+ const struct cpumask *mask = NULL;
struct irq_desc *desc;
- int i;
+ unsigned int flags;
+ int i, cpu = -1;
+
+ if (affinity && cpumask_empty(affinity))
+ return -EINVAL;
+
+ flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
for (i = 0; i < cnt; i++) {
- desc = alloc_desc(start + i, node, owner);
+ if (affinity) {
+ cpu = cpumask_next(cpu, affinity);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(affinity);
+ node = cpu_to_node(cpu);
+
+ /*
+ * For single allocations we use the caller provided
+ * mask, otherwise we use the mask of the target cpu
+ */
+ mask = cnt == 1 ? affinity : cpumask_of(cpu);
+ }
+ desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
mutex_lock(&sparse_irq_lock);
@@ -277,7 +304,7 @@ int __init early_irq_init(void)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
- desc = alloc_desc(i, node, NULL);
+ desc = alloc_desc(i, node, 0, NULL, NULL);
set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
@@ -311,7 +338,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], GFP_KERNEL, node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- desc_set_defaults(i, &desc[i], node, NULL);
+ desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
}
@@ -328,11 +355,12 @@ static void free_desc(unsigned int irq)
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ const struct cpumask *affinity,
struct module *owner)
{
u32 i;
@@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
* @cnt: Number of consecutive irqs to allocate.
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an affinity mask which hints where the
+ * irq descriptors should be allocated and which default
+ * affinities to use
*
* Returns the first irq number or error code
*/
int __ref
__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner)
+ struct module *owner, const struct cpumask *affinity)
{
int start, ret;
@@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
bitmap_set(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, owner);
+ return alloc_descs(start, cnt, node, affinity, owner);
err:
mutex_unlock(&sparse_irq_lock);
@@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
*/
unsigned int irq_alloc_hwirqs(int cnt, int node)
{
- int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
+ int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
if (irq < 0)
return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8798b6c9e945..4752b43662e0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
+ struct irq_data *irq_data;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int virq;
@@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (irq_domain_translate(domain, fwspec, &hwirq, &type))
return 0;
- if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * WARN if the irqchip returns a type with bits
+ * outside the sense mask set and clear these bits.
+ */
+ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
+ type &= IRQ_TYPE_SENSE_MASK;
+
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
+ /*
+ * If the trigger type is not specified or matches the
+ * current trigger type then we are done so return the
+ * interrupt number.
+ */
+ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
+ return virq;
+
/*
- * If we've already configured this interrupt,
- * don't do it again, or hell will break loose.
+ * If the trigger type has not been set yet, then set
+ * it now and return the interrupt number.
*/
- virq = irq_find_mapping(domain, hwirq);
- if (virq)
+ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data)
+ return 0;
+
+ irqd_set_trigger_type(irq_data, type);
return virq;
+ }
+ pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
+ hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
+ return 0;
+ }
+
+ if (irq_domain_is_hierarchy(domain)) {
virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
if (virq <= 0)
return 0;
@@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
return virq;
}
- /* Set type if specified and different than the current one */
- if (type != IRQ_TYPE_NONE &&
- type != irq_get_trigger_type(virq))
- irq_set_irq_type(virq, type);
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data) {
+ if (irq_domain_is_hierarchy(domain))
+ irq_domain_free_irqs(virq, 1);
+ else
+ irq_dispose_mapping(virq);
+ return 0;
+ }
+
+ /* Store trigger type */
+ irqd_set_trigger_type(irq_data, type);
+
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
@@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq)
if (WARN_ON(domain == NULL))
return;
- irq_domain_disassociate(domain, virq);
- irq_free_desc(virq);
+ if (irq_domain_is_hierarchy(domain)) {
+ irq_domain_free_irqs(virq, 1);
+ } else {
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+ }
}
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
- int node)
+ int node, const struct cpumask *affinity)
{
unsigned int hint;
if (virq >= 0) {
- virq = irq_alloc_descs(virq, virq, cnt, node);
+ virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
+ affinity);
} else {
hint = hwirq % nr_irqs;
if (hint == 0)
hint++;
- virq = irq_alloc_descs_from(hint, cnt, node);
- if (virq <= 0 && hint > 1)
- virq = irq_alloc_descs_from(1, cnt, node);
+ virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
+ affinity);
+ if (virq <= 0 && hint > 1) {
+ virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
+ affinity);
+ }
}
return virq;
@@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
if (recursive)
ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
nr_irqs, arg);
- if (ret >= 0)
- ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
if (ret < 0 && recursive)
irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
@@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
* @node: NUMA node id for memory allocation
* @arg: domain specific argument
* @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
*
 * Allocate IRQ numbers and initialize all data structures to support
* hierarchy IRQ domains.
@@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
*/
int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
unsigned int nr_irqs, int node, void *arg,
- bool realloc)
+ bool realloc, const struct cpumask *affinity)
{
int i, ret, virq;
@@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
- virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
+ affinity);
if (virq < 0) {
pr_debug("cannot allocate IRQ(base %d, count %d)\n",
irq_base, nr_irqs);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3a70..73a2b786b5e9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
-static int __irq_can_set_affinity(struct irq_desc *desc)
+static bool __irq_can_set_affinity(struct irq_desc *desc)
{
if (!desc || !irqd_can_balance(&desc->irq_data) ||
!desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
- return 1;
+ return false;
+ return true;
}
/**
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)
}
/**
+ * irq_can_set_affinity_usr - Check if affinity of an irq can be set from user space
+ * @irq: Interrupt to check
+ *
+ * Like irq_can_set_affinity() above, but additionally checks for the
+ * AFFINITY_MANAGED flag.
+ */
+bool irq_can_set_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return __irq_can_set_affinity(desc) &&
+ !irqd_affinity_is_managed(&desc->irq_data);
+}
+
+/**
* irq_set_thread_affinity - Notify irq threads to adjust affinity
 * @desc:		irq descriptor which has affinity changed
*
@@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
return 0;
/*
- * Preserve an userspace affinity setup, but make sure that
- * one of the targets is online.
+	 * Preserve the managed affinity setting and a userspace affinity
+ * setup, but make sure that one of the targets is online.
*/
- if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (irqd_affinity_is_managed(&desc->irq_data) ||
+ irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
if (cpumask_intersects(desc->irq_common_data.affinity,
cpu_online_mask))
set = desc->irq_common_data.affinity;
@@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
/*
+ * If the trigger type is not specified by the caller,
+ * then use the default for this interrupt.
+ */
+ if (!(new->flags & IRQF_TRIGGER_MASK))
+ new->flags |= irqd_get_trigger_type(&desc->irq_data);
+
+ /*
* Check whether the interrupt nests into another interrupt
* thread.
*/
@@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return -EINVAL;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
+
return retval;
}
EXPORT_SYMBOL_GPL(setup_irq);
@@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
}
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
kfree(action->secondary);
return action;
@@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action->secondary);
kfree(action);
}
@@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
if (!desc)
return;
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
if (type != IRQ_TYPE_NONE) {
int ret;
@@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
unregister_handler_proc(irq, action);
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
@@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
+
return retval;
}
@@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->percpu_dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
- if (retval)
+ if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action);
+ }
return retval;
}
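
Condensed from the four request/setup paths above, the runtime-PM handling follows one pattern: take a reference on the irqchip before __setup_irq(), drop it again if setup fails, and rely on __free_irq()/__free_percpu_irq() to drop it on teardown. A sketch of that pairing (fragment only, mirroring the hunks above):

	retval = irq_chip_pm_get(&desc->irq_data);
	if (retval < 0)
		return retval;

	chip_bus_lock(desc);
	retval = __setup_irq(irq, desc, action);
	chip_bus_sync_unlock(desc);

	if (retval)
		irq_chip_pm_put(&desc->irq_data);	/* balance the get on failure */
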
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 38e89ce7b071..54999350162c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq = -1;
+ int i, ret, virq;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
@@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
- dev_to_node(dev), &arg, false);
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
+ dev_to_node(dev), &arg, false,
+ desc->affinity);
if (virq < 0) {
ret = -ENOSPC;
if (ops->handle_error)
@@ -356,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
ops->msi_finish(&arg, 0);
for_each_msi_entry(desc, dev) {
+ virq = desc->irq;
if (desc->nvec_used == 1)
dev_dbg(dev, "irq %d for MSI\n", virq);
else
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726818..feaa813b84a9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
cpumask_var_t new_value;
int err;
- if (!irq_can_set_affinity(irq) || no_irq_affinity)
+ if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
!name_unique(irq, action))
return;
- memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
if (desc->dir)
goto out_unlock;
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#endif
remove_proc_entry("spurious", desc->dir);
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%u", irq);
remove_proc_entry(name, root_irq_dir);
}
@@ -421,12 +418,8 @@ void init_irq_proc(void)
/*
* Create entries for all existing IRQs.
*/
- for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
+ for_each_irq_desc(irq, desc)
register_irq_proc(irq, desc);
- }
}
#ifdef CONFIG_GENERIC_IRQ_SHOW
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 4b353e0be121..0dbea887d625 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -452,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
return notifier_from_errno(ret);
}
-struct notifier_block jump_label_module_nb = {
+static struct notifier_block jump_label_module_nb = {
.notifier_call = jump_label_module_notify,
.priority = 1, /* higher than tracepoints */
};
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81f1a7107c0e..589d763a49b3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -46,6 +46,7 @@
#include <linux/gfp.h>
#include <linux/kmemcheck.h>
#include <linux/random.h>
+#include <linux/jhash.h>
#include <asm/sections.h>
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];
* It's a 64-bit hash, because it's important for the keys to be
* unique.
*/
-#define iterate_chain_key(key1, key2) \
- (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
- ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
- (key2))
+static inline u64 iterate_chain_key(u64 key, u32 idx)
+{
+ u32 k0 = key, k1 = key >> 32;
+
+ __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
+
+ return k0 | (u64)k1 << 32;
+}
void lockdep_off(void)
{
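
The new helper folds each held lock's class index into a running 64-bit key with a jhash mixing step, which spreads bits far better than the old shift/xor macro. A sketch of the assumed caller pattern (illustrative; the real call sites are elsewhere in lockdep.c):

	/* Illustrative: rebuild the chain key for the current held-lock stack. */
	static u64 example_chain_key(struct task_struct *curr)
	{
		u64 chain_key = 0;
		int i;

		for (i = 0; i < curr->lockdep_depth; i++)
			chain_key = iterate_chain_key(chain_key,
						      curr->held_locks[i].class_idx);
		return chain_key;
	}
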
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index d06ae3bb46c5..57a871ae3c81 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current;
+ WRITE_ONCE(lock->owner, current);
}
static inline void mutex_clear_owner(struct mutex *lock)
{
- lock->owner = NULL;
+ WRITE_ONCE(lock->owner, NULL);
}
#define spin_lock_mutex(lock, flags) \
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index a68bae5e852a..6cd6b8e9efd7 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -17,14 +17,20 @@
__list_del((waiter)->list.prev, (waiter)->list.next)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * The mutex owner can get read and written to locklessly.
+ * We should use WRITE_ONCE when writing the owner value to
+ * avoid store tearing; otherwise, a thread could potentially
+ * read a partially written and incomplete owner value.
+ */
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current;
+ WRITE_ONCE(lock->owner, current);
}
static inline void mutex_clear_owner(struct mutex *lock)
{
- lock->owner = NULL;
+ WRITE_ONCE(lock->owner, NULL);
}
#else
static inline void mutex_set_owner(struct mutex *lock)
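
A minimal sketch of the lockless pairing the comment above describes (illustrative, not mutex internals): the writer publishes the owner with a single untorn store, and a spinner reads it with READ_ONCE() and only compares the pointer.

	static void example_publish_owner(struct mutex *lock, struct task_struct *task)
	{
		WRITE_ONCE(lock->owner, task);		/* single, untorn store */
	}

	static bool example_owner_is(struct mutex *lock, struct task_struct *task)
	{
		/* compared only, never dereferenced */
		return READ_ONCE(lock->owner) == task;
	}
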
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec082338668..19248ddf37ce 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
rspin_until_writer_unlock(lock, cnts);
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5fc8c311b8fe..b2caec7315af 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
* therefore increment the cpu number by one.
*/
-static inline u32 encode_tail(int cpu, int idx)
+static inline __pure u32 encode_tail(int cpu, int idx)
{
u32 tail;
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)
return tail;
}
-static inline struct mcs_spinlock *decode_tail(u32 tail)
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
@@ -268,6 +268,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#endif
/*
+ * Various notes on spin_is_locked() and spin_unlock_wait(), which are
+ * 'interesting' functions:
+ *
+ * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
+ * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
+ * PPC). Also, qspinlock has a similar issue by construction: the setting of
+ * the locked byte can be unordered with respect to acquiring the lock proper.
+ *
+ * This gets to be 'interesting' in the following cases, where the /should/s
+ * end up false because of this issue.
+ *
+ *
+ * CASE 1:
+ *
+ * So the spin_is_locked() correctness issue comes from something like:
+ *
+ * CPU0 CPU1
+ *
+ * global_lock(); local_lock(i)
+ * spin_lock(&G) spin_lock(&L[i])
+ * for (i) if (!spin_is_locked(&G)) {
+ * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
+ * return;
+ * }
+ * // deal with fail
+ *
+ * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
+ * that there is exclusion between the two critical sections.
+ *
+ * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
+ * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
+ * /should/ be constrained by the ACQUIRE from spin_lock(&G).
+ *
+ * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
+ *
+ *
+ * CASE 2:
+ *
+ * For spin_unlock_wait() there is a second correctness issue, namely:
+ *
+ * CPU0 CPU1
+ *
+ * flag = set;
+ * smp_mb(); spin_lock(&l)
+ * spin_unlock_wait(&l); if (!flag)
+ * // add to lockless list
+ * spin_unlock(&l);
+ * // iterate lockless list
+ *
+ * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
+ * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
+ * semantics etc..)
+ *
+ * Where flag /should/ be ordered against the locked store of l.
+ */
+
+/*
* queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
* issuing an _unordered_ store to set _Q_LOCKED_VAL.
*
@@ -322,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock)
cpu_relax();
done:
- smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+ smp_acquire__after_ctrl_dep();
}
EXPORT_SYMBOL(queued_spin_unlock_wait);
#endif
@@ -418,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
- smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
/*
* take ownership and clear the pending bit.
@@ -455,6 +512,8 @@ queue:
* pending stuff.
*
* p,*,* -> n,*,*
+ *
+ * RELEASE, such that the stores to @node must be complete.
*/
old = xchg_tail(lock, tail);
next = NULL;
@@ -465,6 +524,15 @@ queue:
*/
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
+ /*
+ * The above xchg_tail() is also a load of @lock which generates,
+ * through decode_tail(), a pointer.
+ *
+ * The address dependency matches the RELEASE of xchg_tail()
+ * such that the access to @prev must happen after.
+ */
+ smp_read_barrier_depends();
+
WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
@@ -494,7 +562,7 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * smp_cond_load_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
@@ -505,7 +573,7 @@ queue:
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+ val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
@@ -525,9 +593,9 @@ locked:
break;
}
/*
- * The smp_cond_acquire() call above has provided the necessary
- * acquire semantics required for locking. At most two
- * iterations of this loop may be ran.
+ * The smp_cond_load_acquire() call above has provided the
+ * necessary acquire semantics required for locking. At most
+	 * two iterations of this loop may run.
*/
old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
@@ -551,7 +619,7 @@ release:
/*
* release the node
*/
- this_cpu_dec(mcs_nodes[0].count);
+ __this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
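
Both conversions above replace smp_cond_acquire(cond) with smp_cond_load_acquire(ptr, cond), which spins on a load of *ptr, names the freshly loaded value VAL inside the condition, and returns it with ACQUIRE ordering. A small sketch of the idiom (the wrapper function is made up):

	/* Illustrative: wait until the locked byte clears, with ACQUIRE ordering. */
	static int example_wait_until_unlocked(struct qspinlock *lock)
	{
		return smp_cond_load_acquire(&lock->val.counter,
					     !(VAL & _Q_LOCKED_MASK));
	}
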
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57f68b3..37649e69056c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
#else /* _Q_PENDING_BITS == 8 */
static __always_inline void set_pending(struct qspinlock *lock)
{
- atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+ atomic_or(_Q_PENDING_VAL, &lock->val);
}
static __always_inline void clear_pending(struct qspinlock *lock)
{
- atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
}
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..1ec0f48962b3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
- if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
return 0;
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c6225e5..447e08de1fab 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->count = RWSEM_UNLOCKED_VALUE;
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -114,12 +114,16 @@ enum rwsem_wake_type {
* - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
* - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
* - there must be someone on the queue
- * - the spinlock must be held by the caller
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
+ * - writers are only marked woken if downgrading is false
*/
static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+__rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
{
struct rwsem_waiter *waiter;
struct task_struct *tsk;
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- /* Wake writer at the front of the queue, but do not
- * grant it the lock yet as we want other writers
- * to be able to steal it. Readers, on the other hand,
- * will block as they will notice the queued writer.
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+			 * Until the task is actually awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
*/
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
+ }
goto out;
}
@@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
if (wake_type != RWSEM_WAKE_READ_OWNED) {
adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
- oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /* A writer stole the lock. Undo our reader grant. */
- if (rwsem_atomic_update(-adjustment, sem) &
- RWSEM_ACTIVE_MASK)
+ /*
+ * If the count is still less than RWSEM_WAITING_BIAS
+ * after removing the adjustment, it is assumed that
+ * a writer has stolen the lock. We have to undo our
+ * reader grant.
+ */
+ if (atomic_long_add_return(-adjustment, &sem->count) <
+ RWSEM_WAITING_BIAS)
goto out;
/* Last active locker left. Retry waking readers. */
goto try_reader_grant;
}
+ /*
+ * It is not really necessary to set it to reader-owned here,
+ * but it gives the spinners an early indication that the
+ * readers now have the lock.
+ */
+ rwsem_set_reader_owned(sem);
}
/* Grant an infinite number of read locks to the readers at the front
@@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
adjustment -= RWSEM_WAITING_BIAS;
if (adjustment)
- rwsem_atomic_add(adjustment, sem);
+ atomic_long_add(adjustment, &sem->count);
next = sem->wait_list.next;
loop = woken;
@@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+
+ wake_q_add(wake_q, tsk);
/*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
+ * Ensure that the last operation is setting the reader
+ * waiter to nil such that rwsem_down_read_failed() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
*/
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
+ smp_store_release(&waiter->task, NULL);
} while (--loop);
sem->wait_list.next = next;
@@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
+ WAKE_Q(wake_q);
/* set up my own style of waitqueue */
waiter.task = tsk;
waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list))
@@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ count = atomic_long_add_return(adjustment, &sem->count);
/* If there are no active locks, wake the front queued process(es).
*
@@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
if (count == RWSEM_WAITING_BIAS ||
(count > RWSEM_WAITING_BIAS &&
adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
/* wait to be given the lock */
while (true) {
@@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ */
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
/*
- * Try acquiring the write lock. Check count first in order
- * to reduce unnecessary expensive cmpxchg() operations.
+ * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
*/
- if (count == RWSEM_WAITING_BIAS &&
- cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ if (count != RWSEM_WAITING_BIAS)
+ return false;
+
+ /*
+ * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
+ * are other tasks on the wait list, we need to add on WAITING_BIAS.
+ */
+ count = list_is_singular(&sem->wait_list) ?
+ RWSEM_ACTIVE_WRITE_BIAS :
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
+
+ if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
+ == RWSEM_WAITING_BIAS) {
rwsem_set_owner(sem);
return true;
}
@@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = READ_ONCE(sem->count);
+ long old, count = atomic_long_read(&sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg_acquire(&sem->count, count,
+ old = atomic_long_cmpxchg_acquire(&sem->count, count,
count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
@@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!owner) {
- long count = READ_ONCE(sem->count);
+ if (!rwsem_owner_is_writer(owner)) {
/*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath with the lock being active, then there is a possibility
- * reader(s) may have the lock. To be safe, bail spinning in these
- * situations.
+ * Don't spin if the rwsem is readers owned.
*/
- if (count & RWSEM_ACTIVE_MASK)
- ret = false;
+ ret = !rwsem_owner_is_reader(owner);
goto done;
}
@@ -325,10 +350,15 @@ done:
return ret;
}
-static noinline
-bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+/*
+ * Return true only if we can still spin on the owner field of the rwsem.
+ */
+static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- long count;
+ struct task_struct *owner = READ_ONCE(sem->owner);
+
+ if (!rwsem_owner_is_writer(owner))
+ goto out;
rcu_read_lock();
while (sem->owner == owner) {
@@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
cpu_relax_lowlatency();
}
rcu_read_unlock();
-
- if (READ_ONCE(sem->owner))
- return true; /* new owner, continue spinning */
-
+out:
/*
- * When the owner is not set, the lock could be free or
- * held by readers. Check the counter to verify the
- * state.
+ * If there is a new owner or the owner is not set, we continue
+ * spinning.
*/
- count = READ_ONCE(sem->count);
- return (count == 0 || count == RWSEM_WAITING_BIAS);
+ return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
- struct task_struct *owner;
bool taken = false;
preempt_disable();
@@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
if (!osq_lock(&sem->osq))
goto done;
- while (true) {
- owner = READ_ONCE(sem->owner);
- if (owner && !rwsem_spin_on_owner(sem, owner))
- break;
-
- /* wait_lock will be acquired if write_lock is obtained */
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock as we can't determine if they are
+ * actively running or not.
+ */
+ while (rwsem_spin_on_owner(sem)) {
+ /*
+ * Try to acquire the lock
+ */
if (rwsem_try_write_lock_unqueued(sem)) {
taken = true;
break;
@@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* we're an RT task that will live-lock because we won't let
* the owner complete.
*/
- if (!owner && (need_resched() || rt_task(current)))
+ if (!sem->owner && (need_resched() || rt_task(current)))
break;
/*
@@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
+ WAKE_Q(wake_q);
/* undo write bias from down_write operation, stop active locking */
- count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
/* do optimistic spinning and steal lock if possible */
if (rwsem_optimistic_spin(sem))
@@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = READ_ONCE(sem->count);
+ count = atomic_long_read(&sem->count);
/*
* If there were already threads queued before us and there are
* no active writers, the lock must be read owned; so we try to
* wake any read locks that were queued ahead of us.
*/
- if (count > RWSEM_WAITING_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ if (count > RWSEM_WAITING_BIAS) {
+ WAKE_Q(wake_q);
+
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+ /*
+ * The wakeup is normally called _after_ the wait_lock
+ * is released, but given that we are proactively waking
+ * readers we can deal with the wake_q overhead as it is
+ * similar to releasing and taking the wait_lock again
+ * for attempting rwsem_try_write_lock().
+ */
+ wake_up_q(&wake_q);
+ }
} else
- count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
/* wait until we successfully acquire the lock */
set_current_state(state);
@@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
schedule();
set_current_state(state);
- } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+ } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
@@ -507,10 +548,11 @@ out_nolock:
raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
if (list_empty(&sem->wait_list))
- rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
else
- __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
return ERR_PTR(-EINTR);
}
@@ -537,6 +579,7 @@ __visible
struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
/*
* If a spinner is present, it is not necessary to do the wakeup.
@@ -573,9 +616,10 @@ locked:
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
@@ -590,14 +634,16 @@ __visible
struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->wait_lock, flags);
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
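
The rwsem wakeup rework above follows one pattern throughout: waiters are only marked for wakeup while wait_lock is held, and the actual wake_up_q() happens after the lock is dropped, so a freshly woken task never immediately blocks on wait_lock again. Condensed sketch of that pattern, mirroring rwsem_wake() above:

	static void example_wake_waiters(struct rw_semaphore *sem)
	{
		unsigned long flags;
		WAKE_Q(wake_q);

		raw_spin_lock_irqsave(&sem->wait_lock, flags);
		if (!list_empty(&sem->wait_list))
			__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);

		wake_up_q(&wake_q);	/* wakeups without wait_lock held */
	}
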
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2e853ad93a3a..45ba475d4be3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)
{
int ret = __down_read_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_reader_owned(sem);
+ }
return ret;
}
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
- rwsem_clear_owner(sem);
+ rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read_nested);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 870ed9a5b426..a699f4048ba1 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,14 +1,58 @@
+/*
+ * The owner field of the rw_semaphore structure will be set to
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
+ * the owner field when it unlocks. A reader, on the other hand, will
+ * not touch the owner field when it unlocks.
+ *
+ * In essence, the owner field now has the following 3 states:
+ * 1) 0
+ * - lock is free or the owner hasn't set the field yet
+ * 2) RWSEM_READER_OWNED
+ * - lock is currently or previously owned by readers (lock is free
+ * or not set by owner yet)
+ * 3) Other non-zero value
+ * - a writer owns the lock
+ */
+#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
- sem->owner = current;
+ WRITE_ONCE(sem->owner, current);
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
- sem->owner = NULL;
+ WRITE_ONCE(sem->owner, NULL);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * We check the owner value first to make sure that we will only
+ * do a write to the rwsem cacheline when it is really necessary
+ * to minimize cacheline contention.
+ */
+ if (sem->owner != RWSEM_READER_OWNED)
+ WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
+}
+
+static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+{
+ return owner && owner != RWSEM_READER_OWNED;
}
+static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+{
+ return owner == RWSEM_READER_OWNED;
+}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
@@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+}
#endif
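
With the three owner states described above, an optimistic spinner can decide cheaply whether spinning is worthwhile; readers never record themselves individually, so RWSEM_READER_OWNED means there is no single task to watch. An illustrative decision based on the helpers above (a simplified version of rwsem_can_spin_on_owner() earlier in this patch):

	static bool example_worth_spinning(struct rw_semaphore *sem)
	{
		struct task_struct *owner = READ_ONCE(sem->owner);

		if (rwsem_owner_is_reader(owner))
			return false;	/* readers own it: cannot tell if they run */

		/* NULL (free or not yet recorded) or a writer task */
		return true;
	}
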
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..251d16b4cb41 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
- return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
#ifdef CONFIG_ZONE_DEVICE
static DEFINE_MUTEX(pgmap_lock);
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
- dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
- __func__);
- return ERR_PTR(-ENXIO);
- }
-
if (!ref)
return ERR_PTR(-EINVAL);
@@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns;
}
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
{
/*
@@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..eb4f717705ba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,8 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+KASAN_SANITIZE_snapshot.o := n
+
obj-y += qos.o
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..0e781798b0b3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
return ret;
}
-int pm_prepare_console(void)
+void pm_prepare_console(void)
{
if (!pm_vt_switch())
- return 0;
+ return;
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
- return 1;
+ return;
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
- return 0;
+ return;
}
void pm_restore_console(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..a881c6a7ba74 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -52,6 +52,7 @@ enum {
#ifdef CONFIG_SUSPEND
HIBERNATION_SUSPEND,
#endif
+ HIBERNATION_TEST_RESUME,
/* keep last */
__HIBERNATION_AFTER_LAST
};
@@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode)
goto Close;
}
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+ return disable_nonboot_cpus();
+}
+
/**
* resume_target_kernel - Restore system state from a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
- error = disable_nonboot_cpus();
+ error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -642,12 +648,39 @@ static void power_down(void)
cpu_relax();
}
+static int load_image_and_restore(void)
+{
+ int error;
+ unsigned int flags;
+
+ pr_debug("PM: Loading hibernation image.\n");
+
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Unlock;
+
+ error = swsusp_read(&flags);
+ swsusp_close(FMODE_READ);
+ if (!error)
+ hibernation_restore(flags & SF_PLATFORM_MODE);
+
+ printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+ swsusp_free();
+ free_basic_memory_bitmaps();
+ Unlock:
+ unlock_device_hotplug();
+
+ return error;
+}
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
int hibernate(void)
{
- int error;
+ int error, nr_calls = 0;
+ bool snapshot_test = false;
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +695,11 @@ int hibernate(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Exit;
+ }
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
@@ -697,8 +732,12 @@ int hibernate(void)
pr_debug("PM: writing image.\n");
error = swsusp_write(flags);
swsusp_free();
- if (!error)
- power_down();
+ if (!error) {
+ if (hibernation_mode == HIBERNATION_TEST_RESUME)
+ snapshot_test = true;
+ else
+ power_down();
+ }
in_suspend = 0;
pm_restore_gfp_mask();
} else {
@@ -709,12 +748,18 @@ int hibernate(void)
free_basic_memory_bitmaps();
Thaw:
unlock_device_hotplug();
+ if (snapshot_test) {
+ pr_debug("PM: Checking hibernation image\n");
+ error = swsusp_check();
+ if (!error)
+ error = load_image_and_restore();
+ }
thaw_processes();
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
@@ -740,8 +785,7 @@ int hibernate(void)
*/
static int software_resume(void)
{
- int error;
- unsigned int flags;
+ int error, nr_calls = 0;
/*
* If the user said "noresume".. bail out early.
@@ -827,35 +871,20 @@ static int software_resume(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Close_Finish;
+ }
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
-
- pr_debug("PM: Loading hibernation image.\n");
-
- lock_device_hotplug();
- error = create_basic_memory_bitmaps();
- if (error)
- goto Thaw;
-
- error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
- if (!error)
- hibernation_restore(flags & SF_PLATFORM_MODE);
-
- printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
- swsusp_free();
- free_basic_memory_bitmaps();
- Thaw:
- unlock_device_hotplug();
+ error = load_image_and_restore();
thaw_processes();
Finish:
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
@@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = {
#ifdef CONFIG_SUSPEND
[HIBERNATION_SUSPEND] = "suspend",
#endif
+ [HIBERNATION_TEST_RESUME] = "test_resume",
};
/*
@@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
@@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str)
static int __init hibernate_setup(char *str)
{
- if (!strncmp(str, "noresume", 8))
+ if (!strncmp(str, "noresume", 8)) {
noresume = 1;
- else if (!strncmp(str, "nocompress", 10))
+ } else if (!strncmp(str, "nocompress", 10)) {
nocompress = 1;
- else if (!strncmp(str, "no", 2)) {
+ } else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
+ } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+ && !strncmp(str, "protect_image", 13)) {
+ enable_restore_image_protection();
}
return 1;
}
@@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str)
return 1;
}
-static int __init kaslr_nohibernate_setup(char *str)
-{
- return nohibernate_setup(str);
-}
-
static int __init page_poison_nohibernate_setup(char *str)
{
#ifdef CONFIG_PAGE_POISONING_ZERO
@@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
-__setup("kaslr", kaslr_nohibernate_setup);
__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975eff0..5ea50b1b7595 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
{
- int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+ int ret;
+
+ ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+ nr_to_call, nr_calls);
return notifier_to_errno(ret);
}
+int pm_notifier_call_chain(unsigned long val)
+{
+ return __pm_notifier_call_chain(val, -1, NULL);
+}
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
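
The split into __pm_notifier_call_chain() lets callers record how many notifiers actually ran, so that on failure only those callbacks receive the matching POST event. Condensed from the hibernate() changes earlier in this patch:

	int error, nr_calls = 0;

	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
	if (error) {
		nr_calls--;		/* the callback that failed must not see POST */
		goto Exit;
	}
	/* ... hibernation work ... */
 Exit:
	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
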
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b17c88..242d8b827dd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+extern int hibernate_resume_nonboot_cpu_disable(void);
+
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
+#ifdef CONFIG_DEBUG_RODATA
+/* kernel/power/snapshot.c */
+extern void enable_restore_image_protection(void);
+#else
+static inline void enable_restore_image_protection(void) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
@@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+ int *nr_calls);
extern int pm_notifier_call_chain(unsigned long val);
#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0c2ee9761d57..8f27d5a8adf6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
+ if (wq_busy)
+ show_workqueue_state();
+
if (!wakeup) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,6 +38,43 @@
#include "power.h"
+#ifdef CONFIG_DEBUG_RODATA
+static bool hibernate_restore_protection;
+static bool hibernate_restore_protection_active;
+
+void enable_restore_image_protection(void)
+{
+ hibernate_restore_protection = true;
+}
+
+static inline void hibernate_restore_protection_begin(void)
+{
+ hibernate_restore_protection_active = hibernate_restore_protection;
+}
+
+static inline void hibernate_restore_protection_end(void)
+{
+ hibernate_restore_protection_active = false;
+}
+
+static inline void hibernate_restore_protect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ set_memory_ro((unsigned long)page_address, 1);
+}
+
+static inline void hibernate_restore_unprotect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ set_memory_rw((unsigned long)page_address, 1);
+}
+#else
+static inline void hibernate_restore_protection_begin(void) {}
+static inline void hibernate_restore_protection_end(void) {}
+static inline void hibernate_restore_protect_page(void *page_address) {}
+static inline void hibernate_restore_unprotect_page(void *page_address) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
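
When booted with "hibernate=protect_image" (see the hibernate_setup() change above), the helpers added here flip restored page frames read-only as soon as their contents are in place, so a stray write later in the restore faults instead of silently corrupting the image. A hypothetical sketch of the intended call pattern (the example_ helpers are made up; the real call sites are elsewhere in this file):

	static void example_restore_loop(void)
	{
		void *addr;

		hibernate_restore_protection_begin();
		while ((addr = example_next_restore_page()) != NULL) {
			example_copy_image_data(addr);		/* hypothetical copy step */
			hibernate_restore_protect_page(addr);	/* now read-only */
		}
		hibernate_restore_protection_end();
	}
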
@@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void)
image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
}
-/* List of PBEs needed for restoring the pages that were allocated before
+/*
+ * List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
* directly to their "original" page frames.
*/
struct pbe *restore_pblist;
-/* Pointer to an auxiliary buffer (1 page) */
-static void *buffer;
+/* struct linked_page is used to build chains of pages */
-/**
- * @safe_needed - on resume, for storing the PBE list and the image,
- * we can only use memory pages that do not conflict with the pages
- * used before suspend. The unsafe pages have PageNosaveFree set
- * and we count them using unsafe_pages.
- *
- * Each allocated image page is marked as PageNosave and PageNosaveFree
- * so that swsusp_free() can release it.
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+/*
+ * List of "safe" pages (ie. pages that were not used by the image kernel
+ * before hibernation) that may be used as temporary storage for image kernel
+ * memory contents.
*/
+static struct linked_page *safe_pages_list;
+
+/* Pointer to an auxiliary buffer (1 page) */
+static void *buffer;
#define PG_ANY 0
#define PG_SAFE 1
@@ -94,6 +138,19 @@ static void *buffer;
static unsigned int allocated_unsafe_pages;
+/**
+ * get_image_page - Allocate a page for a hibernation image.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages that were not used before hibernation (restore only)
+ *
+ * During image restoration, for storing the PBE list and the image data, we can
+ * only use memory pages that do not conflict with the pages used before
+ * hibernation. The "unsafe" pages have PageNosaveFree set and we count them
+ * using allocated_unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree so that
+ * swsusp_free() can release it.
+ */
static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
@@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
return res;
}
+static void *__get_safe_page(gfp_t gfp_mask)
+{
+ if (safe_pages_list) {
+ void *ret = safe_pages_list;
+
+ safe_pages_list = safe_pages_list->next;
+ memset(ret, 0, PAGE_SIZE);
+ return ret;
+ }
+ return get_image_page(gfp_mask, PG_SAFE);
+}
+
unsigned long get_safe_page(gfp_t gfp_mask)
{
- return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+ return (unsigned long)__get_safe_page(gfp_mask);
}
static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
return page;
}
+static void recycle_safe_page(void *page_address)
+{
+ struct linked_page *lp = page_address;
+
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
+}
+
/**
- * free_image_page - free page represented by @addr, allocated with
- * get_image_page (page flags set by it must be cleared)
+ * free_image_page - Free a page allocated for hibernation image.
+ * @addr: Address of the page to free.
+ * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
+ *
+ * The page to free should have been allocated by get_image_page() (page flags
+ * set by it are affected).
*/
-
static inline void free_image_page(void *addr, int clear_nosave_free)
{
struct page *page;
@@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
__free_page(page);
}
-/* struct linked_page is used to build chains of pages */
-
-#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
-
-struct linked_page {
- struct linked_page *next;
- char data[LINKED_PAGE_DATA_SIZE];
-} __packed;
-
-static inline void
-free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+static inline void free_list_of_pages(struct linked_page *list,
+ int clear_page_nosave)
{
while (list) {
struct linked_page *lp = list->next;
@@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)
}
}
-/**
- * struct chain_allocator is used for allocating small objects out of
- * a linked list of pages called 'the chain'.
- *
- * The chain grows each time when there is no room for a new object in
- * the current page. The allocated objects cannot be freed individually.
- * It is only possible to free them all at once, by freeing the entire
- * chain.
- *
- * NOTE: The chain allocator may be inefficient if the allocated objects
- * are not much smaller than PAGE_SIZE.
- */
-
+/*
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
struct chain_allocator {
struct linked_page *chain; /* the chain */
unsigned int used_space; /* total size of objects allocated out
- * of the current page
- */
+ of the current page */
gfp_t gfp_mask; /* mask for allocating pages */
int safe_needed; /* if set, only "safe" pages are allocated */
};
-static void
-chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
+ int safe_needed)
{
ca->chain = NULL;
ca->used_space = LINKED_PAGE_DATA_SIZE;
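
The chain allocator described above hands out small objects from a linked list of pages; individual objects cannot be freed, only the whole chain at once. A short usage sketch under that assumption (the object size and flags are illustrative; PG_ANY and PG_UNSAFE_CLEAR are the existing constants in this file):

	static int example_use_chain(void)
	{
		struct chain_allocator ca;
		void *obj;

		chain_init(&ca, GFP_KERNEL, PG_ANY);

		obj = chain_alloc(&ca, 64);	/* one small object from the chain */
		if (!obj)
			return -ENOMEM;

		/* ... more chain_alloc() calls; none can be freed individually ... */

		free_list_of_pages(ca.chain, PG_UNSAFE_CLEAR);	/* drop the whole chain */
		return 0;
	}
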
@@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
struct linked_page *lp;
- lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+ lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
+ get_image_page(ca->gfp_mask, PG_ANY);
if (!lp)
return NULL;
@@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
}
/**
- * Data types related to memory bitmaps.
+ * Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
- * objects. The main list's elements are of type struct zone_bitmap
- * and each of them corresonds to one zone. For each zone bitmap
- * object there is a list of objects of type struct bm_block that
- * represent each blocks of bitmap in which information is stored.
+ * Memory bitmap is a structure consisting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each block of the bitmap in which information is stored.
*
- * struct memory_bitmap contains a pointer to the main list of zone
- * bitmap objects, a struct bm_position used for browsing the bitmap,
- * and a pointer to the list of pages used for allocating all of the
- * zone bitmap objects and bitmap block objects.
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
*
- * NOTE: It has to be possible to lay out the bitmap in memory
- * using only allocations of order 0. Additionally, the bitmap is
- * designed to work with arbitrary number of zones (this is over the
- * top for now, but let's avoid making unnecessary assumptions ;-).
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
*
- * struct zone_bitmap contains a pointer to a list of bitmap block
- * objects and a pointer to the bitmap block object that has been
- * most recently used for setting bits. Additionally, it contains the
- * pfns that correspond to the start and end of the represented zone.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * PFNs that correspond to the start and end of the represented zone.
*
- * struct bm_block contains a pointer to the memory page in which
- * information is stored (in the form of a block of bitmap)
- * It also contains the pfns that correspond to the start and end of
- * the represented memory area.
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a bitmap block).
+ * It also contains the PFNs that correspond to the start and end of
+ * the represented memory area.
*
- * The memory bitmap is organized as a radix tree to guarantee fast random
- * access to the bits. There is one radix tree for each zone (as returned
- * from create_mem_extents).
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
*
- * One radix tree is represented by one struct mem_zone_bm_rtree. There are
- * two linked lists for the nodes of the tree, one for the inner nodes and
- * one for the leave nodes. The linked leave nodes are used for fast linear
- * access of the memory bitmap.
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leaf nodes. The linked leaf nodes are used for fast linear
+ * access to the memory bitmap.
*
- * The struct rtree_node represents one node of the radix tree.
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
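
As an aside, a sketch of how a lookup presumably turns a zone-relative PFN into per-level indices; BM_BLOCK_SHIFT and the helper itself are assumptions, while BM_RTREE_LEVEL_SHIFT and BM_RTREE_LEVEL_MASK appear just below:

/* Illustrative only -- mirrors the per-level step of memory_bm_find_bit(). */
static unsigned int rtree_level_index(unsigned long pfn, unsigned long start_pfn,
				      unsigned int level)
{
	/* BM_BLOCK_SHIFT (assumed) strips the bits addressed inside one leaf block. */
	unsigned long block_nr = (pfn - start_pfn) >> BM_BLOCK_SHIFT;

	/* Each tree level consumes BM_RTREE_LEVEL_SHIFT bits of the block number. */
	return (block_nr >> (level * BM_RTREE_LEVEL_SHIFT)) & BM_RTREE_LEVEL_MASK;
}
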
@@ -305,9 +375,8 @@ struct bm_position {
struct memory_bitmap {
struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
- * bitmap objects and bitmap block
- * objects
- */
+ bitmap objects and bitmap block
+ objects */
struct bm_position cur; /* most recently used bit position */
};
@@ -321,12 +390,12 @@ struct memory_bitmap {
#endif
#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
-/*
- * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+/**
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
*
- * This function is used to allocate inner nodes as well as the
- * leave nodes of the radix tree. It also adds the node to the
- * corresponding linked list passed in by the *list parameter.
+ * This function is used to allocate inner nodes as well as the
+ * leaf nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
*/
static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
struct chain_allocator *ca,
@@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
return node;
}
-/*
- * add_rtree_block - Add a new leave node to the radix tree
+/**
+ * add_rtree_block - Add a new leave node to the radix tree.
*
- * The leave nodes need to be allocated in order to keep the leaves
- * linked list in order. This is guaranteed by the zone->blocks
- * counter.
+ * The leaf nodes need to be allocated in order to keep the linked
+ * list of leaves ordered. This is guaranteed by the zone->blocks
+ * counter.
*/
static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
int safe_needed, struct chain_allocator *ca)
@@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free);
-/*
- * create_zone_bm_rtree - create a radix tree for one zone
+/**
+ * create_zone_bm_rtree - Create a radix tree for one zone.
*
- * Allocated the mem_zone_bm_rtree structure and initializes it.
- * This function also allocated and builds the radix tree for the
- * zone.
+ * Allocates the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocates and builds the radix tree for the
+ * zone.
*/
-static struct mem_zone_bm_rtree *
-create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
- struct chain_allocator *ca,
- unsigned long start, unsigned long end)
+static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
+ int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start,
+ unsigned long end)
{
struct mem_zone_bm_rtree *zone;
unsigned int i, nr_blocks;
@@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
return zone;
}
-/*
- * free_zone_bm_rtree - Free the memory of the radix tree
+/**
+ * free_zone_bm_rtree - Free the memory of the radix tree.
*
- * Free all node pages of the radix tree. The mem_zone_bm_rtree
- * structure itself is not freed here nor are the rtree_node
- * structs.
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
*/
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free)
@@ -492,8 +562,8 @@ struct mem_extent {
};
/**
- * free_mem_extents - free a list of memory extents
- * @list - list of extents to empty
+ * free_mem_extents - Free a list of memory extents.
+ * @list: List of extents to free.
*/
static void free_mem_extents(struct list_head *list)
{
@@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list)
}
/**
- * create_mem_extents - create a list of memory extents representing
- * contiguous ranges of PFNs
- * @list - list to put the extents into
- * @gfp_mask - mask to use for memory allocations
+ * create_mem_extents - Create a list of memory extents.
+ * @list: List to put the extents into.
+ * @gfp_mask: Mask to use for memory allocations.
+ *
+ * The extents represent contiguous ranges of PFNs.
*/
static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
{
@@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
}
/**
- * memory_bm_create - allocate memory for a memory bitmap
- */
-static int
-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+ * memory_bm_create - Allocate memory for a memory bitmap.
+ */
+static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+ int safe_needed)
{
struct chain_allocator ca;
struct list_head mem_extents;
@@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
/**
- * memory_bm_free - free memory occupied by the memory bitmap @bm
- */
+ * memory_bm_free - Free memory occupied by the memory bitmap.
+ * @bm: Memory bitmap.
+ */
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
struct mem_zone_bm_rtree *zone;
@@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
}
/**
- * memory_bm_find_bit - Find the bit for pfn in the memory
- * bitmap
+ * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
*
- * Find the bit in the bitmap @bm that corresponds to given pfn.
- * The cur.zone, cur.block and cur.node_pfn member of @bm are
- * updated.
- * It walks the radix tree to find the page which contains the bit for
- * pfn and returns the bit position in **addr and *bit_nr.
+ * Find the bit in memory bitmap @bm that corresponds to the given PFN.
+ * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
+ *
+ * Walk the radix tree to find the page containing the bit that represents @pfn,
+ * return the address of that page in @addr and the bit's position in @bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
@@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
zone_found:
/*
- * We have a zone. Now walk the radix tree to find the leave
- * node for our pfn.
+ * We have found the zone. Now walk the radix tree to find the leaf node
+ * for our PFN.
*/
-
node = bm->cur.node;
if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
@@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
}
/*
- * rtree_next_node - Jumps to the next leave node
+ * rtree_next_node - Jump to the next leaf node.
*
- * Sets the position to the beginning of the next node in the
- * memory bitmap. This is either the next node in the current
- * zone's radix tree or the first node in the radix tree of the
- * next zone.
+ * Set the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * Returns true if there is a next node, false otherwise.
+ * Return true if there is a next node, false otherwise.
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
@@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
+ * @bm: Memory bitmap.
*
- * Starting from the last returned position this function searches
- * for the next set bit in the memory bitmap and returns its
- * number. If no more bit is set BM_END_OF_MAP is returned.
+ * Starting from the last returned position this function searches for the next
+ * set bit in @bm and returns the PFN represented by it. If no more bits are
+ * set, BM_END_OF_MAP is returned.
*
- * It is required to run memory_bm_position_reset() before the
- * first call to this function.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function for the given memory bitmap.
*/
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
@@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
return BM_END_OF_MAP;
}
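
The reset-then-iterate protocol spelled out above is the same one duplicate_memory_bitmap() further down relies on; a minimal sketch of a consumer (the helper is illustrative, not part of the patch):

/* Count the set bits (marked page frames) in a bitmap -- illustrative only. */
static unsigned int example_count_set_bits(struct memory_bitmap *bm)
{
	unsigned long pfn;
	unsigned int n = 0;

	memory_bm_position_reset(bm);	/* mandatory before the first next_pfn call */
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm))
		n++;

	return n;
}
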
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
+/*
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during hibernation.
*/
-
struct nosave_region {
struct list_head list;
unsigned long start_pfn;
@@ -832,15 +902,42 @@ struct nosave_region {
static LIST_HEAD(nosave_regions);
+static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ recycle_safe_page(node->data);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ recycle_safe_page(node->data);
+}
+
+static void memory_bm_recycle(struct memory_bitmap *bm)
+{
+ struct mem_zone_bm_rtree *zone;
+ struct linked_page *p_list;
+
+ list_for_each_entry(zone, &bm->zones, list)
+ recycle_zone_bm_rtree(zone);
+
+ p_list = bm->p_list;
+ while (p_list) {
+ struct linked_page *lp = p_list;
+
+ p_list = lp->next;
+ recycle_safe_page(lp);
+ }
+}
+
/**
- * register_nosave_region - register a range of page frames the contents
- * of which should not be saved during the suspend (to be used in the early
- * initialization code)
+ * register_nosave_region - Register a region of unsaveable memory.
+ *
+ * Register a range of page frames the contents of which should not be saved
+ * during hibernation (to be used in the early initialization code).
*/
-
-void __init
-__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
- int use_kmalloc)
+void __init __register_nosave_region(unsigned long start_pfn,
+ unsigned long end_pfn, int use_kmalloc)
{
struct nosave_region *region;
@@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
}
}
if (use_kmalloc) {
- /* during init, this shouldn't fail */
+ /* During init, this shouldn't fail */
region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
BUG_ON(!region);
- } else
+ } else {
/* This allocation cannot fail */
region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+ }
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
@@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
}
/**
- * mark_nosave_pages - set bits corresponding to the page frames the
- * contents of which should not be saved in a given bitmap.
+ * mark_nosave_pages - Mark pages that should not be saved.
+ * @bm: Memory bitmap.
+ *
+ * Set the bits in @bm that correspond to the page frames the contents of which
+ * should not be saved.
*/
-
static void mark_nosave_pages(struct memory_bitmap *bm)
{
struct nosave_region *region;
@@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
/**
- * create_basic_memory_bitmaps - create bitmaps needed for marking page
- * frames that should not be saved and free page frames. The pointers
- * forbidden_pages_map and free_pages_map are only modified if everything
- * goes well, because we don't want the bits to be used before both bitmaps
- * are set up.
+ * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
+ *
+ * Create bitmaps needed for marking page frames that should not be saved and
+ * free page frames. The forbidden_pages_map and free_pages_map pointers are
+ * only modified if everything goes well, because we don't want the bits to be
+ * touched before both bitmaps are set up.
*/
-
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void)
}
/**
- * free_basic_memory_bitmaps - free memory bitmaps allocated by
- * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
- * so that the bitmaps themselves are not referred to while they are being
- * freed.
+ * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
+ *
+ * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The
+ * auxiliary pointers are necessary so that the bitmaps themselves are not
+ * referred to while they are being freed.
*/
-
void free_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void)
}
/**
- * snapshot_additional_pages - estimate the number of additional pages
- * be needed for setting up the suspend image data structures for given
- * zone (usually the returned value is greater than the exact number)
+ * snapshot_additional_pages - Estimate the number of extra pages needed.
+ * @zone: Memory zone to carry out the computation for.
+ *
+ * Estimate the number of additional pages needed for setting up the hibernation
+ * image data structures for @zone (usually, the returned value is greater than
+ * the exact number).
*/
-
unsigned int snapshot_additional_pages(struct zone *zone)
{
unsigned int rtree, nodes;
@@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
#ifdef CONFIG_HIGHMEM
/**
- * count_free_highmem_pages - compute the total number of free highmem
- * pages, system-wide.
+ * count_free_highmem_pages - Compute the total number of free highmem pages.
+ *
+ * The returned number is system-wide.
*/
-
static unsigned int count_free_highmem_pages(void)
{
struct zone *zone;
@@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void)
}
/**
- * saveable_highmem_page - Determine whether a highmem page should be
- * included in the suspend image.
+ * saveable_highmem_page - Check if a highmem page is saveable.
*
- * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
- * and it isn't a part of a free chunk of pages.
+ * Determine whether a highmem page should be included in a hibernation image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't part of a free chunk of pages.
*/
static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
@@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_highmem_pages - compute the total number of saveable highmem
- * pages.
+ * count_highmem_pages - Compute the total number of saveable highmem pages.
*/
-
static unsigned int count_highmem_pages(void)
{
struct zone *zone;
@@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
#endif /* CONFIG_HIGHMEM */
/**
- * saveable_page - Determine whether a non-highmem page should be included
- * in the suspend image.
+ * saveable_page - Check if the given page is saveable.
*
- * We should save the page if it isn't Nosave, and is not in the range
- * of pages statically defined as 'unsaveable', and it isn't a part of
- * a free chunk of pages.
+ * Determine whether a non-highmem page should be included in a hibernation
+ * image.
+ *
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't part of
+ * a free chunk of pages.
*/
static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
@@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_data_pages - compute the total number of saveable non-highmem
- * pages.
+ * count_data_pages - Compute the total number of saveable non-highmem pages.
*/
-
static unsigned int count_data_pages(void)
{
struct zone *zone;
@@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void)
return n;
}
-/* This is needed, because copy_page and memcpy are not usable for copying
+/*
+ * This is needed, because copy_page and memcpy are not usable for copying
* task structs.
*/
static inline void do_copy_page(long *dst, long *src)
@@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src)
*dst++ = *src++;
}
-
/**
- * safe_copy_page - check if the page we are going to copy is marked as
- * present in the kernel page tables (this always is the case if
- * CONFIG_DEBUG_PAGEALLOC is not set and in that case
- * kernel_page_present() always returns 'true').
+ * safe_copy_page - Copy a page in a safe way.
+ *
+ * Check if the page we are going to copy is marked as present in the kernel
+ * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
+ * and in that case kernel_page_present() always returns 'true').
*/
static void safe_copy_page(void *dst, struct page *s_page)
{
@@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page)
}
}
-
#ifdef CONFIG_HIGHMEM
-static inline struct page *
-page_is_saveable(struct zone *zone, unsigned long pfn)
+static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
{
return is_highmem(zone) ?
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
kunmap_atomic(src);
} else {
if (PageHighMem(d_page)) {
- /* Page pointed to by src may contain some kernel
+ /*
+ * The page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
safe_copy_page(buffer, s_page);
@@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
}
#endif /* CONFIG_HIGHMEM */
-static void
-copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+static void copy_data_pages(struct memory_bitmap *copy_bm,
+ struct memory_bitmap *orig_bm)
{
struct zone *zone;
unsigned long pfn;
@@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm;
static struct memory_bitmap copy_bm;
/**
- * swsusp_free - free pages allocated for the suspend.
+ * swsusp_free - Free pages allocated for hibernation image.
*
- * Suspend pages are alocated before the atomic copy is made, so we
- * need to release them after the resume.
+ * Image pages are allocated before snapshot creation, so they need to be
+ * released after resume.
*/
-
void swsusp_free(void)
{
unsigned long fb_pfn, fr_pfn;
@@ -1351,6 +1451,7 @@ loop:
memory_bm_clear_current(forbidden_pages_map);
memory_bm_clear_current(free_pages_map);
+ hibernate_restore_unprotect_page(page_address(page));
__free_page(page);
goto loop;
}
@@ -1362,6 +1463,7 @@ out:
buffer = NULL;
alloc_normal = 0;
alloc_highmem = 0;
+ hibernate_restore_protection_end();
}
/* Helper functions used for the shrinking of memory. */
@@ -1369,7 +1471,7 @@ out:
#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
/**
- * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * preallocate_image_pages - Allocate a number of pages for hibernation image.
* @nr_pages: Number of page frames to allocate.
* @mask: GFP flags to use for the allocation.
*
@@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
/**
- * __fraction - Compute (an approximation of) x * (multiplier / base)
+ * __fraction - Compute (an approximation of) x * (multiplier / base).
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
@@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
}
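
The body of __fraction() falls outside this hunk; as a hedged reading of the kernel-doc above, it scales x by multiplier/base with a 64-bit multiply followed by do_div(), roughly:

/* Assumed behaviour, not copied from the patch: x * (multiplier / base), rounded down. */
static unsigned long fraction_sketch(u64 x, u64 multiplier, u64 base)
{
	x *= multiplier;
	do_div(x, base);	/* do_div() divides x in place, returning the remainder */
	return (unsigned long)x;
}
/* e.g. fraction_sketch(1000, 1, 4) == 250 */
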
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
unsigned long alloc = __fraction(nr_pages, highmem, total);
@@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
return 0;
}
#endif /* CONFIG_HIGHMEM */
/**
- * free_unnecessary_pages - Release preallocated pages not needed for the image
+ * free_unnecessary_pages - Release preallocated pages not needed for the image.
*/
static unsigned long free_unnecessary_pages(void)
{
@@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void)
}
/**
- * minimum_image_size - Estimate the minimum acceptable size of an image
+ * minimum_image_size - Estimate the minimum acceptable size of an image.
* @saveable: Number of saveable pages in the system.
*
* We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable)
unsigned long size;
size = global_page_state(NR_SLAB_RECLAIMABLE)
- + global_page_state(NR_ACTIVE_ANON)
- + global_page_state(NR_INACTIVE_ANON)
- + global_page_state(NR_ACTIVE_FILE)
- + global_page_state(NR_INACTIVE_FILE)
- - global_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_ACTIVE_ANON)
+ + global_node_page_state(NR_INACTIVE_ANON)
+ + global_node_page_state(NR_ACTIVE_FILE)
+ + global_node_page_state(NR_INACTIVE_FILE)
+ - global_node_page_state(NR_FILE_MAPPED);
return saveable <= size ? 0 : saveable - size;
}
/**
- * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image.
*
* To create a hibernation image it is necessary to make a copy of every page
* frame in use. We also need a number of page frames to be free during
@@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void)
#ifdef CONFIG_HIGHMEM
/**
- * count_pages_for_highmem - compute the number of non-highmem pages
- * that will be necessary for creating copies of highmem pages.
- */
-
+ * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
+ *
+ * Compute the number of non-highmem pages that will be necessary for creating
+ * copies of highmem pages.
+ */
static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
{
unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
return nr_highmem;
}
#else
-static unsigned int
-count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * enough_free_mem - Make sure we have enough free memory for the
- * snapshot image.
+ * enough_free_mem - Check if there is enough free memory for the image.
*/
-
static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
{
struct zone *zone;
@@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
#ifdef CONFIG_HIGHMEM
/**
- * get_highmem_buffer - if there are some highmem pages in the suspend
- * image, we may need the buffer to copy them and/or load their data.
+ * get_highmem_buffer - Allocate a buffer for highmem pages.
+ *
+ * If there are some highmem pages in the hibernation image, we may need a
+ * buffer to copy them and/or load their data.
*/
-
static inline int get_highmem_buffer(int safe_needed)
{
buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - allocate some highmem pages for the image.
- * Try to allocate as many pages as needed, but if the number of free
- * highmem pages is lesser than that, allocate them all.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
+ *
+ * Try to allocate as many pages as needed, but if the number of free highmem
+ * pages is less than that, allocate them all.
*/
-
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int nr_highmem)
{
unsigned int to_alloc = count_free_highmem_pages();
@@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
#else
static inline int get_highmem_buffer(int safe_needed) { return 0; }
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int n) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * swsusp_alloc - allocate memory for the suspend image
+ * swsusp_alloc - Allocate memory for hibernation image.
*
- * We first try to allocate as many highmem pages as there are
- * saveable highmem pages in the system. If that fails, we allocate
- * non-highmem pages for the copies of the remaining highmem ones.
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
*
- * In this approach it is likely that the copies of highmem pages will
- * also be located in the high memory, because of the way in which
- * copy_data_pages() works.
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
*/
-
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
- unsigned int nr_pages, unsigned int nr_highmem)
+static int swsusp_alloc(struct memory_bitmap *orig_bm,
+ struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
{
if (nr_highmem > 0) {
if (get_highmem_buffer(PG_ANY))
@@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void)
return -ENOMEM;
}
- /* During allocating of suspend pagedir, new cold pages may appear.
+ /*
+ * During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages(NULL);
@@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info)
}
/**
- * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
- * are stored in the array @buf[] (1 page at a time)
+ * pack_pfns - Prepare PFNs for saving.
+ * @bm: Memory bitmap.
+ * @buf: Memory buffer to store the PFNs in.
+ *
+ * PFNs corresponding to set bits in @bm are stored in the area of memory
+ * pointed to by @buf (1 page at a time).
*/
-
-static inline void
-pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
@@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
}
/**
- * snapshot_read_next - used for reading the system memory snapshot.
+ * snapshot_read_next - Get the address to read the next image page from.
+ * @handle: Snapshot handle to be used for the reading.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure is then populated and a pointer to it should be
+ * passed to this function on every subsequent call.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to read up to the returned number of bytes from the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the end of data stream condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the end of the data stream condition,
+ * and negative numbers are returned on errors. If that happens, the structure
+ * pointed to by @handle is not updated and should not be used any more.
*/
-
int snapshot_read_next(struct snapshot_handle *handle)
{
if (handle->cur > nr_meta_pages + nr_copy_pages)
@@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle)
page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
if (PageHighMem(page)) {
- /* Highmem pages are copied to the buffer,
+ /*
+ * Highmem pages are copied to the buffer,
* because we can't return with a kmapped
* highmem page (we may not be called again).
*/
@@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
return PAGE_SIZE;
}
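
The calling convention documented above maps onto a simple drain loop; a hedged sketch of a caller (the buffer handling is illustrative, data_of() is the macro referenced in the kernel-doc):

/* Illustrative consumer of the snapshot_read_next() protocol. */
static int example_drain_snapshot(struct snapshot_handle *handle, void *buf)
{
	int n;

	memset(handle, 0, sizeof(*handle));	/* the first call needs a zeroed handle */
	for (;;) {
		n = snapshot_read_next(handle);
		if (n <= 0)
			return n;	/* 0: end of the data stream, < 0: error */
		memcpy(buf, data_of(*handle), n);	/* up to n bytes are valid */
	}
}
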
-/**
- * mark_unsafe_pages - mark the pages that cannot be used for storing
- * the image during resume, because they conflict with the pages that
- * had been used before suspend
- */
-
-static int mark_unsafe_pages(struct memory_bitmap *bm)
+static void duplicate_memory_bitmap(struct memory_bitmap *dst,
+ struct memory_bitmap *src)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn;
- /* Clear page flags */
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn))
- swsusp_unset_page_free(pfn_to_page(pfn));
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
-
- /* Mark pages that correspond to the "original" pfns as "unsafe" */
- memory_bm_position_reset(bm);
- do {
- pfn = memory_bm_next_pfn(bm);
- if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)))
- swsusp_set_page_free(pfn_to_page(pfn));
- else
- return -EFAULT;
- }
- } while (pfn != BM_END_OF_MAP);
-
- allocated_unsafe_pages = 0;
-
- return 0;
}
-static void
-duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
+/**
+ * mark_unsafe_pages - Mark pages that were used before hibernation.
+ *
+ * Mark the pages that cannot be used for storing the image during restoration,
+ * because they conflict with the pages that had been used before hibernation.
+ */
+static void mark_unsafe_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
- memory_bm_position_reset(src);
- pfn = memory_bm_next_pfn(src);
+ /* Clear the "free"/"unsafe" bit for all PFNs */
+ memory_bm_position_reset(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
while (pfn != BM_END_OF_MAP) {
- memory_bm_set_bit(dst, pfn);
- pfn = memory_bm_next_pfn(src);
+ memory_bm_clear_current(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
}
+
+ /* Mark pages that correspond to the "original" PFNs as "unsafe" */
+ duplicate_memory_bitmap(free_pages_map, bm);
+
+ allocated_unsafe_pages = 0;
}
static int check_header(struct swsusp_info *info)
@@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - check the image header and copy data from it
+ * load_header - Check the image header and copy the data from it.
*/
-
-static int
-load_header(struct swsusp_info *info)
+static int load_header(struct swsusp_info *info)
{
int error;
@@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info)
}
/**
- * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
- * the corresponding bit in the memory bitmap @bm
+ * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
+ * @bm: Memory bitmap.
+ * @buf: Area of memory containing the PFNs.
+ *
+ * For each element of the array pointed to by @buf (1 page at a time), set the
+ * corresponding bit in @bm.
*/
static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
@@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
/* Extract and buffer page key for data page (s390 only). */
page_key_memorize(buf + j);
- if (memory_bm_pfn_present(bm, buf[j]))
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
memory_bm_set_bit(bm, buf[j]);
else
return -EFAULT;
@@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
return 0;
}
-/* List of "safe" pages that may be used to store data loaded from the suspend
- * image
- */
-static struct linked_page *safe_pages_list;
-
#ifdef CONFIG_HIGHMEM
-/* struct highmem_pbe is used for creating the list of highmem pages that
+/*
+ * struct highmem_pbe is used for creating the list of highmem pages that
* should be restored atomically during the resume from disk, because the page
* frames they have occupied before the suspend are in use.
*/
@@ -2120,7 +2209,8 @@ struct highmem_pbe {
struct highmem_pbe *next;
};
-/* List of highmem PBEs needed for restoring the highmem pages that were
+/*
+ * List of highmem PBEs needed for restoring the highmem pages that were
* allocated before the suspend and included in the suspend image, but have
* also been allocated by the "resume" kernel, so their contents cannot be
* written directly to their "original" page frames.
@@ -2128,11 +2218,11 @@ struct highmem_pbe {
static struct highmem_pbe *highmem_pblist;
/**
- * count_highmem_image_pages - compute the number of highmem pages in the
- * suspend image. The bits in the memory bitmap @bm that correspond to the
- * image pages are assumed to be set.
+ * count_highmem_image_pages - Compute the number of highmem pages in the image.
+ * @bm: Memory bitmap.
+ *
+ * The bits in @bm that correspond to image pages are assumed to be set.
*/
-
static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
@@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
return cnt;
}
-/**
- * prepare_highmem_image - try to allocate as many highmem pages as
- * there are highmem image pages (@nr_highmem_p points to the variable
- * containing the number of highmem image pages). The pages that are
- * "safe" (ie. will not be overwritten when the suspend image is
- * restored) have the corresponding bits set in @bm (it must be
- * unitialized).
- *
- * NOTE: This function should not be called if there are no highmem
- * image pages.
- */
-
static unsigned int safe_highmem_pages;
static struct memory_bitmap *safe_highmem_bm;
-static int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+/**
+ * prepare_highmem_image - Allocate memory for loading highmem data from image.
+ * @bm: Pointer to an uninitialized memory bitmap structure.
+ * @nr_highmem_p: Pointer to the number of highmem image pages.
+ *
+ * Try to allocate as many highmem pages as there are highmem image pages
+ * (@nr_highmem_p points to the variable containing the number of highmem image
+ * pages). The pages that are "safe" (i.e. will not be overwritten when the
+ * hibernation image is restored entirely) have the corresponding bits set in
+ * @bm (it must be uninitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem image pages.
+ */
+static int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p)
{
unsigned int to_alloc;
@@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
return 0;
}
+static struct page *last_highmem_page;
+
/**
- * get_highmem_page_buffer - for given highmem image page find the buffer
- * that suspend_write_next() should set for its caller to write to.
+ * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
*
- * If the page is to be saved to its "original" page frame or a copy of
- * the page is to be made in the highmem, @buffer is returned. Otherwise,
- * the copy of the page is to be made in normal memory, so the address of
- * the copy is returned.
+ * For a given highmem image page get a buffer that snapshot_write_next() should
+ * return to its caller to write to.
*
- * If @buffer is returned, the caller of suspend_write_next() will write
- * the page's contents to @buffer, so they will have to be copied to the
- * right location on the next call to suspend_write_next() and it is done
- * with the help of copy_last_highmem_page(). For this purpose, if
- * @buffer is returned, @last_highmem page is set to the page to which
- * the data will have to be copied from @buffer.
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of snapshot_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to snapshot_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
*/
-
-static struct page *last_highmem_page;
-
-static void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
struct highmem_pbe *pbe;
void *kaddr;
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
last_highmem_page = page;
return buffer;
}
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
}
/**
- * copy_last_highmem_page - copy the contents of a highmem image from
- * @buffer, where the caller of snapshot_write_next() has place them,
- * to the right location represented by @last_highmem_page .
+ * copy_last_highmem_page - Copy the most recent highmem image page.
+ *
+ * Copy the contents of a highmem image page from @buffer, where the caller of
+ * snapshot_write_next() has stored them, to the right location represented by
+ * @last_highmem_page.
*/
-
static void copy_last_highmem_page(void)
{
if (last_highmem_page) {
@@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static unsigned int
-count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
-static inline int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
-{
- return 0;
-}
+static inline int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p) { return 0; }
-static inline void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static inline void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
return ERR_PTR(-EINVAL);
}
@@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; }
static inline void free_highmem_data(void) {}
#endif /* CONFIG_HIGHMEM */
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
/**
- * prepare_image - use the memory bitmap @bm to mark the pages that will
- * be overwritten in the process of restoring the system memory state
- * from the suspend image ("unsafe" pages) and allocate memory for the
- * image.
+ * prepare_image - Make room for loading hibernation image.
+ * @new_bm: Uninitialized memory bitmap structure.
+ * @bm: Memory bitmap with unsafe pages marked.
+ *
+ * Use @bm to mark the pages that will be overwritten in the process of
+ * restoring the system memory state from the suspend image ("unsafe" pages)
+ * and allocate memory for the image.
*
- * The idea is to allocate a new memory bitmap first and then allocate
- * as many pages as needed for the image data, but not to assign these
- * pages to specific tasks initially. Instead, we just mark them as
- * allocated and create a lists of "safe" pages that will be used
- * later. On systems with high memory a list of "safe" highmem pages is
- * also created.
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for image data, but without specifying what those
+ * pages will be used for just yet. Instead, we mark them all as allocated and
+ * create a list of "safe" pages to be used later. On systems with high
+ * memory a list of "safe" highmem pages is created too.
*/
-
-#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-
-static int
-prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
{
unsigned int nr_pages, nr_highmem;
- struct linked_page *sp_list, *lp;
+ struct linked_page *lp;
int error;
/* If there is no highmem, the buffer will not be necessary */
@@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
buffer = NULL;
nr_highmem = count_highmem_image_pages(bm);
- error = mark_unsafe_pages(bm);
- if (error)
- goto Free;
+ mark_unsafe_pages(bm);
error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
if (error)
@@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
if (error)
goto Free;
}
- /* Reserve some safe pages for potential later use.
+ /*
+ * Reserve some safe pages for potential later use.
*
* NOTE: This way we make sure there will be enough safe pages for the
* chain_alloc() in get_buffer(). It is a bit wasteful, but
* nr_copy_pages cannot be greater than 50% of the memory anyway.
+ *
+ * nr_copy_pages cannot be less than allocated_unsafe_pages too.
*/
- sp_list = NULL;
- /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
@@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
error = -ENOMEM;
goto Free;
}
- lp->next = sp_list;
- sp_list = lp;
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
nr_pages--;
}
/* Preallocate memory for the image */
- safe_pages_list = NULL;
nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
swsusp_set_page_free(virt_to_page(lp));
nr_pages--;
}
- /* Free the reserved safe pages so that chain_alloc() can use them */
- while (sp_list) {
- lp = sp_list->next;
- free_image_page(sp_list, PG_UNSAFE_CLEAR);
- sp_list = lp;
- }
return 0;
Free:
@@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
}
/**
- * get_buffer - compute the address that snapshot_write_next() should
- * set for its caller to write to.
+ * get_buffer - Get the address to store the next image data page.
+ *
+ * Get the address that snapshot_write_next() should return to its caller to
+ * write to.
*/
-
static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
@@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return get_highmem_page_buffer(page, ca);
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
return page_address(page);
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
}
/**
- * snapshot_write_next - used for writing the system memory snapshot.
+ * snapshot_write_next - Get the address to store the next image page.
+ * @handle: Snapshot handle structure to guide the writing.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure is then populated and a pointer to it should be
+ * passed to this function on every subsequent call.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to write up to the returned number of bytes to the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the "end of file" condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the "end of file" condition. Negative
+ * numbers are returned on errors, in which case the structure pointed to by
+ * @handle is not updated and should not be used any more.
*/
-
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
@@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ safe_pages_list = NULL;
+
error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
if (error)
return error;
@@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ hibernate_restore_protection_begin();
} else if (handle->cur <= nr_meta_pages + 1) {
error = unpack_orig_pfns(buffer, &copy_bm);
if (error)
@@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
copy_last_highmem_page();
/* Restore page key for data page (s390 only). */
page_key_write(handle->buffer);
+ hibernate_restore_protect_page(handle->buffer);
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
@@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle)
}
/**
- * snapshot_write_finalize - must be called after the last call to
- * snapshot_write_next() in case the last page in the image happens
- * to be a highmem page and its contents should be stored in the
- * highmem. Additionally, it releases the memory that will not be
- * used any more.
+ * snapshot_write_finalize - Complete the loading of a hibernation image.
+ *
+ * Must be called after the last call to snapshot_write_next() in case the last
+ * page in the image happens to be a highmem page and its contents should be
+ * stored in highmem. Additionally, it recycles bitmap memory that's not
+ * necessary any more.
*/
-
void snapshot_write_finalize(struct snapshot_handle *handle)
{
copy_last_highmem_page();
/* Restore page key for data page (s390 only). */
page_key_write(handle->buffer);
page_key_free();
- /* Free only if we have loaded the image entirely */
+ hibernate_restore_protect_page(handle->buffer);
+ /* Do that only if we have loaded the image entirely */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
- memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ memory_bm_recycle(&orig_bm);
free_highmem_data();
}
}
@@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
#ifdef CONFIG_HIGHMEM
/* Assumes that @buf is ready and points to a "safe" page */
-static inline void
-swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+static inline void swap_two_pages_data(struct page *p1, struct page *p2,
+ void *buf)
{
void *kaddr1, *kaddr2;
@@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
}
/**
- * restore_highmem - for each highmem page that was allocated before
- * the suspend and included in the suspend image, and also has been
- * allocated by the "resume" kernel swap its current (ie. "before
- * resume") contents with the previous (ie. "before suspend") one.
+ * restore_highmem - Put highmem image pages into their original locations.
+ *
+ * For each highmem page that was in use before hibernation and is included in
+ * the image, and also has been allocated by the "restore" kernel, swap its
+ * current contents with the previous (ie. "before hibernation") ones.
*
- * If the resume eventually fails, we can call this function once
- * again and restore the "before resume" highmem state.
+ * If the restore eventually fails, we can call this function once again and
+ * restore the highmem state as seen by the restore kernel.
*/
-
int restore_highmem(void)
{
struct highmem_pbe *pbe = highmem_pblist;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b871e..0acab9d7f96f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -266,16 +266,18 @@ static int suspend_test(int level)
*/
static int suspend_prepare(suspend_state_t state)
{
- int error;
+ int error, nr_calls = 0;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Finish;
+ }
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
+ __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
pm_restore_console();
return error;
}
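
The nr_calls bookkeeping above exists so that, on failure, only the notifier callbacks that already ran receive the matching POST event; restated as a condensed sketch (control flow abridged, names as in the hunk):

	int error, nr_calls = 0;

	/* Run the whole PREPARE chain (-1 == no limit), counting the callbacks. */
	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
	if (error)
		nr_calls--;	/* the callback that failed must not see POST */

	/* ... freezing and the actual suspend elided ... */

	/* Deliver POST only to the nr_calls callbacks that completed PREPARE. */
	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
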
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..a3b1e617bcdc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct hib_bio_batch *hb)
{
struct page *page = virt_to_page(addr);
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = hib_resume_bdev;
+ bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
bio->bi_end_io = hib_end_io;
bio->bi_private = hb;
atomic_inc(&hb->count);
- submit_bio(rw, bio);
+ submit_bio(bio);
} else {
- error = submit_bio_wait(rw, bio);
+ error = submit_bio_wait(bio);
bio_put(bio);
}
@@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block, swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
error = -ENODEV;
@@ -348,6 +350,12 @@ static int swsusp_swap_check(void)
if (res < 0)
blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ /*
+ * Update the resume device to the one actually used,
+ * so the test_resume mode can use it in case it is
+ * invoked from hibernate() to test the snapshot.
+ */
+ swsusp_resume_device = hib_resume_bdev->bd_dev;
return res;
}
@@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
} else {
src = buf;
}
- return hib_submit_io(WRITE_SYNC, offset, src, hb);
+ return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
+ tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(READ_SYNC, offset, buf, hb);
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1525,7 +1534,8 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_submit_io(READ_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1533,7 +1543,8 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
@@ -1577,10 +1588,12 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+ swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+ error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+ swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available())
return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
} else {
/*
* Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- }
+ } else
+ nr_calls--;
+
if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
}
if (error)
atomic_inc(&snapshot_device_available);
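The snapshot_open() change relies on the __pm_notifier_call_chain() variant reporting, via nr_calls, how many notifiers actually ran, so a failure partway through PREPARE can be unwound by re-notifying only those callbacks with the POST event. A minimal sketch of that call-then-unwind pattern as used here (same symbols as the hunk above; the comment wording is mine):

    int nr_calls = 0;
    int error;

    error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
    if (error) {
            /* the notifier that failed cleaned up after itself, so only
             * the nr_calls - 1 entries that succeeded get the POST event */
            __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
    }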
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
+ printk("%stask: %p task.stack: %p\n",
+ log_lvl, current, task_stack_page(current));
}
#endif
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d400434af6b2..6d86ab6ec2c9 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -253,7 +253,6 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
(rnp == rnp_root ||
ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
- !mutex_is_locked(&rsp->exp_mutex) &&
mutex_trylock(&rsp->exp_mutex))
goto fastpath;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51d7105f529a..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1937,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(!X->on_cpu)
*
* Example:
*
@@ -1948,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
@@ -1974,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
*
*/
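smp_cond_load_acquire(ptr, cond_expr) replaces the old smp_cond_acquire(): it spins loading *ptr, exposing the freshly loaded value as VAL inside cond_expr, and the load that finally satisfies the condition carries ACQUIRE ordering. A short sketch of the idiom as it appears in this series (illustrative, not additional kernel code):

    /* wait until the previous owner has fully scheduled out: spin until
     * p->on_cpu reads as 0; everything after this statement is then
     * ordered after the store-release of p->on_cpu */
    smp_cond_load_acquire(&p->on_cpu, !VAL);

    /* the same idiom with a mask, from kernel/smp.c later in this diff */
    smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));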
@@ -2047,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
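Taken together, the fork-path hunks above rearrange things so the child is not visible as runnable until wake_up_new_task(). A condensed view of the resulting ordering, assembled from the hunks rather than quoted verbatim:

    /*
     * sched_fork():
     *     p->state = TASK_NEW;                     // not wakeable yet
     *     __set_task_cpu(p, cpu);                  // first placement, no migrate hooks
     *     p->sched_class->task_fork(p);            // under p->pi_lock
     *
     * wake_up_new_task():
     *     p->state = TASK_RUNNING;
     *     __set_task_cpu(p, select_task_rq(...));  // fork balancing, still not attached
     *     post_init_entity_util_avg(&p->se);       // attach PELT state
     */

cpu_cgroup_can_attach(), further down, uses the TASK_NEW state to refuse moving a task that has not yet passed through wake_up_new_task(), since such a move would try to detach PELT state that was never attached.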
@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -4752,7 +4762,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -5394,13 +5405,15 @@ void idle_task_exit(void)
/*
* Since this CPU is going 'away' for a while, fold any nr_active delta
* we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
*
* Also see the comment "Global load-average calculations".
*/
static void calc_load_migrate(struct rq *rq)
{
- long delta = calc_load_fold_active(rq);
+ long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
}
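The adjust argument lets calc_load_migrate() exclude the hotplug teardown thread, which is itself runnable on the dying CPU and would otherwise be folded into the global load average. A small userspace model of the arithmetic, with made-up numbers:

    #include <stdio.h>

    /* toy model of calc_load_fold_active(): 'adjust' subtracts callers that
     * should not count, e.g. the teardown thread in calc_load_migrate() */
    static long fold_active(long nr_running, long nr_uninterruptible,
                            long calc_load_active, long adjust)
    {
            long nr_active = (nr_running - adjust) + nr_uninterruptible;

            return nr_active - calc_load_active;    /* delta to fold globally */
    }

    int main(void)
    {
            /* hypothetical dying CPU: only the teardown thread is runnable */
            printf("delta without adjust: %ld\n", fold_active(1, 0, 0, 0)); /* 1 */
            printf("delta with adjust:    %ld\n", fold_active(1, 0, 0, 1)); /* 0 */
            return 0;
    }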
@@ -7231,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
@@ -7711,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -7739,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7772,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8204,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8223,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if it's
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS,
};
-enum cpuacct_usage_index {
- CPUACCT_USAGE_USER, /* ... user mode */
- CPUACCT_USAGE_SYSTEM, /* ... kernel mode */
-
- CPUACCT_USAGE_NRUSAGE,
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
};
struct cpuacct_usage {
- u64 usages[CPUACCT_USAGE_NRUSAGE];
+ u64 usages[CPUACCT_STAT_NSTATS];
};
/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
/*
- * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+ * We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
- BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+ BUG_ON(index > CPUACCT_STAT_NSTATS);
#ifndef CONFIG_64BIT
/*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- if (index == CPUACCT_USAGE_NRUSAGE) {
+ if (index == CPUACCT_STAT_NSTATS) {
int i = 0;
data = 0;
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i];
} else {
data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/* return total cpu usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_USER);
+ return __cpuusage_read(css, CPUACCT_STAT_USER);
}
static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+ return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
}
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+ return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
}
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
}
static int __cpuacct_percpu_seq_show(struct seq_file *m,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
}
static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
}
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
}
-static const char * const cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ int index;
+ int cpu;
+
+ seq_puts(m, "cpu");
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %s", cpuacct_stat_desc[index]);
+ seq_puts(m, "\n");
+
+ for_each_possible_cpu(cpu) {
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+ seq_printf(m, "%d", cpu);
+
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit
+ * platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ seq_printf(m, " %llu", cpuusage->usages[index]);
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+ }
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
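The new cpuacct.usage_all file emits a header naming the per-stat columns, then one row per possible CPU with user and system time in nanoseconds. Reading it from a cpuacct cgroup produces output of this shape (the numbers are illustrative):

    cpu user system
    0 462102517 193027656
    1 501235882 170011234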
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
+ s64 val[CPUACCT_STAT_NSTATS];
int cpu;
- s64 val = 0;
+ int stat;
+ memset(val, 0, sizeof(val));
for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val = 0;
- for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
}
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+ for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+ seq_printf(sf, "%s %lld\n",
+ cpuacct_stat_desc[stat],
+ cputime64_to_clock_t(val[stat]));
+ }
return 0;
}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
.seq_show = cpuacct_percpu_sys_seq_show,
},
{
+ .name = "usage_all",
+ .seq_show = cpuacct_all_seq_show,
+ },
+ {
.name = "stat",
.seq_show = cpuacct_stats_show,
},
@@ -316,11 +348,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int index = CPUACCT_USAGE_SYSTEM;
+ int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk);
if (regs && user_mode(regs))
- index = CPUACCT_USAGE_USER;
+ index = CPUACCT_STAT_USER;
rcu_read_lock();
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 14c4aa25cc45..a84641b222c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,6 +47,8 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cached_raw_freq;
+
/* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
@@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @policy: cpufreq policy object to compute the new frequency for.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq = C * curr_freq * util_raw / max
*
* Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct cpufreq_policy *policy,
- unsigned long util, unsigned long max)
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+ unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
- return (freq + (freq >> 2)) * util / max;
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_cpu->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
}
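Worked example of the formula above: on a frequency-invariant system with cpuinfo.max_freq = 2,000,000 kHz and util/max = 0.6, the raw value is 1.25 * 2,000,000 * 0.6 = 1,500,000 kHz. cpufreq_driver_resolve_freq() then maps that onto the lowest driver-supported frequency at or above it (subject to policy min/max), and the raw 1,500,000 is remembered in cached_raw_freq so that an identical request on the next update can return sg_policy->next_freq without resolving again.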
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
return;
next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
- get_next_freq(policy, util, max);
+ get_next_freq(sg_cpu, util, max);
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
unsigned long util, unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
}
}
- return get_next_freq(policy, util, max);
+ return get_next_freq(sg_cpu, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_policy, util, max);
+ next_f = sugov_next_freq_shared(sg_cpu, util, max);
sugov_update_commit(sg_policy, time, next_f);
}
@@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy)
return ret;
}
-static int sugov_exit(struct cpufreq_policy *policy)
+static void sugov_exit(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
struct sugov_tunables *tunables = sg_policy->tunables;
@@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
- return 0;
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->util = ULONG_MAX;
sg_cpu->max = 0;
sg_cpu->last_update = 0;
+ sg_cpu->cached_raw_freq = 0;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_shared);
} else {
@@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy)
return 0;
}
-static int sugov_stop(struct cpufreq_policy *policy)
+static void sugov_stop(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
@@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy)
irq_work_sync(&sg_policy->irq_work);
cancel_work_sync(&sg_policy->work);
- return 0;
}
-static int sugov_limits(struct cpufreq_policy *policy)
+static void sugov_limits(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
if (!policy->fast_switch_enabled) {
mutex_lock(&sg_policy->work_lock);
-
- if (policy->max < policy->cur)
- __cpufreq_driver_target(policy, policy->max,
- CPUFREQ_RELATION_H);
- else if (policy->min > policy->cur)
- __cpufreq_driver_target(policy, policy->min,
- CPUFREQ_RELATION_L);
-
+ cpufreq_policy_apply_limits(policy);
mutex_unlock(&sg_policy->work_lock);
}
sg_policy->need_freq_update = true;
- return 0;
-}
-
-int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
-{
- if (event == CPUFREQ_GOV_POLICY_INIT) {
- return sugov_init(policy);
- } else if (policy->governor_data) {
- switch (event) {
- case CPUFREQ_GOV_POLICY_EXIT:
- return sugov_exit(policy);
- case CPUFREQ_GOV_START:
- return sugov_start(policy);
- case CPUFREQ_GOV_STOP:
- return sugov_stop(policy);
- case CPUFREQ_GOV_LIMITS:
- return sugov_limits(policy);
- }
- }
- return -EINVAL;
}
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
- .governor = sugov_governor,
.owner = THIS_MODULE,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
};
static int __init sugov_module_init(void)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..1934f658c036 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
*/
void irqtime_account_irq(struct task_struct *curr)
{
- unsigned long flags;
s64 delta;
int cpu;
if (!sched_clock_irqtime)
return;
- local_irq_save(flags);
-
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_softirq_time, delta);
irq_time_write_end();
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t irq_cputime;
local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
+ irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+ cpustat[CPUTIME_IRQ];
+ irq_cputime = min(irq_cputime, maxtime);
+ cpustat[CPUTIME_IRQ] += irq_cputime;
local_irq_restore(flags);
- return ret;
+ return irq_cputime;
}
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t softirq_cputime;
local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
+ softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+ cpustat[CPUTIME_SOFTIRQ];
+ softirq_cputime = min(softirq_cputime, maxtime);
+ cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
- return ret;
+ return softirq_cputime;
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
#define sched_clock_irqtime (0)
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+ return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+ return 0;
+}
+
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
-static __always_inline bool steal_account_process_tick(void)
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
+ cputime_t steal_cputime;
u64 steal;
- unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
- /*
- * steal is in nsecs but our caller is expecting steal
- * time in jiffies. Lets cast the result to jiffies
- * granularity and account the rest on the next rounds.
- */
- steal_jiffies = nsecs_to_jiffies(steal);
- this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+ steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+ account_steal_time(steal_cputime);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
- account_steal_time(jiffies_to_cputime(steal_jiffies));
- return steal_jiffies;
+ return steal_cputime;
}
#endif
- return false;
+ return 0;
+}
+
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+ cputime_t accounted;
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_account_hi_update(max - accounted);
+
+ if (accounted < max)
+ accounted += irqtime_account_si_update(max - accounted);
+
+ return accounted;
}
/*
@@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+ cputime_t scaled, other;
- if (steal_account_process_tick())
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, irq, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(cputime);
+ if (other >= cputime)
return;
+ cputime -= other;
+ scaled = cputime_to_scaled(cputime);
- cputime *= ticks;
- scaled *= ticks;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += cputime;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += cputime;
- } else if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
@@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev)
}
#endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
@@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
- }
-
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
- }
- vtime_account_system(tsk);
+ if (!in_interrupt() && is_idle_task(tsk))
+ vtime_account_idle(tsk);
+ else
+ vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
*ut = p->utime;
@@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t cputime, scaled, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
- if (steal_account_process_tick())
+ cputime = cputime_one_jiffy;
+ steal = steal_account_process_time(cputime);
+
+ if (steal >= cputime)
return;
+ cputime -= steal;
+ scaled = cputime_to_scaled(cputime);
+
if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_idle_time(cputime);
}
/*
@@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- unsigned long delta = now - tsk->vtime_snap;
+ cputime_t delta, other;
+ delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
- return jiffies_to_cputime(delta);
+ return delta - other;
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_end(&tsk->vtime_seqcount);
}
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- if (context_tracking_in_user())
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
-}
-
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c393a336..2a0a9995256d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
-#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations);
+#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8c5d2d48424..4088eedea763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = now;
+ return;
+ }
+ }
+
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
-#else
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
-#endif
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
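For reference, the cap computed at the top of post_init_entity_util_avg() is half of the cfs_rq's spare capacity: with SCHED_CAPACITY_SCALE = 1024 and cfs_rq->avg.util_avg already at 512, cap = (1024 - 512) / 2 = 256, so a freshly forked fair task starts with an extrapolated util_avg of at most 256 rather than instantly saturating the utilization estimate. Tasks that are not on the fair class now skip the attach entirely, as the new comment explains, so that a later switched_to_fair() finds the expected detached state.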
/*
* Update the current task's runtime statistics.
@@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
@@ -1477,7 +1492,6 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1503,16 +1517,9 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
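task_numa_compare() now samples dst_rq->curr under RCU instead of taking dst_rq->lock and pinning the task with get_task_struct()/put_task_struct(); a reference is only taken once task_numa_assign() actually records the task as env->best_task. A hedged sketch of the pattern (fragment, not standalone code):

    rcu_read_lock();
    cur = task_rcu_dereference(&dst_rq->curr);      /* may return NULL */
    if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
            cur = NULL;
    /* ... compare loads; task_numa_assign() does get_task_struct(cur)
     * only when 'cur' is kept as the new best task ... */
    rcu_read_unlock();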
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed utilization. It is expected
+ * that one calls update_tg_load_avg() on this condition, but after you've
+ * modified the cfs_rq avg (attach/detach), such that we propagate the new
+ * avg up.
+ */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
@@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
update_tg_load_avg(cfs_rq, 0);
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time.
+ *
+ * Or we're fresh through post_init_entity_util_avg().
*/
if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2998,6 +3029,14 @@ skip_aging:
cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
last_update_time = cfs_rq_last_update_time(cfs_rq);
@@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
static inline void update_load_avg(struct sched_entity *se, int not_used)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
@@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
-#endif
return 0;
}
@@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- /* Synchronize hierarchical throttle counter: */
- if (unlikely(!cfs_rq->throttle_uptodate)) {
- struct rq *rq = rq_of(cfs_rq);
- struct cfs_rq *pcfs_rq;
- struct task_group *tg;
-
- cfs_rq->throttle_uptodate = 1;
-
- /* Get closest up-to-date node, because leaves go first: */
- for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
- pcfs_rq = tg->cfs_rq[cpu_of(rq)];
- if (pcfs_rq->throttle_uptodate)
- break;
- }
- if (tg) {
- cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
- }
- }
-
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4476,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (!vruntime_normalized(p)) {
/*
@@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
}
/* Catch up with the cfs_rq and remove our load when we leave */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif
/* Synchronize task with its cfs_rq */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
-
- raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
- raw_spin_unlock_irq(&rq->lock);
}
return 1;
@@ -8576,6 +8632,23 @@ err:
return 0;
}
+void online_fair_sched_group(struct task_group *tg)
+{
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ sync_throttle(tg, i);
+ raw_spin_unlock_irq(&rq->lock);
+ }
+}
+
void unregister_fair_sched_group(struct task_group *tg)
{
unsigned long flags;
@@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
+void online_fair_sched_group(struct task_group *tg) { }
+
void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
};
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c5aeedf4e93a..9fb873cfc75c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -201,6 +201,8 @@ exit_idle:
*/
static void cpu_idle_loop(void)
{
+ int cpu = smp_processor_id();
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..a2d6eb71f06b 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
-long calc_load_fold_active(struct rq *this_rq)
+long calc_load_fold_active(struct rq *this_rq, long adjust)
{
long nr_active, delta = 0;
- nr_active = this_rq->nr_running;
+ nr_active = this_rq->nr_running - adjust;
nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
@@ -188,7 +188,7 @@ void calc_load_enter_idle(void)
* We're going into NOHZ mode, if there's any pending delta, fold it
* into the pending idle delta.
*/
- delta = calc_load_fold_active(this_rq);
+ delta = calc_load_fold_active(this_rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq)
if (time_before(jiffies, this_rq->calc_load_update))
return;
- delta = calc_load_fold_active(this_rq);
+ delta = calc_load_fold_active(this_rq, 0);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7cbeb92a1cb9..c64fc5114004 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,7 +28,7 @@ extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
-extern long calc_load_fold_active(struct rq *this_rq);
+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -437,7 +438,7 @@ struct cfs_rq {
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
- int throttled, throttle_count, throttle_uptodate;
+ int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1246,8 +1247,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group) (struct task_struct *p, int type);
#endif
};
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
- rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- rq->prev_steal_time_rq = 0;
-#endif
-}
diff --git a/kernel/smp.c b/kernel/smp.c
index 7180491c9678..3aa642d39c03 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -92,7 +92,7 @@ void __init call_function_init(void)
*/
static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
- smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
static __always_inline void csd_lock(struct call_single_data *csd)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 53fa971d000d..6ab4842b00e8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -108,7 +108,6 @@ void task_work_run(void)
* fail, but it can play with *work and other entries.
*/
raw_spin_unlock_wait(&task->pi_lock);
- smp_mb();
do {
next = work->next;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a9b76a40319e..2c5bc77c0bb0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)
#endif
#ifdef CONFIG_SYSFS
-struct bus_type clockevents_subsys = {
+static struct bus_type clockevents_subsys = {
.name = "clockevents",
.dev_name = "clockevent",
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1cafba860b08..39008d78927a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
timer->it.cpu.expires = 0;
sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
&itp->it_value);
+ return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
unlock_task_sighand(p, &flags);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2ec7c00228f3..204fdc86863d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -908,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
ktime_t now, expires;
int cpu = smp_processor_id();
- now = tick_nohz_start_idle(ts);
-
if (can_stop_idle_tick(cpu, ts)) {
int was_stopped = ts->tick_stopped;
+ now = tick_nohz_start_idle(ts);
ts->idle_calls++;
expires = tick_nohz_stop_sched_tick(ts, now, cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a196e08324e7..3b65746c7f15 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2188,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void)
return now;
}
+EXPORT_SYMBOL(get_monotonic_coarse64);
/*
* Must hold jiffies_lock
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803bd0..f4b86e8ca1e7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
+ select TRACING
default n
help
Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e90d..fb345cd11883 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
static void trace_note_time(struct blk_trace *bt)
{
- struct timespec now;
+ struct timespec64 now;
unsigned long flags;
u32 words[2];
- getnstimeofday(&now);
- words[0] = now.tv_sec;
+ /* need to check user space to see if this breaks in y2038 or y2106 */
+ ktime_get_real_ts64(&now);
+ words[0] = (u32)now.tv_sec;
words[1] = now.tv_nsec;
local_irq_save(flags);
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
#define BLK_TC_RAHEAD BLK_TC_AHEAD
+#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
+ int op, int op_flags, u32 what, int error, int pdu_len,
+ void *pdu_data)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
- what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, SYNC);
- what |= MASK_TC_BIT(rw, RAHEAD);
- what |= MASK_TC_BIT(rw, META);
- what |= MASK_TC_BIT(rw, DISCARD);
- what |= MASK_TC_BIT(rw, FLUSH);
- what |= MASK_TC_BIT(rw, FUA);
+ what |= ddir_act[op_is_write(op) ? WRITE : READ];
+ what |= MASK_TC_BIT(op_flags, SYNC);
+ what |= MASK_TC_BIT(op_flags, RAHEAD);
+ what |= MASK_TC_BIT(op_flags, META);
+ what |= MASK_TC_BIT(op_flags, PREFLUSH);
+ what |= MASK_TC_BIT(op_flags, FUA);
+ if (op == REQ_OP_DISCARD)
+ what |= BLK_TC_ACT(BLK_TC_DISCARD);
+ if (op == REQ_OP_FLUSH)
+ what |= BLK_TC_ACT(BLK_TC_FLUSH);
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+ __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, what, error, 0, NULL);
+ bio_op(bio), bio->bi_rw, what, error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
+ NULL);
}
}
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL);
}
}
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
}
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- bio->bi_error, sizeof(rpdu), &rpdu);
+ bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+ BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+ &rpdu);
}
}
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r);
}
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
sizeof(r), &r);
}
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,
return;
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)
}
}
-void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
{
int i = 0;
- if (rw & REQ_FLUSH)
+ if (rw & REQ_PREFLUSH)
rwbs[i++] = 'F';
- if (rw & WRITE)
+ switch (op) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
- else if (rw & REQ_DISCARD)
+ break;
+ case REQ_OP_DISCARD:
+ rwbs[i++] = 'D';
+ break;
+ case REQ_OP_SECURE_ERASE:
rwbs[i++] = 'D';
- else if (bytes)
+ rwbs[i++] = 'E';
+ break;
+ case REQ_OP_FLUSH:
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_READ:
rwbs[i++] = 'R';
- else
+ break;
+ default:
rwbs[i++] = 'N';
+ }
if (rw & REQ_FUA)
rwbs[i++] = 'F';
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i++] = 'S';
if (rw & REQ_META)
rwbs[i++] = 'M';
- if (rw & REQ_SECURE)
- rwbs[i++] = 'E';
rwbs[i] = '\0';
}
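For reference, a minimal sketch of what the reworked blk_fill_rwbs() emits for two representative requests; the caller and buffer are illustrative, not part of this patch:

	char rwbs[8];

	/* op selects 'W'; the remaining flag checks append 'S' and 'M' -> "WSM" */
	blk_fill_rwbs(rwbs, REQ_OP_WRITE, REQ_SYNC | REQ_META, 4096);

	/* secure erase is now keyed off the op: 'D' plus 'E' -> "DE", where the
	 * 'E' used to come from the removed REQ_SECURE flag check */
	blk_fill_rwbs(rwbs, REQ_OP_SECURE_ERASE, 0, 0);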
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 26f603da7e26..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *src = (void *) (long) r2;
+ int size = (int) r3;
+
+ /*
+ * Ensure we're in user context which is safe for the helper to
+ * run. This helper has no business in a kthread.
+ *
+ * access_ok() should prevent writing to non-user memory, but in
+ * some situations (nommu, temporary switch, etc) access_ok() does
+ * not provide enough validation, hence the check on KERNEL_DS.
+ */
+
+ if (unlikely(in_interrupt() ||
+ current->flags & (PF_KTHREAD | PF_EXITING)))
+ return -EPERM;
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ return -EPERM;
+ if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+ return -EPERM;
+
+ return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+ .func = bpf_probe_write_user,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+ pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+ current->comm, task_pid_nr(current));
+
+ return &bpf_probe_write_user_proto;
+}
+
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
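A hedged sketch of the program side of the new helper; the destination pointer and the surrounding kprobe program are assumptions, only the call and its failure behaviour follow from the code above:

	/* inside a kprobe BPF program: dst_uaddr is a user-space address,
	 * e.g. taken from the probed function's arguments (assumed here) */
	u32 val = 1;

	if (bpf_probe_write_user(dst_uaddr, &val, sizeof(val)))
		return 0;	/* -EPERM from irq/kthread/KERNEL_DS context,
				 * or the write itself faulted */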
@@ -188,31 +231,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
{
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
+ u64 index = flags & BPF_F_INDEX_MASK;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (index == BPF_F_CURRENT_CPU)
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
- /* make sure event is local and doesn't have pmu::count */
- if (event->oncpu != smp_processor_id() ||
- event->pmu->count)
- return -EINVAL;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
event->attr.type != PERF_TYPE_RAW))
return -EINVAL;
+ /* make sure event is local and doesn't have pmu::count */
+ if (unlikely(event->oncpu != cpu || event->pmu->count))
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
@@ -229,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+ u64 flags, struct perf_raw_record *raw)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
- struct perf_raw_record raw = {
- .size = size,
- .data = data,
- };
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
if (index == BPF_F_CURRENT_CPU)
- index = raw_smp_processor_id();
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
- if (unlikely(event->oncpu != smp_processor_id()))
+ if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = &raw;
+ sample_data.raw = raw;
perf_event_output(event, &sample_data, regs);
return 0;
}
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *)(long) r1;
+ struct bpf_map *map = (struct bpf_map *)(long) r2;
+ void *data = (void *)(long) r4;
+ struct perf_raw_record raw = {
+ .frag = {
+ .size = size,
+ .data = data,
+ },
+ };
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
static const struct bpf_func_proto bpf_perf_event_output_proto = {
.func = bpf_perf_event_output,
.gpl_only = true,
@@ -283,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+ struct perf_raw_frag frag = {
+ .copy = ctx_copy,
+ .size = ctx_size,
+ .data = ctx,
+ };
+ struct perf_raw_record raw = {
+ .frag = {
+ {
+ .next = ctx_size ? &frag : NULL,
+ },
+ .size = meta_size,
+ .data = meta,
+ },
+ };
perf_fetch_caller_regs(regs);
- return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return (long) current;
}
-static const struct bpf_func_proto bpf_event_output_proto = {
- .func = bpf_event_output,
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+ .func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
};
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
-{
- return &bpf_event_output_proto;
-}
-
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -325,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -335,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_probe_write_user:
+ return bpf_get_probe_write_proto();
default:
return NULL;
}
@@ -356,18 +428,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
enum bpf_reg_type *reg_type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
-
- /* only read is allowed */
if (type != BPF_READ)
return false;
-
- /* disallow misaligned access */
if (off % size != 0)
return false;
-
return true;
}
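With the flags-based interface, a program can read the counter bound to the local CPU without computing the array index itself; a hedged sketch, where the perf event array map "counters" is assumed to be created and populated from user space:

	u64 cnt = bpf_perf_event_read(&counters, BPF_F_CURRENT_CPU);

	if ((s64)cnt < 0)
		return 0;	/* -EINVAL, -ENOENT or -E2BIG as returned above */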
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1efff2..84752c8e28b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-/* List for set_ftrace_pid's pids. */
-LIST_HEAD(ftrace_pids);
-struct ftrace_pid {
- struct list_head list;
- struct pid *pid;
-};
-
-static bool ftrace_pids_enabled(void)
+static bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
- return !list_empty(&ftrace_pids);
+ struct trace_array *tr;
+
+ if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
+ return false;
+
+ tr = ops->private;
+
+ return tr->function_pids != NULL;
}
static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
- if (!test_tsk_trace_trace(current))
+ struct trace_array *tr = op->private;
+
+ if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
return;
op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
- if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+ if (ftrace_pids_enabled(ops))
ops->func = ftrace_pid_func;
ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
static void ftrace_update_pid_func(void)
{
- bool enabled = ftrace_pids_enabled();
struct ftrace_ops *op;
/* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (op->flags & FTRACE_OPS_FL_PID) {
- op->func = enabled ? ftrace_pid_func :
- op->saved_func;
+ op->func = ftrace_pids_enabled(op) ?
+ ftrace_pid_func : op->saved_func;
ftrace_update_trampoline(op);
}
} while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
return ops->func;
}
-static void clear_ftrace_swapper(void)
+static void
+ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
{
- struct task_struct *p;
- int cpu;
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- clear_tsk_trace_trace(p);
- }
- put_online_cpus();
-}
-
-static void set_ftrace_swapper(void)
-{
- struct task_struct *p;
- int cpu;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- get_online_cpus();
- for_each_online_cpu(cpu) {
- p = idle_task(cpu);
- set_tsk_trace_trace(p);
- }
- put_online_cpus();
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, next));
}
-static void clear_ftrace_pid(struct pid *pid)
+static void clear_ftrace_pids(struct trace_array *tr)
{
- struct task_struct *p;
+ struct trace_pid_list *pid_list;
+ int cpu;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- clear_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+ if (!pid_list)
+ return;
- put_pid(pid);
-}
+ unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
-static void set_ftrace_pid(struct pid *pid)
-{
- struct task_struct *p;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
- rcu_read_lock();
- do_each_pid_task(pid, PIDTYPE_PID, p) {
- set_tsk_trace_trace(p);
- } while_each_pid_task(pid, PIDTYPE_PID, p);
- rcu_read_unlock();
-}
+ rcu_assign_pointer(tr->function_pids, NULL);
-static void clear_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- clear_ftrace_swapper();
- else
- clear_ftrace_pid(pid);
-}
+ /* Wait till all users are no longer using pid filtering */
+ synchronize_sched();
-static void set_ftrace_pid_task(struct pid *pid)
-{
- if (pid == ftrace_swapper_pid)
- set_ftrace_swapper();
- else
- set_ftrace_pid(pid);
+ trace_free_pid_list(pid_list);
}
-static int ftrace_pid_add(int p)
+static void ftrace_pid_reset(struct trace_array *tr)
{
- struct pid *pid;
- struct ftrace_pid *fpid;
- int ret = -EINVAL;
-
mutex_lock(&ftrace_lock);
-
- if (!p)
- pid = ftrace_swapper_pid;
- else
- pid = find_get_pid(p);
-
- if (!pid)
- goto out;
-
- ret = 0;
-
- list_for_each_entry(fpid, &ftrace_pids, list)
- if (fpid->pid == pid)
- goto out_put;
-
- ret = -ENOMEM;
-
- fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
- if (!fpid)
- goto out_put;
-
- list_add(&fpid->list, &ftrace_pids);
- fpid->pid = pid;
-
- set_ftrace_pid_task(pid);
+ clear_ftrace_pids(tr);
ftrace_update_pid_func();
-
ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
- return 0;
-
-out_put:
- if (pid != ftrace_swapper_pid)
- put_pid(pid);
-
-out:
- mutex_unlock(&ftrace_lock);
- return ret;
}
-static void ftrace_pid_reset(void)
-{
- struct ftrace_pid *fpid, *safe;
-
- mutex_lock(&ftrace_lock);
- list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
- struct pid *pid = fpid->pid;
-
- clear_ftrace_pid_task(pid);
-
- list_del(&fpid->list);
- kfree(fpid);
- }
-
- ftrace_update_pid_func();
- ftrace_startup_all(0);
-
- mutex_unlock(&ftrace_lock);
-}
+/* Greater than any max PID */
+#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1)
static void *fpid_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
mutex_lock(&ftrace_lock);
+ rcu_read_lock_sched();
- if (!ftrace_pids_enabled() && (!*pos))
- return (void *) 1;
+ pid_list = rcu_dereference_sched(tr->function_pids);
- return seq_list_start(&ftrace_pids, *pos);
+ if (!pid_list)
+ return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+ return trace_pid_start(pid_list, pos);
}
static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
{
- if (v == (void *)1)
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
+
+ if (v == FTRACE_NO_PIDS)
return NULL;
- return seq_list_next(v, &ftrace_pids, pos);
+ return trace_pid_next(pid_list, v, pos);
}
static void fpid_stop(struct seq_file *m, void *p)
+ __releases(RCU)
{
+ rcu_read_unlock_sched();
mutex_unlock(&ftrace_lock);
}
static int fpid_show(struct seq_file *m, void *v)
{
- const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
-
- if (v == (void *)1) {
+ if (v == FTRACE_NO_PIDS) {
seq_puts(m, "no pid\n");
return 0;
}
- if (fpid->pid == ftrace_swapper_pid)
- seq_puts(m, "swapper tasks\n");
- else
- seq_printf(m, "%u\n", pid_vnr(fpid->pid));
-
- return 0;
+ return trace_pid_show(m, v);
}
static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
static int
ftrace_pid_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_pid_reset();
+ ftrace_pid_reset(tr);
- if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &ftrace_pid_sops);
+ ret = seq_open(file, &ftrace_pid_sops);
+ if (ret < 0) {
+ trace_array_put(tr);
+ } else {
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = tr;
+ }
return ret;
}
+static void ignore_task_cpu(void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /*
+ * This function is called by on_each_cpu() while the
+ * event_mutex is held.
+ */
+ pid_list = rcu_dereference_protected(tr->function_pids,
+ mutex_is_locked(&ftrace_lock));
+
+ this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+ trace_ignore_this_task(pid_list, current));
+}
+
static ssize_t
ftrace_pid_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[64], *tmp;
- long val;
- int ret;
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *pid_list;
+ ssize_t ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
+ if (!cnt)
+ return 0;
+
+ mutex_lock(&ftrace_lock);
+
+ filtered_pids = rcu_dereference_protected(tr->function_pids,
+ lockdep_is_held(&ftrace_lock));
+
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
+ goto out;
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+ rcu_assign_pointer(tr->function_pids, pid_list);
- buf[cnt] = 0;
+ if (filtered_pids) {
+ synchronize_sched();
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
+ /* Register a probe to set whether to ignore the tracing of a task */
+ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+ }
/*
- * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
- * to clean the filter quietly.
+ * Ignoring of pids is done at task switch. But we have to
+ * check for those tasks that are currently running.
+ * Always do this in case a pid was appended or removed.
*/
- tmp = strstrip(buf);
- if (strlen(tmp) == 0)
- return 1;
+ on_each_cpu(ignore_task_cpu, tr, 1);
- ret = kstrtol(tmp, 10, &val);
- if (ret < 0)
- return ret;
+ ftrace_update_pid_func();
+ ftrace_startup_all(0);
+ out:
+ mutex_unlock(&ftrace_lock);
- ret = ftrace_pid_add(val);
+ if (ret > 0)
+ *ppos += ret;
- return ret ? ret : cnt;
+ return ret;
}
static int
ftrace_pid_release(struct inode *inode, struct file *file)
{
- if (file->f_mode & FMODE_READ)
- seq_release(inode, file);
+ struct trace_array *tr = inode->i_private;
- return 0;
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
}
static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_tracefs(void)
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct dentry *d_tracer;
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ tr, &ftrace_pid_fops);
+}
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer))
- return 0;
+void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer)
+{
+ /* Only the top level directory has the dyn_tracefs and profile */
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
ftrace_init_dyn_tracefs(d_tracer);
-
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
- NULL, &ftrace_pid_fops);
-
ftrace_profile_tracefs(d_tracer);
-
- return 0;
}
-fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
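Since ftrace_init_tracefs() is now called from init_tracer_tracefs() (see the trace.c hunk below), every tracing instance gets its own set_ftrace_pid file, e.g. (hypothetical path) instances/foo/set_ftrace_pid, and its pid filter lives in that instance's trace_array rather than in the old global ftrace_pids list.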
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..dade4c9559cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
return 0;
}
+void trace_free_pid_list(struct trace_pid_list *pid_list)
+{
+ vfree(pid_list->pids);
+ kfree(pid_list);
+}
+
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+ /*
+ * If pid_max changed after filtered_pids was created, we
+ * by default ignore all pids greater than the previous pid_max.
+ */
+ if (search_pid >= filtered_pids->pid_max)
+ return false;
+
+ return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ return !trace_find_filtered_pid(filtered_pids, task->pid);
+}
+
+/**
+ * trace_filter_add_remove_task - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task)
+{
+ if (!pid_list)
+ return;
+
+ /* For forks, we only add if the forking task is listed */
+ if (self) {
+ if (!trace_find_filtered_pid(pid_list, self->pid))
+ return;
+ }
+
+ /* Sorry, but we don't support pid_max changing after setting */
+ if (task->pid >= pid_list->pid_max)
+ return;
+
+ /* "self" is set for forks, and NULL for exits */
+ if (self)
+ set_bit(task->pid, pid_list->pids);
+ else
+ clear_bit(task->pid, pid_list->pids);
+}
+
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (one above the actual pid, so zero can be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+ unsigned long pid = (unsigned long)v;
+
+ (*pos)++;
+
+ /* pid already is +1 of the actual previous bit */
+ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+ /* Return pid + 1 to allow zero to be represented */
+ if (pid < pid_list->pid_max)
+ return (void *)(pid + 1);
+
+ return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+ unsigned long pid;
+ loff_t l = 0;
+
+ pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+ if (pid >= pid_list->pid_max)
+ return NULL;
+
+ /* Return pid + 1 so that zero can be the exit value */
+ for (pid++; pid && l < *pos;
+ pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+ ;
+ return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+ unsigned long pid = (unsigned long)v - 1;
+
+ seq_printf(m, "%lu\n", pid);
+ return 0;
+}
+
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE 127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_parser parser;
+ unsigned long val;
+ int nr_pids = 0;
+ ssize_t read = 0;
+ ssize_t ret = 0;
+ loff_t pos;
+ pid_t pid;
+
+ if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ /*
+ * Always recreate a new array. The write is an all or nothing
+ * operation. Always create a new array when adding new pids by
+ * the user. If the operation fails, then the current list is
+ * not modified.
+ */
+ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return -ENOMEM;
+
+ pid_list->pid_max = READ_ONCE(pid_max);
+
+ /* Only truncating will shrink pid_max */
+ if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+ pid_list->pid_max = filtered_pids->pid_max;
+
+ pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+ if (!pid_list->pids) {
+ kfree(pid_list);
+ return -ENOMEM;
+ }
+
+ if (filtered_pids) {
+ /* copy the current bits to the new max */
+ for_each_set_bit(pid, filtered_pids->pids,
+ filtered_pids->pid_max) {
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+ }
+ }
+
+ while (cnt > 0) {
+
+ pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &pos);
+ if (ret < 0 || !trace_parser_loaded(&parser))
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+ if (val >= pid_list->pid_max)
+ break;
+
+ pid = (pid_t)val;
+
+ set_bit(pid, pid_list->pids);
+ nr_pids++;
+
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ trace_free_pid_list(pid_list);
+ return ret;
+ }
+
+ if (!nr_pids) {
+ /* Cleared the list of pids */
+ trace_free_pid_list(pid_list);
+ read = ret;
+ pid_list = NULL;
+ }
+
+ *new_pid_list = pid_list;
+
+ return read;
+}
+
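A hedged sketch of the calling convention, mirroring what ftrace_pid_write() and ftrace_event_pid_write() do with it: the currently published list goes in, a freshly built list comes back, and the caller republishes it under its own lock:

	/* tr, ubuf and cnt as in the callers; event_mutex is held */
	struct trace_pid_list *old, *new;
	ssize_t ret;

	old = rcu_dereference_protected(tr->filtered_pids,
					lockdep_is_held(&event_mutex));
	ret = trace_pid_write(old, &new, ubuf, cnt);
	if (ret >= 0)
		rcu_assign_pointer(tr->filtered_pids, new);
	/* the old list is then freed with trace_free_pid_list() after
	 * synchronize_sched(), as the callers below do */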
static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
+ /*
+ * If regs is not set, then skip the following callers:
+ * trace_buffer_unlock_commit_regs
+ * event_trigger_unlock_commit
+ * trace_event_buffer_commit
+ * trace_event_raw_event_sched_switch
+ * Note, we can still get here via blktrace, wakeup tracer
+ * and mmiotrace, but that's ok if they lose a function or
+ * two. They are not that meaningful.
+ */
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
+ * Add two, for this function and the call to save_stack_trace()
+ * If regs is set, then these functions will not be in the way.
+ */
+ if (!regs)
+ trace.skip += 2;
+
+ /*
* Since events can happen in NMIs there's no safe way to
* use the per cpu ftrace_stacks. We reserve it and if an interrupt
* or NMI comes in, it will just have to use the default
@@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
/* created for use with alloc_percpu */
struct trace_buffer_struct {
- char buffer[TRACE_BUF_SIZE];
+ int nesting;
+ char buffer[4][TRACE_BUF_SIZE];
};
static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
/*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * This allows for lockless recording. If we're nested too deeply, then
+ * this returns NULL.
*/
static char *get_trace_buf(void)
{
- struct trace_buffer_struct *percpu_buffer;
-
- /*
- * If we have allocated per cpu buffers, then we do not
- * need to do any locking.
- */
- if (in_nmi())
- percpu_buffer = trace_percpu_nmi_buffer;
- else if (in_irq())
- percpu_buffer = trace_percpu_irq_buffer;
- else if (in_softirq())
- percpu_buffer = trace_percpu_sirq_buffer;
- else
- percpu_buffer = trace_percpu_buffer;
+ struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!percpu_buffer)
+ if (!buffer || buffer->nesting >= 4)
return NULL;
- return this_cpu_ptr(&percpu_buffer->buffer[0]);
+ return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+ this_cpu_dec(trace_percpu_buffer->nesting);
}
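Every successful get_trace_buf() now has to be paired with put_trace_buf(); a minimal sketch of the pairing that the trace_vbprintk() and __trace_array_vprintk() changes below implement:

	char *tbuf = get_trace_buf();

	if (tbuf) {
		/* format at most TRACE_BUF_SIZE bytes into tbuf ... */
		put_trace_buf();
	}
	/* NULL means no buffers, or already nested four printks deep */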
static int alloc_percpu_trace_buffer(void)
{
struct trace_buffer_struct *buffers;
- struct trace_buffer_struct *sirq_buffers;
- struct trace_buffer_struct *irq_buffers;
- struct trace_buffer_struct *nmi_buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
- if (!buffers)
- goto err_warn;
-
- sirq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!sirq_buffers)
- goto err_sirq;
-
- irq_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!irq_buffers)
- goto err_irq;
-
- nmi_buffers = alloc_percpu(struct trace_buffer_struct);
- if (!nmi_buffers)
- goto err_nmi;
+ if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+ return -ENOMEM;
trace_percpu_buffer = buffers;
- trace_percpu_sirq_buffer = sirq_buffers;
- trace_percpu_irq_buffer = irq_buffers;
- trace_percpu_nmi_buffer = nmi_buffers;
-
return 0;
-
- err_nmi:
- free_percpu(irq_buffers);
- err_irq:
- free_percpu(sirq_buffers);
- err_sirq:
- free_percpu(buffers);
- err_warn:
- WARN(1, "Could not allocate percpu trace_printk buffer");
- return -ENOMEM;
}
static int buffers_allocated;
@@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
}
out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
tbuffer = get_trace_buf();
if (!tbuffer) {
len = 0;
- goto out;
+ goto out_nobuffer;
}
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
- out:
+
+out:
+ put_trace_buf();
+
+out_nobuffer:
preempt_enable_notrace();
unpause_graph_tracing();
@@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
for_each_tracing_cpu(cpu)
tracing_init_tracefs_percpu(tr, cpu);
+ ftrace_init_tracefs(tr, d_tracer);
}
static struct vfsmount *trace_automount(void *ingore)
@@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void)
return 0;
init_tracer_tracefs(&global_trace, d_tracer);
+ ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d6b7..f783df416726 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
+ filter) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter) __packed
+
#include "trace_entries.h"
/*
@@ -156,6 +162,9 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
bool ignore_pid;
+#ifdef CONFIG_FUNCTION_TRACER
+ bool ftrace_ignore_pid;
+#endif
};
struct tracer;
@@ -247,6 +256,7 @@ struct trace_array {
int ref;
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
+ struct trace_pid_list __rcu *function_pids;
/* function tracing enabled */
int function_enabled;
#endif
@@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
+/* PID filtering */
+
+extern int pid_max;
+
+bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
+ pid_t search_pid);
+bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+ struct task_struct *task);
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+ struct task_struct *self,
+ struct task_struct *task);
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
+int trace_pid_show(struct seq_file *m, void *v);
+void trace_free_pid_list(struct trace_pid_list *pid_list);
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+ struct trace_pid_list **new_pid_list,
+ const char __user *ubuf, size_t cnt);
+
#ifdef CONFIG_TRACER_MAX_TRACE
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
@@ -821,12 +850,9 @@ extern struct list_head ftrace_pids;
#ifdef CONFIG_FUNCTION_TRACER
extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
- if (list_empty(&ftrace_pids))
- return 1;
-
- return test_tsk_trace_trace(task);
+ return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
}
extern int ftrace_is_dead(void);
int ftrace_create_function_files(struct trace_array *tr,
@@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
int using_ftrace_ops_list_func(void);
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+void ftrace_init_tracefs_toplevel(struct trace_array *tr,
+ struct dentry *d_tracer);
#else
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
{
return 1;
}
@@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
static inline __init void
ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
/* ftace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
@@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
#include "trace_entries.h"
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..5c30efcda5e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
);
/* Function return entry */
-FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892a1e..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/sort.h>
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
+ /*
+ * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+ * preemption (adding one to the preempt_count). Since we are
+ * interested in the preempt_count at the time the tracepoint was
+ * hit, we need to subtract one to offset the increment.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ fbuffer->pc--;
fbuffer->trace_file = trace_file;
fbuffer->event =
@@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr)
mutex_unlock(&event_mutex);
}
-/* Shouldn't this be in a header? */
-extern int pid_max;
-
-/* Returns true if found in filter */
-static bool
-find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
- /*
- * If pid_max changed after filtered_pids was created, we
- * by default ignore all pids greater than the previous pid_max.
- */
- if (search_pid >= filtered_pids->pid_max)
- return false;
-
- return test_bit(search_pid, filtered_pids->pids);
-}
-
-static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
-{
- /*
- * Return false, because if filtered_pids does not exist,
- * all pids are good to trace.
- */
- if (!filtered_pids)
- return false;
-
- return !find_filtered_pid(filtered_pids, task->pid);
-}
-
-static void filter_add_remove_task(struct trace_pid_list *pid_list,
- struct task_struct *self,
- struct task_struct *task)
-{
- if (!pid_list)
- return;
-
- /* For forks, we only add if the forking task is listed */
- if (self) {
- if (!find_filtered_pid(pid_list, self->pid))
- return;
- }
-
- /* Sorry, but we don't support pid_max changing after setting */
- if (task->pid >= pid_list->pid_max)
- return;
-
- /* "self" is set for forks, and NULL for exits */
- if (self)
- set_bit(task->pid, pid_list->pids);
- else
- clear_bit(task->pid, pid_list->pids);
-}
-
static void
event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
{
@@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
struct trace_array *tr = data;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- filter_add_remove_task(pid_list, NULL, task);
+ trace_filter_add_remove_task(pid_list, NULL, task);
}
static void
@@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data,
struct trace_array *tr = data;
pid_list = rcu_dereference_sched(tr->filtered_pids);
- filter_add_remove_task(pid_list, self, task);
+ trace_filter_add_remove_task(pid_list, self, task);
}
void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, prev) &&
- ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, prev) &&
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, next));
+ trace_ignore_this_task(pid_list, next));
}
static void
@@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
pid_list = rcu_dereference_sched(tr->filtered_pids);
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, task));
+ trace_ignore_this_task(pid_list, task));
}
static void
@@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
/* Set tracing if current is enabled */
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
/* Wait till all users are no longer using pid filtering */
synchronize_sched();
- vfree(pid_list->pids);
- kfree(pid_list);
+ trace_free_pid_list(pid_list);
}
static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
{
struct trace_array *tr = m->private;
struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
- unsigned long pid = (unsigned long)v;
-
- (*pos)++;
-
- /* pid already is +1 of the actual prevous bit */
- pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
- /* Return pid + 1 to allow zero to be represented */
- if (pid < pid_list->pid_max)
- return (void *)(pid + 1);
-
- return NULL;
+ return trace_pid_next(pid_list, v, pos);
}
static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
{
struct trace_pid_list *pid_list;
struct trace_array *tr = m->private;
- unsigned long pid;
- loff_t l = 0;
/*
* Grab the mutex, to keep calls to p_next() having the same
@@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
if (!pid_list)
return NULL;
- pid = find_first_bit(pid_list->pids, pid_list->pid_max);
- if (pid >= pid_list->pid_max)
- return NULL;
-
- /* Return pid + 1 so that zero can be the exit value */
- for (pid++; pid && l < *pos;
- pid = (unsigned long)p_next(m, (void *)pid, &l))
- ;
- return (void *)pid;
+ return trace_pid_start(pid_list, pos);
}
static void p_stop(struct seq_file *m, void *p)
@@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
-static int p_show(struct seq_file *m, void *v)
-{
- unsigned long pid = (unsigned long)v - 1;
-
- seq_printf(m, "%lu\n", pid);
- return 0;
-}
-
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data)
mutex_is_locked(&event_mutex));
this_cpu_write(tr->trace_buffer.data->ignore_pid,
- ignore_this_task(pid_list, current));
+ trace_ignore_this_task(pid_list, current));
}
static ssize_t
@@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
struct trace_pid_list *filtered_pids = NULL;
struct trace_pid_list *pid_list;
struct trace_event_file *file;
- struct trace_parser parser;
- unsigned long val;
- loff_t this_pos;
- ssize_t read = 0;
- ssize_t ret = 0;
- pid_t pid;
- int nr_pids = 0;
+ ssize_t ret;
if (!cnt)
return 0;
@@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
- return -ENOMEM;
-
mutex_lock(&event_mutex);
+
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
- /*
- * Always recreate a new array. The write is an all or nothing
- * operation. Always create a new array when adding new pids by
- * the user. If the operation fails, then the current list is
- * not modified.
- */
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list) {
- read = -ENOMEM;
- goto out;
- }
- pid_list->pid_max = READ_ONCE(pid_max);
- /* Only truncating will shrink pid_max */
- if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
- pid_list->pid_max = filtered_pids->pid_max;
- pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
- if (!pid_list->pids) {
- kfree(pid_list);
- read = -ENOMEM;
- goto out;
- }
- if (filtered_pids) {
- /* copy the current bits to the new max */
- pid = find_first_bit(filtered_pids->pids,
- filtered_pids->pid_max);
- while (pid < filtered_pids->pid_max) {
- set_bit(pid, pid_list->pids);
- pid = find_next_bit(filtered_pids->pids,
- filtered_pids->pid_max,
- pid + 1);
- nr_pids++;
- }
- }
-
- while (cnt > 0) {
-
- this_pos = 0;
-
- ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
- if (ret < 0 || !trace_parser_loaded(&parser))
- break;
-
- read += ret;
- ubuf += ret;
- cnt -= ret;
-
- parser.buffer[parser.idx] = 0;
-
- ret = -EINVAL;
- if (kstrtoul(parser.buffer, 0, &val))
- break;
- if (val >= pid_list->pid_max)
- break;
-
- pid = (pid_t)val;
-
- set_bit(pid, pid_list->pids);
- nr_pids++;
-
- trace_parser_clear(&parser);
- ret = 0;
- }
- trace_parser_put(&parser);
-
- if (ret < 0) {
- vfree(pid_list->pids);
- kfree(pid_list);
- read = ret;
+ ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+ if (ret < 0)
goto out;
- }
- if (!nr_pids) {
- /* Cleared the list of pids */
- vfree(pid_list->pids);
- kfree(pid_list);
- read = ret;
- if (!filtered_pids)
- goto out;
- pid_list = NULL;
- }
rcu_assign_pointer(tr->filtered_pids, pid_list);
list_for_each_entry(file, &tr->events, list) {
@@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
synchronize_sched();
-
- vfree(filtered_pids->pids);
- kfree(filtered_pids);
- } else {
+ trace_free_pid_list(filtered_pids);
+ } else if (pid_list) {
/*
* Register a probe that is called before all other probes
* to set ignore_pid if next or prev do not match.
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
out:
mutex_unlock(&event_mutex);
- ret = read;
- if (read > 0)
- *ppos += read;
+ if (ret > 0)
+ *ppos += ret;
return ret;
}
@@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {
static const struct seq_operations show_set_pid_seq_ops = {
.start = p_start,
.next = p_next,
- .show = p_show,
+ .show = trace_pid_show,
.stop = p_stop,
};
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
/* Currently only the non stack version is supported */
ops->func = function_trace_call;
- ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
tr->ops = ops;
ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..7363ccf79512 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
int cpu;
int pc;
- if (!ftrace_trace_task(current))
+ if (!ftrace_trace_task(tr))
return 0;
/* trace it when it is-nested-in or is a function enabled. */
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (ftrace_graph_notrace_addr(trace->func))
return 1;
+ /*
+ * Stop here if tracing_threshold is set. We only write function return
+ * events to the ring buffer.
+ */
+ if (tracing_thresh)
+ return 1;
+
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
-{
- if (tracing_thresh)
- return 1;
- else
- return trace_graph_entry(trace);
-}
-
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
set_graph_array(tr);
if (tracing_thresh)
ret = register_ftrace_graph(&trace_graph_thresh_return,
- &trace_graph_thresh_entry);
+ &trace_graph_entry);
else
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..9aedb0b06683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
* $retval : fetch return value
* $stack : fetch stack address
* $stackN : fetch Nth of stack (N:0-)
+ * $comm : fetch current task comm
* @ADDR : fetch memory at ADDR (ADDR should be in kernel)
* @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
* %REG : fetch register REG
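As a usage illustration (probe name and path are hypothetical), the new fetch argument lets a probe record the current task name directly, e.g. echo 'p:myopen do_sys_open comm=$comm' >> /sys/kernel/debug/tracing/kprobe_events; $comm defaults to the string type, as enforced by the trace_probe.c changes below.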
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
dev->bus->number, dev->devfn,
dev->vendor, dev->device, dev->irq);
- /*
- * XXX: is pci_resource_to_user() appropriate, since we are
- * supposed to interpret the __ioremap() phys_addr argument based on
- * these printed values?
- */
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
- pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ start = dev->resource[i].start;
+ end = dev->resource[i].end;
trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..74e80a582c28 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
kfree(data);
}
+void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ long ret;
+
+ if (!maxlen)
+ return;
+
+ ret = strlcpy(dst, current->comm, maxlen);
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
+
+void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ *(u32 *)dest = strlen(current->comm) + 1;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
+
static const struct fetch_type *find_fetch_type(const char *type,
const struct fetch_type *ftbl)
{
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
ret = -EINVAL;
+ } else if (strcmp(arg, "comm") == 0) {
+ if (strcmp(t->name, "string") != 0 &&
+ strcmp(t->name, "string_size") != 0)
+ return -EINVAL;
+ f->fn = t->fetch[FETCH_MTD_comm];
} else
ret = -EINVAL;
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
arg[t - parg->comm] = '\0';
t++;
}
+ /*
+ * The default type of $comm should be "string", and it can't be
+ * dereferenced.
+ */
+ if (!t && strcmp(arg, "$comm") == 0)
+ t = "string";
parg->type = find_fetch_type(t, ftbl);
if (!parg->type) {
pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..45400ca5ded1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
FETCH_MTD_reg = 0,
FETCH_MTD_stack,
FETCH_MTD_retval,
+ FETCH_MTD_comm,
FETCH_MTD_memory,
FETCH_MTD_symbol,
FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
+/* comm only makes sense as a string */
+#define fetch_comm_u8 NULL
+#define fetch_comm_u16 NULL
+#define fetch_comm_u32 NULL
+#define fetch_comm_u64 NULL
+DECLARE_FETCH_FUNC(comm, string);
+DECLARE_FETCH_FUNC(comm, string_size);
+
/*
* Define macro for basic types - we don't need to define s* types, because
* we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
ASSIGN_FETCH_FUNC(reg, ftype), \
ASSIGN_FETCH_FUNC(stack, ftype), \
ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(comm, ftype), \
ASSIGN_FETCH_FUNC(memory, ftype), \
ASSIGN_FETCH_FUNC(symbol, ftype), \
ASSIGN_FETCH_FUNC(deref, ftype), \
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c9dd5fbdbf33..ef071ca73fc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
/**
* show_workqueue_state - dump workqueue state
*
- * Called from a sysrq handler and prints out all busy workqueues and
- * pools.
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
*/
void show_workqueue_state(void)
{
@@ -4600,15 +4600,11 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
return;
- /* is @cpu the only online CPU? */
cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
- if (cpumask_weight(&cpumask) != 1)
- return;
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}
int workqueue_prepare_cpu(unsigned int cpu)