diff options
| author | Tony Lindgren <[email protected]> | 2016-03-30 10:36:06 -0700 |
|---|---|---|
| committer | Tony Lindgren <[email protected]> | 2016-03-30 10:36:06 -0700 |
| commit | 1809de7e7d37c585e01a1bcc583ea92b78fc759d (patch) | |
| tree | 76c5b35c2b04eafce86a1a729c02ab705eba44bc /kernel | |
| parent | ebf24414809200915b9ddf7f109bba7c278c8210 (diff) | |
| parent | 3ca4a238106dedc285193ee47f494a6584b6fd2f (diff) | |
Merge tag 'for-v4.6-rc/omap-fixes-a' of git://git.kernel.org/pub/scm/linux/kernel/git/pjw/omap-pending into omap-for-v4.6/fixes
ARM: OMAP2+: first hwmod fix for v4.6-rc
Fix a longstanding bug in the hwmod code that could cause
hardware SYSCONFIG register values to not match the kernel's
idea of what they should be, and that could result in lower
performance during IP block idle entry.
Basic build, boot, and PM test logs are available here:
http://www.pwsan.com/omap/testlogs/omap-hwmod-fixes-a-for-v4.6-rc/20160326231727/
Diffstat (limited to 'kernel')
139 files changed, 8247 insertions, 3645 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 53abf008ecb3..f0c40bf49d9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,11 +14,21 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER -# Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +# Do not trace internal ftrace files CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -69,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/audit.c b/kernel/audit.c index 3a3e5deeda8d..678c3f000191 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -809,6 +809,16 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } +static int audit_replace(pid_t pid) +{ + struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, + &pid, sizeof(pid)); + + if (!skb) + return -ENOMEM; + return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -870,9 +880,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (s.mask & AUDIT_STATUS_PID) { int new_pid = s.pid; + pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EACCES; + } + if (audit_pid && new_pid && + audit_replace(requesting_pid) != -ECONNREFUSED) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + return -EEXIST; + } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; @@ -920,7 +938,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_current(); + err = tty_audit_push(); if (err) break; } @@ -1030,20 +1048,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk = current; + unsigned int t; - spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty; - s.log_passwd = tsk->signal->audit_tty_log_passwd; - spin_unlock(&tsk->sighand->siglock); + t = READ_ONCE(current->signal->audit_tty); + s.enabled = t & AUDIT_TTY_ENABLE; + s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; - struct task_struct *tsk = current; struct audit_buffer *ab; + unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ @@ -1053,14 +1070,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; - spin_lock(&tsk->sighand->siglock); - old.enabled = tsk->signal->audit_tty; - old.log_passwd = tsk->signal->audit_tty_log_passwd; - if (!err) { - tsk->signal->audit_tty = s.enabled; - tsk->signal->audit_tty_log_passwd = s.log_passwd; + if (err) + t = READ_ONCE(current->signal->audit_tty); + else { + t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); + t = xchg(¤t->signal->audit_tty, t); } - spin_unlock(&tsk->sighand->siglock); + old.enabled = t & AUDIT_TTY_ENABLE; + old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 9f194aad0adc..3cf1c5978d39 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -185,7 +185,7 @@ static struct audit_watch *audit_init_watch(char *path) return watch; } -/* Translate a watch string to kernel respresentation. */ +/* Translate a watch string to kernel representation. */ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b8ff9e193753..94ca7b1e5e7e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -158,7 +158,7 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } -/* Translate an inode field to kernel respresentation. */ +/* Translate an inode field to kernel representation. */ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { @@ -415,7 +415,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return 0; } -/* Translate struct audit_rule_data to kernel's rule respresentation. */ +/* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { @@ -593,7 +593,7 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule_data. */ +/* Translate kernel rule representation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..7d0e3cf8abe1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, is_compat_task(), - KSTK_EIP(current), code); + signr, syscall_get_arch(), syscall, + in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,4 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced518..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,15 +17,43 @@ #include <linux/filter.h> #include <linux/perf_event.h> +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); + return 0; +} + +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); return 0; } @@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); @@ -291,10 +435,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +451,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 972d9a8e4ac4..be0abf669ced 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> +#include <linux/frame.h> #include <asm/unaligned.h> @@ -649,6 +650,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5b30fd8a315..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/vmalloc.h> +#include "percpu_freelist.h" struct bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,26 +33,108 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; struct rcu_head rcu; u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; + + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -77,24 +163,39 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries >= - U32_MAX - PAGE_SIZE) + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), @@ -111,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -148,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -166,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -226,86 +340,248 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) +{ + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + kfree(l); + +} + +static void htab_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); + } else { + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu, bool onallcpus) +{ + u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); + struct htab_elem *l_new; + void __percpu *pptr; + + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } + } + + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; - struct bucket *b; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); - l_new->hash = htab_map_hash(l_new->key, key_size); - b = __select_bucket(htab, l_new->hash); + b = __select_bucket(htab, hash); head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); goto err; } - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } + ret = 0; +err: raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} - return 0; +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + + /* per-cpu hash map can update value in-place */ + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true, onallcpus); + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + } + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -331,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - kfree_rcu(l, rcu); + free_htab_elem(htab, l); ret = 0; } @@ -351,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - kfree(l); + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -369,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } @@ -391,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..50da680c479f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) if (!task) return -EINVAL; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); return 0; } diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include <linux/spinlock.h> +#include <linux/percpu.h> + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..499d9e933f8e --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,290 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> +#include <linux/stacktrace.h> +#include <linux/perf_event.h> +#include "percpu_freelist.h" + +struct stack_map_bucket { + struct pcpu_freelist_node fnode; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; + u32 n_buckets; + struct stack_map_bucket *buckets[]; +}; + +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->map_flags) + return ERR_PTR(-EINVAL); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > PERF_MAX_STACK_DEPTH) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + + return &smap->map; + +put_buffers: + put_callchain_buffers(); +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ + u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = READ_ONCE(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; + + if (unlikely(id >= smap->n_buckets)) + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + + /* wait for bpf programs to complete before freeing stack map */ + synchronize_rcu(); + + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 637397059f76..2a2efe1bc76c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include <linux/filter.h> #include <linux/version.h> +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -151,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -229,6 +244,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -239,6 +259,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +280,37 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +333,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +354,37 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -371,9 +423,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f57de9..2e08f8e9b771 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -246,6 +246,7 @@ static const struct { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, + {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, }; static void print_verifier_state(struct verifier_env *env) @@ -778,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -829,15 +839,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking. + */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -867,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, (*mapp)->key_size, + false); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -878,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, (*mapp)->value_size, + false); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -890,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed); } return err; @@ -911,8 +934,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_func && bool_map != bool_func) + if (bool_func && bool_map != bool_func) { + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); return -EINVAL; + } } return 0; @@ -2082,7 +2108,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640ef6da..671dc05c0b0f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -58,6 +58,10 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/cpuset.h> +#include <linux/proc_ns.h> +#include <linux/nsproxy.h> +#include <linux/proc_ns.h> #include <net/sock.h> /* @@ -177,10 +181,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_root_visible; +static bool cgrp_dfl_visible; + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static unsigned long cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_inhibit_ss_mask; + +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ @@ -204,23 +214,34 @@ static u64 css_serial_nr_next = 1; * fork/exit handlers to call. This avoids us having to do extra work in the * fork/exit path to check which subsystems have fork/exit callbacks. */ -static unsigned long have_fork_callback __read_mostly; -static unsigned long have_exit_callback __read_mostly; -static unsigned long have_free_callback __read_mostly; +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; + +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; /* Ditto for the can_fork callback. */ -static unsigned long have_canfork_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, @@ -237,9 +258,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -338,6 +367,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -377,16 +432,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!ss) return &cgrp->self; - if (!(cgrp->root->subsys_mask & (1 << ss->id))) - return NULL; - /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->child_subsys_mask. + * can't test the csses directly. Test ss_mask. */ - while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } return cgroup_css(cgrp, ss); } @@ -505,22 +559,28 @@ static int notify_on_release(const struct cgroup *cgrp) (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /** - * for_each_subsys_which - filter for_each_subsys with a bitmask + * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_maskp: a pointer to the bitmask + * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of - * mask is set to 1. + * @ss_mask is set. */ -#define for_each_subsys_which(ss, ssid, ss_maskp) \ - if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ - else \ - for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ - if (((ss) = cgroup_subsys[ssid]) && false) \ - break; \ - else + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -534,6 +594,24 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + static void cgroup_release_agent(struct work_struct *work); static void check_for_release(struct cgroup *cgrp); @@ -664,6 +742,9 @@ static void css_set_move_task(struct task_struct *task, { lockdep_assert_held(&css_set_lock); + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + if (from_cset) { struct css_task_iter *it, *pos; @@ -697,8 +778,6 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - if (!css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); @@ -1101,13 +1180,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1247,46 +1326,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } /** - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask - * @cgrp: the target cgroup + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied to @cgrp. The returned mask is always - * a superset of @subtree_control and follows the usual hierarchy rules. + * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { - struct cgroup *parent = cgroup_parent(cgrp); - unsigned long cur_ss_mask = subtree_control; + u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) - return cur_ss_mask; + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - unsigned long new_ss_mask = cur_ss_mask; + u16 new_ss_mask = cur_ss_mask; - for_each_subsys_which(ss, ssid, &cur_ss_mask) + do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ - if (parent) - new_ss_mask &= parent->child_subsys_mask; - else - new_ss_mask &= cgrp->root->subsys_mask; + new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; @@ -1297,19 +1370,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask - * @cgrp: the target cgroup - * - * Update @cgrp->child_subsys_mask according to the current - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). - */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) -{ - cgrp->child_subsys_mask = - cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); -} - -/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1337,19 +1397,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1368,7 +1431,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) return NULL; kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -1398,14 +1464,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css - * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void css_clear_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static void css_clear_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts; + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } @@ -1413,17 +1482,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css, /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css - * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. */ -static int css_populate_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static int css_populate_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + return 0; + if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_dfl_base_files; @@ -1440,6 +1510,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, goto err; } } + + css->flags |= CSS_VISIBLE; + return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -1450,67 +1523,30 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned long tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); - for_each_subsys_which(ss, ssid, &ss_mask) { - /* if @ss has non-root csses attached to it, can't move */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + do_each_subsys_mask(ss, ssid, ss_mask) { + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. + */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; - } - - /* skip creating root files on dfl_root for inhibited subsystems */ - tmp_ss_mask = ss_mask; - if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - struct cgroup *scgrp = &ss->root->cgrp; - int tssid; - - ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); - if (!ret) - continue; + } while_each_subsys_mask(); - /* - * Rebinding back to the default root is not allowed to - * fail. Using both default and non-default roots should - * be rare. Moving subsystems back and forth even more so. - * Just warn about it and continue. - */ - if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); - } - continue; - } - - for_each_subsys_which(ss, tssid, &tmp_ss_mask) { - if (tssid == ssid) - break; - css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } - return ret; - } - - /* - * Nothing can fail from this point on. Remove files for the - * removed subsystems and rebind each subsystem. - */ - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); @@ -1518,8 +1554,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, WARN_ON(!css || cgroup_css(dcgrp, ss)); - css_clear_dir(css, NULL); + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; @@ -1531,23 +1571,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root, &dcgrp->e_csets[ss->id]); spin_unlock_bh(&css_set_lock); - src_root->subsys_mask &= ~(1 << ssid); - scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(scgrp); - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + if (ss->bind) ss->bind(css); - } + } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; @@ -1583,7 +1623,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; + u16 subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1596,13 +1636,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = -1UL; + u16 mask = U16_MAX; struct cgroup_subsys *ss; int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS - mask = ~(1U << cpuset_cgrp_id); + mask = ~((u16)1 << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1677,6 +1717,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; + if (cgroup_ssid_no_v1(i)) + continue; /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) @@ -1697,7 +1739,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) opts->subsys_mask |= (1 << i); /* @@ -1727,14 +1769,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + u16 added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); return -EINVAL; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1767,7 +1809,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgrp_dfl_root, removed_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1875,7 +1917,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1898,10 +1940,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. The worst that can happen is that we - * have some link structures left over + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; @@ -1918,7 +1961,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = css_populate_dir(&root_cgrp->self, NULL); + ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; @@ -1971,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -1979,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int i; bool new_sb; + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. @@ -1989,15 +2041,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (is_v2) { if (data) { pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); return ERR_PTR(-EINVAL); } - cgrp_dfl_root_visible = true; + cgrp_dfl_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); goto out_mount; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -2094,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * We know this subsystem has not yet been bound. Users in a non-init + * user namespace may only mount hierarchies with no bound subsystems, + * i.e. 'none,name=user1' + */ + if (!opts.none && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_unlock; + } + root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { ret = -ENOMEM; @@ -2112,12 +2175,37 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) + if (ret) { + put_cgroup_ns(ns); return ERR_PTR(ret); + } out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. + */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2130,6 +2218,7 @@ out_mount: deactivate_super(pinned_sb); } + put_cgroup_ns(ns); return dentry; } @@ -2158,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; +static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + int ret; + + ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; +} + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + char *ret; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2193,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path(cgrp, buf, buflen); + path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ if (strlcpy(buf, "/", buflen) < buflen) @@ -2337,38 +2457,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset to a cgroup + * cgroup_taskset_migrate - migrate a taskset * @tset: taget taskset - * @dst_cgrp: destination cgroup + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the - * ->can_attach callbacks fails and guarantees that either all or none of - * the tasks in @tset are migrated. @tset is consumed regardless of - * success. + * Migrate tasks in @tset as setup by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. */ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup *dst_cgrp) + struct cgroup_root *root) { - struct cgroup_subsys_state *css, *failed_css = NULL; + struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; - int i, ret; + int ssid, failed_ssid, ret; /* methods shouldn't be called if no task is actually migrating */ if (list_empty(&tset->src_csets)) return 0; /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->can_attach) { - tset->ssid = i; - ret = css->ss->can_attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); if (ret) { - failed_css = css; + failed_ssid = ssid; goto out_cancel_attach; } } - } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2395,25 +2515,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->attach) { - tset->ssid = i; - css->ss->attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); } - } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - for_each_e_css(css, i, dst_cgrp) { - if (css == failed_css) + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) break; - if (css->ss->cancel_attach) { - tset->ssid = i; - css->ss->cancel_attach(tset); + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); } - } + } while_each_subsys_mask(); out_release_tset: spin_lock_bh(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2426,6 +2546,20 @@ out_release_tset: } /** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. + */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + +/** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets * @@ -2441,6 +2575,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); @@ -2473,58 +2608,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * - * Tasks are about to be moved to @dst_cgrp and all the source css_sets - * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each - * source css_set is assumed to be its cgroup on the default hierarchy. + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); - /* - * Except for the root, child_subsys_mask must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->child_subsys_mask) - return -EBUSY; - /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, - dst_cgrp ?: src_cset->dfl_cgrp); + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; @@ -2537,6 +2670,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); @@ -2562,11 +2696,11 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @cgrp: the destination cgroup + * @root: cgroup root migration is taking place on * - * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The - * caller is also responsible for invoking cgroup_migrate_add_src() and + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * @@ -2577,7 +2711,7 @@ err: * actually starting migrating. */ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup *cgrp) + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2598,7 +2732,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_bh(&css_set_lock); - return cgroup_taskset_migrate(&tset, cgrp); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2616,6 +2750,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + /* look up all src csets */ spin_lock_bh(&css_set_lock); rcu_read_lock(); @@ -2630,9 +2767,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2695,7 +2832,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -2739,6 +2876,7 @@ out_unlock_rcu: out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -2792,7 +2930,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); @@ -2820,38 +2958,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; - } + } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } -/* show controllers which are currently attached to the default hierarchy */ -static int cgroup_root_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_root_inhibit_ss_mask); - return 0; -} - /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } @@ -2868,16 +2996,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) - * css associations need to be updated accordingly. This function looks up - * all css_sets which are attached to the subtree, creates the matching - * updated css_sets and migrates the tasks to the new ones. + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; struct css_set *src_cset; int ret; @@ -2887,21 +3016,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) /* look up all csses currently attached to @cgrp's subtree */ spin_lock_bh(&css_set_lock); - css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; - /* self is not affected by child_subsys_mask change */ - if (css->cgroup == cgrp) - continue; - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, cgrp, + list_for_each_entry(link, &dsct->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; @@ -2919,20 +3044,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_bh(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } +/** + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses + * @cgrp: root of the target subtree + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. + */ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + +restart: + mutex_lock(&cgroup_mutex); + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css || !percpu_ref_is_dying(&css->refcnt)) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + + cgroup_put(dsct); + goto restart; + } + } +} + +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_save_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->old_subtree_control = dsct->subtree_control; + dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + } +} + +/** + * cgroup_propagate_control - refresh control masks of a subtree + * @cgrp: root of the target subtree + * + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches + * ->subtree_control and propagate controller availability through the + * subtree so that descendants don't have unavailable controllers enabled. + */ +static void cgroup_propagate_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->subtree_control &= cgroup_control(dsct); + dsct->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(dsct->subtree_control, + cgroup_ss_mask(dsct)); + } +} + +/** + * cgroup_restore_control - restore control masks of a subtree + * @cgrp: root of the target subtree + * + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_restore_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + dsct->subtree_control = dsct->old_subtree_control; + dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + } +} + +static bool css_visible(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + if (cgroup_control(cgrp) & (1 << ss->id)) + return true; + if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) + return false; + return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; +} + +/** + * cgroup_apply_control_enable - enable or show csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and create new csses or make the existing ones + * visible. A css is created invisible if it's being implicitly enabled + * through dependency. An invisible css is made visible when the userland + * explicitly enables it. + * + * Returns 0 on success, -errno on failure. On failure, csses which have + * been processed already aren't cleaned up. The caller is responsible for + * cleaning up with cgroup_apply_control_disble(). + */ +static int cgroup_apply_control_enable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid, ret; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) + continue; + + if (!css) { + css = css_create(dsct, ss); + if (IS_ERR(css)) + return PTR_ERR(css); + } + + if (css_visible(css)) { + ret = css_populate_dir(css); + if (ret) + return ret; + } + } + } + + return 0; +} + +/** + * cgroup_apply_control_disable - kill or hide csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and kill and hide csses so that they match + * cgroup_ss_mask() and cgroup_visible_mask(). + * + * A css is hidden when the userland requests it to be disabled while other + * subsystems are still depending on it. The css must not actively control + * resources and be in the vanilla state if it's made visible again later. + * Controllers which may be depended upon should provide ->css_reset() for + * this purpose. + */ +static void cgroup_apply_control_disable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!css) + continue; + + if (css->parent && + !(cgroup_ss_mask(dsct) & (1 << ss->id))) { + kill_css(css); + } else if (!css_visible(css)) { + css_clear_dir(css); + if (ss->css_reset) + ss->css_reset(css); + } + } + } +} + +/** + * cgroup_apply_control - apply control mask updates to the subtree + * @cgrp: root of the target subtree + * + * subsystems can be enabled and disabled in a subtree using the following + * steps. + * + * 1. Call cgroup_save_control() to stash the current state. + * 2. Update ->subtree_control masks in the subtree as desired. + * 3. Call cgroup_apply_control() to apply the changes. + * 4. Optionally perform other related operations. + * 5. Call cgroup_finalize_control() to finish up. + * + * This function implements step 3 and propagates the mask changes + * throughout @cgrp's subtree, updates csses accordingly and perform + * process migrations. + */ +static int cgroup_apply_control(struct cgroup *cgrp) +{ + int ret; + + cgroup_propagate_control(cgrp); + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + return ret; + + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + return ret; + + return 0; +} + +/** + * cgroup_finalize_control - finalize control mask update + * @cgrp: root of the target subtree + * @ret: the result of the update + * + * Finalize control mask update. See cgroup_apply_control() for more info. + */ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret) +{ + if (ret) { + cgroup_restore_control(cgrp); + cgroup_propagate_control(cgrp); + } + + cgroup_apply_control_disable(cgrp); +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned long enable = 0, disable = 0; - unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2944,11 +3321,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { - unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; - if (tok[0] == '\0') continue; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -2963,12 +3338,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } break; - } + } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; @@ -2979,10 +3354,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, continue; } - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } @@ -3016,150 +3388,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Update subsys masks and calculate what needs to be done. More - * subsystems than specified may need to be enabled or disabled - * depending on subsystem dependencies. - */ - old_sc = cgrp->subtree_control; - old_ss = cgrp->child_subsys_mask; - new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - - css_enable = ~old_ss & new_ss; - css_disable = old_ss & ~new_ss; - enable |= css_enable; - disable |= css_disable; - - /* - * Because css offlining is asynchronous, userland might try to - * re-enable the same controller while the previous instance is - * still around. In such cases, wait till it's gone using - * offline_waitq. - */ - for_each_subsys_which(ss, ssid, &css_enable) { - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } - } - - cgrp->subtree_control = new_sc; - cgrp->child_subsys_mask = new_ss; - - /* - * Create new csses or make the existing ones visible. A css is - * created invisible if it's being implicitly enabled through - * dependency. An invisible css is made visible when the userland - * explicitly enables it. - */ - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); - cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) - ret = create_css(child, ss, - cgrp->subtree_control & (1 << ssid)); - else - ret = css_populate_dir(cgroup_css(child, ss), - NULL); - if (ret) - goto err_undo_css; - } - } + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. - */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - goto err_undo_css; + ret = cgroup_apply_control(cgrp); - /* - * All tasks are migrated out of disabled csses. Kill or hide - * them. A css is hidden when the userland requests it to be - * disabled while other subsystems are still depending on it. The - * css must not actively control resources and be in the vanilla - * state if it's made visible again later. Controllers which may - * be depended upon should provide ->css_reset() for this purpose. - */ - for_each_subsys(ss, ssid) { - if (!(disable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (css_disable & (1 << ssid)) { - kill_css(css); - } else { - css_clear_dir(css, NULL); - if (ss->css_reset) - ss->css_reset(css); - } - } - } - - /* - * The effective csses of all the descendants (excluding @cgrp) may - * have changed. Subsystems can optionally subscribe to this event - * by implementing ->css_e_css_changed() which is invoked if any of - * the effective csses seen from the css's cgroup may have changed. - */ - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); - struct cgroup_subsys_state *css; - - if (!ss->css_e_css_changed || !this_css) - continue; - - css_for_each_descendant_pre(css, this_css) - if (css != this_css) - ss->css_e_css_changed(css); - } + cgroup_finalize_control(cgrp, ret); kernfs_activate(cgrp->kn); ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; - -err_undo_css: - cgrp->subtree_control = old_sc; - cgrp->child_subsys_mask = old_ss; - - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (!css) - continue; - - if (css_enable & (1 << ssid)) - kill_css(css); - else - css_clear_dir(css, NULL); - } - } - goto out_unlock; } static int cgroup_events_show(struct seq_file *seq, void *v) @@ -3357,7 +3600,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, bool is_add) { struct cftype *cft, *cft_end = NULL; - int ret; + int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -3386,7 +3629,7 @@ restart: cgroup_rm_file(cgrp, cft); } } - return 0; + return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) @@ -3403,7 +3646,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; - if (cgroup_is_dead(cgrp)) + if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); @@ -4024,6 +4267,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ @@ -4032,7 +4278,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); spin_unlock_bh(&css_set_lock); - ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_err; @@ -4048,7 +4294,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to); + ret = cgroup_migrate(task, false, to->root); put_task_struct(task); } } while (task && !ret); @@ -4555,12 +4801,6 @@ static struct cftype cgroup_dfl_base_files[] = { }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_root_controllers_show, - }, - { - .name = "cgroup.controllers", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { @@ -4655,14 +4895,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4728,7 +4969,9 @@ static void css_release_work_fn(struct work_struct *work) * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); } mutex_unlock(&cgroup_mutex); @@ -4758,6 +5001,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4780,6 +5024,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -4794,6 +5042,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); @@ -4804,17 +5055,16 @@ static void offline_css(struct cgroup_subsys_state *css) } /** - * create_css - create a cgroup_subsys_state + * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css - * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created if - * @visible. Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible) +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4825,7 +5075,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css = ss->css_alloc(parent_css); if (IS_ERR(css)) - return PTR_ERR(css); + return css; init_and_link_css(css, ss, cgrp); @@ -4838,12 +5088,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, goto err_free_percpu_ref; css->id = err; - if (visible) { - err = css_populate_dir(css, NULL); - if (err) - goto err_free_id; - } - /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -4861,47 +5105,30 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, ss->warned_broken_hierarchy = true; } - return 0; + return css; err_list_del: list_del_rcu(&css->sibling); - css_clear_dir(css, NULL); -err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); - return err; + return ERR_PTR(err); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static struct cgroup *cgroup_create(struct cgroup *parent) { - struct cgroup *parent, *cgrp, *tcgrp; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct kernfs_node *kn; - int level, ssid, ret; - - /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. - */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn); - if (!parent) - return -ENODEV; - root = parent->root; - level = parent->level + 1; + struct cgroup_root *root = parent->root; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp) + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); - if (!cgrp) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cgrp) + return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -4932,20 +5159,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_free_id; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. - */ - kernfs_get(kn); - cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ @@ -4959,51 +5172,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - ret = cgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; + /* + * On the default hierarchy, a child doesn't automatically inherit + * subtree_control from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->subtree_control = cgroup_control(cgrp); + + cgroup_propagate_control(cgrp); - ret = css_populate_dir(&cgrp->self, NULL); + /* @cgrp doesn't have dir yet so the following will only create csses */ + ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; - /* let's create and online css's */ - for_each_subsys(ss, ssid) { - if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) - goto out_destroy; - } + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +out_destroy: + cgroup_destroy_locked(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct kernfs_node *kn; + int ret; + + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn, false); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; } + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + /* - * On the default hierarchy, a child doesn't automatically inherit - * subtree_control from the parent. Each is configured manually. + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. */ - if (!cgroup_on_dfl(cgrp)) { - cgrp->subtree_control = parent->subtree_control; - cgroup_refresh_child_subsys_mask(cgrp); - } + kernfs_get(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = css_populate_dir(&cgrp->self); + if (ret) + goto out_destroy; + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; + /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; -out_free_id: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); +out_destroy: + cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; - -out_destroy: - cgroup_destroy_locked(cgrp); - goto out_unlock; } /* @@ -5017,10 +5269,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -5029,8 +5286,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** @@ -5050,7 +5309,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - css_clear_dir(css, NULL); + css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive @@ -5099,6 +5358,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); @@ -5119,11 +5379,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; + spin_lock_bh(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_bh(&css_set_lock); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -5147,7 +5414,7 @@ static int cgroup_rmdir(struct kernfs_node *kn) struct cgroup *cgrp; int ret = 0; - cgrp = cgroup_kn_lock_live(kn); + cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; @@ -5237,7 +5504,7 @@ int __init cgroup_init_early(void) for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, @@ -5254,7 +5521,7 @@ int __init cgroup_init_early(void) return 0; } -static unsigned long cgroup_disable_mask __initdata; +static u16 cgroup_disable_mask __initdata; /** * cgroup_init - cgroup initialization @@ -5265,18 +5532,23 @@ static unsigned long cgroup_disable_mask __initdata; int __init cgroup_init(void) { struct cgroup_subsys *ss; - unsigned long key; int ssid; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); + /* + * Add init_css_set to the hash table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); @@ -5309,10 +5581,16 @@ int __init cgroup_init(void) continue; } + if (cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (!ss->dfl_cftypes) - cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); @@ -5325,6 +5603,11 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); @@ -5383,7 +5666,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -5409,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); + path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); if (!path) { retval = -ENAMETOOLONG; goto out_unlock; @@ -5498,11 +5782,11 @@ int cgroup_can_fork(struct task_struct *child) struct cgroup_subsys *ss; int i, j, ret; - for_each_subsys_which(ss, i, &have_canfork_callback) { + do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child); if (ret) goto out_revert; - } + } while_each_subsys_mask(); return 0; @@ -5587,8 +5871,9 @@ void cgroup_post_fork(struct task_struct *child) * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ - for_each_subsys_which(ss, i, &have_fork_callback) + do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); + } while_each_subsys_mask(); } /** @@ -5631,8 +5916,9 @@ void cgroup_exit(struct task_struct *tsk) } /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) + do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); + } while_each_subsys_mask(); } void cgroup_free(struct task_struct *task) @@ -5641,8 +5927,9 @@ void cgroup_free(struct task_struct *task) struct cgroup_subsys *ss; int ssid; - for_each_subsys_which(ss, ssid, &have_free_callback) + do_each_subsys_mask(ss, ssid, have_free_callback) { ss->free(task); + } while_each_subsys_mask(); put_css_set(cset); } @@ -5691,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_lock_bh(&css_set_lock); + path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_bh(&css_set_lock); if (!path) goto out; @@ -5735,6 +6024,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5748,12 +6064,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); @@ -5875,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cset = task_css_set(current); + get_css_set(cset); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cpu.c b/kernel/cpu.c index 5b9d39633ce9..6ea42e8da861 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,13 +22,88 @@ #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/smpboot.h> + #include <trace/events/power.h> +#define CREATE_TRACE_POINTS +#include <trace/events/cpuhp.h> #include "smpboot.h" +/** + * cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @cb_stat: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; + bool cant_stop; +}; + +static DEFINE_MUTEX(cpuhp_state_mutex); +static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; + +/** + * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when @@ -207,31 +282,281 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); } -#ifdef CONFIG_HOTPLUG_CPU +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int notify_starting(unsigned int cpu) +{ + cpu_notify(CPU_STARTING, cpu); + return 0; +} + +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + +static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + ret = bringup_wait_for_ap(cpu); + BUG_ON(!cpu_online(cpu)); + return ret; +} + +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + +/* + * The cpu hotplug threads manage the bringup and teardown of the cpus + */ +static void cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else { + /* Cannot happen .... */ + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} -static void cpu_notify_nofail(unsigned long val, void *v) +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) { - BUG_ON(cpu_notify(val, v)); + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; } + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) +{ + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} + +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) +{ + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); +} + +#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -311,57 +636,60 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} /* Take this CPU down. */ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int err; /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -378,6 +706,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) else synchronize_rcu(); + /* Park the smpboot threads */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); /* @@ -389,12 +719,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); - goto out_release; + return err; } BUG_ON(cpu_online(cpu)); @@ -405,10 +735,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -417,20 +745,104 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); + return 0; +} +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); + return 0; +} -out_release: +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu. + */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); +} + +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#define notify_dying NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_present(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + cpuhp_tasks_frozen = tasks_frozen; + + prev_state = st->state; + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state > CPUHP_TEARDOWN_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state > CPUHP_TEARDOWN_CPU) + goto out; + } + /* + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need + * to do the further cleanups. + */ + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; +out: cpu_hotplug_done(); - if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); - return err; + /* This post dead nonsense must die */ + if (!ret && hasdied) + cpu_notify_nofail(CPU_POST_DEAD, cpu); + return ret; } -int cpu_down(unsigned int cpu) +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -441,100 +853,131 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) +void notify_cpu_starting(unsigned int cpu) { - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; + while (st->state < target) { + struct cpuhp_step *step; - default: - break; + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); } - - return NOTIFY_OK; } -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. + */ +void cpuhp_online_idle(enum cpuhp_state state) { - register_cpu_notifier(&smpboot_thread_notifier); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; + + /* The cpu is marked online, set it active now */ + set_cpu_active(cpu, true); + /* Unpark the stopper thread and the hotplug thread of this cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; + int ret = 0; cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); - goto out; - } - - ret = smpboot_create_threads(cpu); - if (ret) + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. + */ + if (st->state >= target) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); - - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + cpuhp_tasks_frozen = tasks_frozen; - /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE | mod, hcpu); + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. + */ + if (st->state > CPUHP_BRINGUP_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is + * responsible for bringing it up to the target state. + */ + target = min((int)target, CPUHP_BRINGUP_CPU); + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); - return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -558,12 +1001,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -586,7 +1033,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -636,7 +1083,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); @@ -709,26 +1156,463 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +#endif /* CONFIG_SMP */ + +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. + */ + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + .cant_stop = true, + }, + /* Kicks the plugged cpu into life */ + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, + .teardown = takedown_cpu, + .cant_stop = true, + }, +#endif +}; + +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. Will be removed once the notifiers are converted to + * states. + */ + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + .cant_stop = true, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronsization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = NULL, + }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. + */ + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, +#endif + /* + * The dynamically registered state space is here + */ + + /* CPU is fully up and running. */ + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; +} + +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitely in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. + */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! + */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. + */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) + continue; + + cpuhp_ap_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). + * Returns 0 if successful, otherwise a proper error code */ -void notify_cpu_starting(unsigned int cpu) +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) { - unsigned long val = CPU_STARTING; + int cpu, ret = 0; + int dyn_state = 0; -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_AP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state); -#endif /* CONFIG_SMP */ +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! + */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? -EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? ret : count; +} + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL +}; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL +}; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif /* * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -789,3 +1673,25 @@ void init_cpu_online(const struct cpumask *src) { cpumask_copy(&__cpu_online_mask, src); } + +/* + * Activate the first processor. + */ +void __init boot_cpu_init(void) +{ + int cpu = smp_processor_id(); + + /* Mark the boot cpu "present", "online" etc for SMP and UP case */ + set_cpu_online(cpu, true); + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); +} + +/* + * Must be called _AFTER_ setting up the per_cpu areas + */ +void __init boot_cpu_state_init(void) +{ + per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fcd8179..00ab5c2b7c5b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; + struct cpuset_migrate_mm_work *mwork; - tsk->mems_allowed = *to; - - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1714,6 +1737,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2065,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .bind = cpuset_bind, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; /** @@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** @@ -2687,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out; retval = -ENAMETOOLONG; - rcu_read_lock(); - css = task_css(tsk, cpuset_cgrp_id); - p = cgroup_path(css->cgroup, buf, PATH_MAX); - rcu_read_unlock(); + css = task_get_css(tsk, cpuset_cgrp_id); + p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); if (!p) goto out_free; seq_puts(m, p); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif return 1; } return 0; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 9c418002b8c1..343c22f5e867 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,15 +159,24 @@ put_callchain_entry(int rctx) struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { - int rctx; - struct perf_callchain_entry *entry; - - int kernel = !event->attr.exclude_callchain_kernel; - int user = !event->attr.exclude_callchain_user; + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; if (!kernel && !user) return NULL; + return get_perf_callchain(regs, 0, kernel, user, crosstask, true); +} + +struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + bool crosstask, bool add_mark) +{ + struct perf_callchain_entry *entry; + int rctx; + entry = get_callchain_entry(&rctx); if (rctx == -1) return NULL; @@ -175,10 +184,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!entry) goto exit_put; - entry->nr = 0; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); } @@ -191,13 +201,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) } if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) + if (crosstask) goto exit_put; - perf_callchain_store(entry, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 06ae52e99ac2..de24fbce5277 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include <asm/irq_regs.h> -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -66,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -126,44 +137,170 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -static void event_function_call(struct perf_event *event, - int (*active)(void *), - void (*inactive)(void *), - void *data) +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + +#define TASK_TOMBSTONE ((void *)-1L) + +static bool is_kernel_event(struct perf_event *event) +{ + return READ_ONCE(event->owner) == TASK_TOMBSTONE; +} + +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. + */ + +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; + + WARN_ON_ONCE(!irqs_disabled()); + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + */ + if (ctx->task) { + if (ctx->task != current) { + ret = -ESRCH; + goto unlock; + } + + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + efs->func(event, cpuctx, ctx, efs->data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); + + return ret; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } if (!task) { - cpu_function_call(event->cpu, active, data); + cpu_function_call(event->cpu, event_function, &efs); return; } + if (task == TASK_TOMBSTONE) + return; + again: - if (!task_function_call(task, active, data)) + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } if (ctx->is_active) { - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; raw_spin_unlock_irq(&ctx->lock); goto again; } - inactive(data); + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } -#define EVENT_OWNER_KERNEL ((void *) -1) - -static bool is_kernel_event(struct perf_event *event) -{ - return event->owner == EVENT_OWNER_KERNEL; -} - #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -179,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -186,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -232,8 +376,11 @@ static void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - do_div(tmp, 100); - ACCESS_ONCE(perf_sample_allowed_ns) = tmp; + tmp = div_u64(tmp, 100); + if (!tmp) + tmp = 1; + + WRITE_ONCE(perf_sample_allowed_ns, tmp); } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -265,7 +412,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - update_perf_cpu_limits(); + if (sysctl_perf_cpu_time_max_percent == 100) { + printk(KERN_WARNING + "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); + WRITE_ONCE(perf_sample_allowed_ns, 0); + } else { + update_perf_cpu_limits(); + } return 0; } @@ -279,62 +432,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); +static u64 __report_avg; +static u64 __report_allowed; + static void perf_duration_warn(struct irq_work *w) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; - - local_samples_len = __this_cpu_read(running_sample_length); - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - printk_ratelimited(KERN_WARNING - "perf interrupt took too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, - sysctl_perf_event_sample_rate); + "perf: interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + __report_avg, __report_allowed, + sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; + u64 max_len = READ_ONCE(perf_sample_allowed_ns); + u64 running_len; + u64 avg_len; + u32 max; - if (allowed_ns == 0) + if (max_len == 0) return; - /* decay the counter by 1 average sample */ - local_samples_len = __this_cpu_read(running_sample_length); - local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; - local_samples_len += sample_len_ns; - __this_cpu_write(running_sample_length, local_samples_len); + /* Decay the counter by 1 average sample. */ + running_len = __this_cpu_read(running_sample_length); + running_len -= running_len/NR_ACCUMULATED_SAMPLES; + running_len += sample_len_ns; + __this_cpu_write(running_sample_length, running_len); /* - * note: this will be biased artifically low until we have - * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * Note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - - if (avg_local_sample_len <= allowed_ns) + avg_len = running_len/NR_ACCUMULATED_SAMPLES; + if (avg_len <= max_len) return; - if (max_samples_per_tick <= 1) - return; + __report_avg = avg_len; + __report_allowed = max_len; - max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); - sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; - perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + /* + * Compute a throttle threshold 25% below the current duration. + */ + avg_len += avg_len / 4; + max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; + if (avg_len < max) + max /= (u32)avg_len; + else + max = 1; - update_perf_cpu_limits(); + WRITE_ONCE(perf_sample_allowed_ns, avg_len); + WRITE_ONCE(max_samples_per_tick, max); + + sysctl_perf_event_sample_rate = max * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { - early_printk("perf interrupt took too long (%lld > %lld), lowering " + early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, + __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } @@ -368,28 +527,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -579,13 +716,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -611,8 +742,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* prev can never be NULL */ cgrp2 = perf_cgroup_from_task(prev, NULL); /* @@ -917,7 +1046,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -934,9 +1063,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -979,8 +1107,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ @@ -1078,6 +1206,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. + * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1118,9 +1247,12 @@ retry: goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); @@ -1180,16 +1312,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1246,6 +1380,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -1448,11 +1584,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -1530,45 +1669,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); + return event->state == PERF_EVENT_STATE_DEAD; } -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 0 : -1; -} - -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1611,14 +1716,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1629,9 +1734,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1655,21 +1757,7 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; - -static void ___perf_remove_from_context(void *info) -{ - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - - if (re->detach_group) - perf_group_detach(event); - list_del_event(event, ctx); -} +#define DETACH_GROUP 0x01UL /* * Cross CPU call to remove a performance event @@ -1677,33 +1765,31 @@ static void ___perf_remove_from_context(void *info) * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + unsigned long flags = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } - raw_spin_unlock(&ctx->lock); - - return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1711,73 +1797,32 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { - struct perf_event_context *ctx = event->ctx; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; - - lockdep_assert_held(&ctx->mutex); + lockdep_assert_held(&event->ctx->mutex); - event_function_call(event, __perf_remove_from_context, - ___perf_remove_from_context, &re); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -void ___perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1788,7 +1833,8 @@ void ___perf_event_disable(void *info) * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -1804,8 +1850,12 @@ static void _perf_event_disable(struct perf_event *event) } raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_disable, - ___perf_event_disable, event); + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -1918,9 +1968,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -2039,13 +2086,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2058,22 +2119,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } -static void ___perf_install_in_context(void *info) +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); } /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { @@ -2081,79 +2142,106 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(task_ctx); + bool activate = true; + int ret = 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; - } - - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; + goto unlock; + } - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); + /* + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. + */ + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); - add_event_to_ctx(event, ctx); + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); + } - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } - perf_pmu_enable(cpuctx->ctx.pmu); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. + * Attach a performance event to a context. * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { + struct task_struct *task = READ_ONCE(ctx->task); + lockdep_assert_held(&ctx->mutex); event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; - event_function_call(event, __perf_install_in_context, - ___perf_install_in_context, event); + if (!task) { + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) + return; + + /* + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. + */ +again: + /* + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. + */ + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { + /* + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). + */ + raw_spin_unlock_irq(&ctx->lock); + return; + } + raw_spin_unlock_irq(&ctx->lock); + /* + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. + */ + goto again; } /* @@ -2180,85 +2268,47 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; + struct perf_event_context *task_ctx; - raw_spin_lock(&ctx->lock); - update_context_time(ctx); - - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) + return; - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - goto unlock; + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} - -void ___perf_event_enable(void *info) -{ - __perf_event_mark_enabled((struct perf_event *)info); + ctx_resched(cpuctx, task_ctx); } /* @@ -2275,7 +2325,8 @@ static void _perf_event_enable(struct perf_event *event) struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) { + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { raw_spin_unlock_irq(&ctx->lock); return; } @@ -2291,8 +2342,7 @@ static void _perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_enable, - ___perf_event_enable, event); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -2342,25 +2392,49 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; - ctx->is_active &= ~event_type; - if (likely(!ctx->nr_events)) + lockdep_assert_held(&ctx->lock); + + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). + */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); return; + } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2518,17 +2592,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2541,8 +2619,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } @@ -2637,20 +2714,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; -} - /* * Called with IRQs disabled */ @@ -2725,25 +2788,40 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); - ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + ctx->is_active |= (event_type | EVENT_TIME); + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -2773,12 +2851,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -2800,6 +2873,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2807,13 +2890,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -3051,17 +3127,6 @@ done: return rotate; } -#ifdef CONFIG_NO_HZ_FULL -bool perf_event_can_stop_tick(void) -{ - if (atomic_read(&nr_freq_events) || - __this_cpu_read(perf_throttled_count)) - return false; - else - return true; -} -#endif - void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); @@ -3072,6 +3137,7 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); + tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); @@ -3099,46 +3165,31 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -3334,7 +3385,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3519,13 +3569,37 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } +#ifdef CONFIG_NO_HZ_FULL +static DEFINE_SPINLOCK(nr_freq_lock); +#endif + +static void unaccount_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + spin_lock(&nr_freq_lock); + if (atomic_dec_and_test(&nr_freq_events)) + tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void unaccount_freq_event(void) +{ + if (tick_nohz_full_enabled()) + unaccount_freq_event_nohz(); + else + atomic_dec(&nr_freq_events); +} + static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3533,19 +3607,32 @@ static void unaccount_event(struct perf_event *event) if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) - atomic_dec(&nr_freq_events); + unaccount_freq_event(); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; + + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3556,7 +3643,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. * * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3631,29 +3718,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3675,7 +3739,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -3702,14 +3784,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3737,8 +3818,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -3746,37 +3829,111 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *child, *tmp; + + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(ctx->parent_ctx); + perf_remove_from_context(event, DETACH_GROUP); + + raw_spin_lock_irq(&ctx->lock); /* - * There are two ways this annotation is useful: + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); - WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + perf_event_ctx_unlock(event, ctx); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ - put_event(event); + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3786,46 +3943,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. - */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, true); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -3969,7 +4090,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -4054,7 +4175,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -4086,36 +4207,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static void ___perf_event_period(void *info) -{ - struct period_event *pe = info; - struct perf_event *event = pe->event; - u64 value = pe->value; - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); -} - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4126,6 +4225,14 @@ static int __perf_event_period(void *info) active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(ctx->pmu); + /* + * We could be throttled; unthrottle now to avoid the tick + * trying to unthrottle while we already re-started the event. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) { + event->hw.interrupts = 0; + perf_log_throttle(event, 1); + } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -4135,14 +4242,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; u64 value; if (!is_sampling_event(event)) @@ -4157,10 +4260,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - pe.value = value; - - event_function_call(event, __perf_event_period, - ___perf_event_period, &pe); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4932,7 +5032,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -6359,9 +6459,9 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); - tick_nohz_full_kick(); ret = 1; } } @@ -6720,7 +6820,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) kfree_rcu(hlist, rcu_head); } -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -6732,15 +6832,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) mutex_unlock(&swhash->hlist_mutex); } -static void swevent_hlist_put(struct perf_event *event) +static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; @@ -6763,14 +6863,13 @@ exit: return err; } -static int swevent_hlist_get(struct perf_event *event) +static int swevent_hlist_get(void) { - int err; - int cpu, failed_cpu; + int err, cpu, failed_cpu; get_online_cpus(); for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); + err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; @@ -6783,7 +6882,7 @@ fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } put_online_cpus(); @@ -6799,7 +6898,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); + swevent_hlist_put(); } static int perf_swevent_init(struct perf_event *event) @@ -6830,7 +6929,7 @@ static int perf_swevent_init(struct perf_event *event) if (!event->parent) { int err; - err = swevent_hlist_get(event); + err = swevent_hlist_get(); if (err) return err; @@ -7751,31 +7850,75 @@ static void account_event_cpu(struct perf_event *event, int cpu) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } +/* Freq events need the tick to stay alive (see perf_event_task_tick). */ +static void account_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + /* Lock so we don't race with concurrent unaccount */ + spin_lock(&nr_freq_lock); + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void account_freq_event(void) +{ + if (tick_nohz_full_enabled()) + account_freq_event_nohz(); + else + atomic_inc(&nr_freq_events); +} + + static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (event->attr.freq) { - if (atomic_inc_return(&nr_freq_events) == 1) - tick_nohz_full_kick_all(); - } + if (event->attr.freq) + account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; + + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -7911,6 +8054,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + /* symmetric to unaccount_event() in _free_event() */ + account_event(event); + return event; err_per_task: @@ -8274,8 +8420,6 @@ SYSCALL_DEFINE5(perf_event_open, } } - account_event(event); - /* * Special case software events and allow them to be part of * any hardware group. @@ -8394,10 +8538,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8422,11 +8575,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8479,6 +8632,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8488,8 +8643,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -8514,7 +8667,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8556,9 +8714,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; - - account_event(event); + event->owner = TASK_TOMBSTONE; ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { @@ -8568,12 +8724,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8582,6 +8740,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8606,7 +8768,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -8673,33 +8835,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8712,57 +8856,86 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (parent_event) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. + */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; + struct perf_event *child_event, *next; + + WARN_ON_ONCE(child != current); - if (likely(!child->perf_event_ctxp[ctxn])) + child_ctx = perf_pin_task_context(child, ctxn); + if (!child_ctx) return; - local_irq_save(flags); /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + mutex_lock(&child_ctx->mutex); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + raw_spin_lock_irq(&child_ctx->lock); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ + clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8774,20 +8947,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); @@ -8812,8 +8973,7 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); @@ -8896,21 +9056,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) @@ -8953,8 +9112,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9002,8 +9169,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); @@ -9208,7 +9373,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9221,13 +9386,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) @@ -9283,12 +9449,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: + /* + * This must be done before the CPU comes alive, because the + * moment we can run tasks we can encounter (software) events. + * + * Specifically, someone can have inherited events on kthreadd + * or a pre-existing worker thread that gets re-bound. + */ perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: + /* + * This must be done before the CPU dies because after that an + * active event might want to IPI the CPU and that'll not work + * so great for dead CPUs. + * + * XXX smp_call_function_single() return -ENXIO without a warn + * so we could possibly deal with this. + * + * This is safe against new events arriving because + * sys_perf_event_open() serializes against hotplug using + * get_online_cpus(). + */ perf_event_exit_cpu(cpu); break; default: @@ -9316,9 +9499,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. @@ -9338,6 +9518,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, return 0; } +EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2bbad9c1274c..4199b6d193f5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -182,8 +182,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); -extern int get_callchain_buffers(void); -extern void put_callchain_buffers(void); static inline int get_recursion_context(int *recursion) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536117..c61f0cbd308b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) @@ -746,8 +746,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; - rb->page_order = ilog2(nr_pages); - rb->nr_pages = !!nr_pages; + if (nr_pages) { + rb->nr_pages = 1; + rb->page_order = ilog2(nr_pages); + } ring_buffer_init(rb, watermark, flags); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..220fc17b9718 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; @@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) @@ -1700,7 +1701,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + /* + * The NULL 'tsk' here ensures that any faults that occur here + * will not be accounted to the task. 'mm' *is* current->mm, + * but we treat this as a 'remote' access since it is + * essentially a kernel access to the memory. + */ + result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index 10e088237fed..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> +#include <linux/kcov.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -434,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -655,6 +656,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..d277e83ed3e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,7 @@ #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> +#include <linux/kcov.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -164,12 +165,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -384,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) account_kernel_stack(ti, 1); + kcov_task_init(tsk); + return tsk; free_ti: @@ -1884,7 +1895,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/futex.c b/kernel/futex.c index 0773f2b23b10..a5d2e74c89e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -124,16 +124,16 @@ * futex_wait(futex, val); * * waiters++; (a) - * mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `-------> mb(); (B) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); @@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key) /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon - * as full barrier (B), see the ordering comment above. + * as smp_mb(); (B), see the ordering comment above. */ smp_mb__after_atomic(); } @@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + ihold(key->shared.inode); /* implies smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies MB (B) */ + futex_get_mm(key); /* implies smp_mb(); (B) */ break; default: /* @@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key) * mm, therefore the only purpose of calling get_futex_key_refs * is because we need the barrier for the lockless waiter check. */ - smp_mb(); /* explicit MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ } } @@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -520,7 +520,20 @@ again: else err = 0; - lock_page(page); + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + */ + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + /* * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or @@ -536,19 +549,31 @@ again: * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for page->mapping. */ - mapping = compound_head(page)->mapping; - if (!mapping) { - int shmem_swizzled = PageSwapCache(page); + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; unlock_page(page); put_page(page); + if (shmem_swizzled) goto again; + return -EFAULT; } /* * Private mappings are handled in a simple way. * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. @@ -566,16 +591,74 @@ again: key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; + + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* + * Take a reference unless it is about to be freed. Previously + * this reference was taken by ihold under the page lock + * pinning the inode in place so i_lock was unnecessary. The + * only way for this check to fail is if the inode was + * truncated in parallel so warn for now if this happens. + * + * We are not calling into get_futex_key_refs() in file-backed + * cases, therefore a successful atomic_inc return below will + * guarantee that get_futex_key() will still imply smp_mb(); (B). + */ + if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* Should be impossible but lets be paranoid for now */ + if (WARN_ON_ONCE(inode->i_mapping != mapping)) { + err = -EFAULT; + rcu_read_unlock(); + iput(inode); + + goto out; + } + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = mapping->host; + key->shared.inode = inode; key->shared.pgoff = basepage_index(page); + rcu_read_unlock(); } - get_futex_key_refs(key); /* implies MB (B) */ - out: - unlock_page(page); put_page(page); return err; } @@ -1191,7 +1274,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1217,22 +1300,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies MB (A) */ + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ return hb; } @@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* @@ -2127,11 +2214,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index e0f90c2b57aa..d234022805dc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); } -static unsigned long timeout_jiffies(unsigned long timeout) +static long hung_timeout_jiffies(unsigned long last_checked, + unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; + return timeout ? last_checked - jiffies + timeout * HZ : + MAX_SCHEDULE_TIMEOUT; } /* @@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector); */ static int watchdog(void *dummy) { + unsigned long hung_last_checked = jiffies; + set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; + long t = hung_timeout_jiffies(hung_last_checked, timeout); - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - if (atomic_xchg(&reset_hung_task, 0)) + if (t <= 0) { + if (!atomic_xchg(&reset_hung_task, 0)) + check_hung_uninterruptible_tasks(timeout); + hung_last_checked = jiffies; continue; - - check_hung_uninterruptible_tasks(timeout); + } + schedule_timeout_interruptible(t); } return 0; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..3bbfd6a9c475 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbdf35b6..2ee42e95a3ce 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909f4e5b..2f9f2b0e79f2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a302cf9a2126..a15b5485b446 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,9 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - do { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -172,8 +172,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; - } while (action); + } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..09be2c903c6d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); @@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 000000000000..c37f34b00a11 --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,326 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef <[email protected]> + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include <linux/irqdomain.h> +#include <linux/irq.h> + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. + */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of the cpu destinations. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0bcc33..0ccd028817d7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6e655f7acd3b..3a519a01118b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -575,10 +573,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", @@ -835,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; @@ -890,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1040,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1073,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1270,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1287,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..cc1cc641d653 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); @@ -1324,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; @@ -1609,6 +1607,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1700,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..4e1b94726818 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32144175458d..5707f97a3e6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). */ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5c5987f10819..fafd1a3ef0da 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -38,6 +38,7 @@ * during the second link stage. */ extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* @@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak; extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); +extern const unsigned long kallsyms_relative_base +__attribute__((weak, section(".rodata"))); + extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } +static unsigned long kallsyms_sym_address(int idx) +{ + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + return kallsyms_addresses[idx]; + + /* values are unsigned offsets if --absolute-percpu is not in effect */ + if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + + /* ...otherwise, positive offsets are absolute values */ + if (kallsyms_offsets[idx] >= 0) + return kallsyms_offsets[idx]; + + /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ + return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; +} + /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { @@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name) off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } @@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long i, low, high, mid; /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + BUG_ON(!kallsyms_addresses); + else + BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; @@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr, while (high - low > 1) { mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) + if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; @@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr, * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; - symbol_start = kallsyms_addresses[low]; + symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; + if (kallsyms_sym_address(i) > symbol_start) { + symbol_end = kallsyms_sym_address(i); break; } } @@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) unsigned off = iter->nameoff; iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; + iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..3efbee0834a8 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,273 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/kcov.h> + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8dc659144869..8d34308ea449 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -66,13 +66,15 @@ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) @@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size) ram_res->start = end; ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 007b791f676d..c72d2ff5896e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -18,6 +18,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/fs.h> #include <crypto/hash.h> #include <crypto/sha.h> #include <linux/syscalls.h> @@ -33,65 +34,6 @@ size_t __weak kexec_purgatory_size = 0; static int kexec_calculate_store_digests(struct kimage *image); -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) @@ -182,16 +124,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret = 0; void *ldata; + loff_t size; - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); + ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, + &size, INT_MAX, READING_KEXEC_IMAGE); if (ret) return ret; + image->kernel_buf_len = size; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); - if (ret) goto out; @@ -206,10 +149,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); + ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, + &size, INT_MAX, + READING_KEXEC_INITRAMFS); if (ret) goto out; + image->initrd_buf_len = size; } if (cmdline_len) { @@ -524,10 +469,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); + ret = walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); else ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743a7e..b5c30d9f46c5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> +#include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc2c85c064c1..d68fbf63b083 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -99,12 +99,12 @@ static void klp_find_object_module(struct klp_object *obj) /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by - * a going module handler instead. + * klp_module_going() instead. */ mod = find_module(obj->name); /* - * Do not mess work of the module coming and going notifiers. - * Note that the patch might still be needed before the going handler + * Do not mess work of klp_module_coming() and klp_module_going(). + * Note that the patch might still be needed before klp_module_going() * is called. Module functions can be called even in the GOING state * until mod->exit() finishes. This is especially important for * patches that modify semantic of the functions. @@ -190,8 +190,8 @@ static int klp_find_object_symbol(const char *objname, const char *name, if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); else if (args.count > 1 && sympos == 0) { - pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", - args.count, name, objname); + pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n", + name, objname); } else if (sympos != args.count && sympos > 0) { pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", sympos, name, objname ? objname : "vmlinux"); @@ -866,103 +866,108 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static int klp_module_notify_coming(struct klp_patch *patch, - struct klp_object *obj) +int klp_module_coming(struct module *mod) { - struct module *pmod = patch->mod; - struct module *mod = obj->mod; int ret; + struct klp_patch *patch; + struct klp_object *obj; - ret = klp_init_object_loaded(patch, obj); - if (ret) { - pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; - } + if (WARN_ON(mod->state != MODULE_STATE_COMING)) + return -EINVAL; - if (patch->state == KLP_DISABLED) - return 0; + mutex_lock(&klp_mutex); + /* + * Each module has to know that klp_module_coming() + * has been called. We never know what module will + * get patched by a new patch. + */ + mod->klp_alive = true; - pr_notice("applying patch '%s' to loading module '%s'\n", - pmod->name, mod->name); + list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; - ret = klp_enable_object(obj); - if (ret) - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; -} + obj->mod = mod; -static void klp_module_notify_going(struct klp_patch *patch, - struct klp_object *obj) -{ - struct module *pmod = patch->mod; - struct module *mod = obj->mod; + ret = klp_init_object_loaded(patch, obj); + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } - if (patch->state == KLP_DISABLED) - goto disabled; + if (patch->state == KLP_DISABLED) + break; + + pr_notice("applying patch '%s' to loading module '%s'\n", + patch->mod->name, obj->mod->name); + + ret = klp_enable_object(obj); + if (ret) { + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } + + break; + } + } - pr_notice("reverting patch '%s' on unloading module '%s'\n", - pmod->name, mod->name); + mutex_unlock(&klp_mutex); - klp_disable_object(obj); + return 0; -disabled: +err: + /* + * If a patch is unsuccessfully applied, return + * error to the module loader. + */ + pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", + patch->mod->name, obj->mod->name, obj->mod->name); + mod->klp_alive = false; klp_free_object_loaded(obj); + mutex_unlock(&klp_mutex); + + return ret; } -static int klp_module_notify(struct notifier_block *nb, unsigned long action, - void *data) +void klp_module_going(struct module *mod) { - int ret; - struct module *mod = data; struct klp_patch *patch; struct klp_object *obj; - if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) - return 0; + if (WARN_ON(mod->state != MODULE_STATE_GOING && + mod->state != MODULE_STATE_COMING)) + return; mutex_lock(&klp_mutex); - /* - * Each module has to know that the notifier has been called. - * We never know what module will get patched by a new patch. + * Each module has to know that klp_module_going() + * has been called. We never know what module will + * get patched by a new patch. */ - if (action == MODULE_STATE_COMING) - mod->klp_alive = true; - else /* MODULE_STATE_GOING */ - mod->klp_alive = false; + mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (action == MODULE_STATE_COMING) { - obj->mod = mod; - ret = klp_module_notify_coming(patch, obj); - if (ret) { - obj->mod = NULL; - pr_warn("patch '%s' is in an inconsistent state!\n", - patch->mod->name); - } - } else /* MODULE_STATE_GOING */ - klp_module_notify_going(patch, obj); + if (patch->state != KLP_DISABLED) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_disable_object(obj); + } + klp_free_object_loaded(obj); break; } } mutex_unlock(&klp_mutex); - - return 0; } -static struct notifier_block klp_module_nb = { - .notifier_call = klp_module_notify, - .priority = INT_MIN+1, /* called late but before ftrace notifier */ -}; - static int __init klp_init(void) { int ret; @@ -973,21 +978,11 @@ static int __init klp_init(void) return -EINVAL; } - ret = register_module_notifier(&klp_module_nb); - if (ret) - return ret; - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); - if (!klp_root_kobj) { - ret = -ENOMEM; - goto unregister; - } + if (!klp_root_kobj) + return -ENOMEM; return 0; - -unregister: - unregister_module_notifier(&klp_module_nb); - return ret; } module_init(klp_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56618f6..53ab2f85d77e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void) return ret; } -static int lockdep_initialized; - unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; @@ -150,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats); static inline u64 lockstat_clock(void) { @@ -292,7 +289,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +300,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -434,19 +431,6 @@ unsigned int max_lockdep_depth; #ifdef CONFIG_DEBUG_LOCKDEP /* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). - */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); @@ -666,23 +650,9 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk(KERN_ERR @@ -719,7 +689,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +712,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +744,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +775,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1853,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1880,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. */ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1904,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1931,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1954,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2007,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) } /* + * Returns the index of the first held_lock of the current chain + */ +static inline int get_first_held_lock(struct task_struct *curr, + struct held_lock *hlock) +{ + int i; + struct held_lock *hlock_curr; + + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + + } + + return ++i; +} + +/* + * Checks whether the chain and the current held locks are consistent + * in depth and also in content. If they are not it most likely means + * that there was a collision during the calculation of the chain_key. + * Returns: 0 not passed, 1 passed + */ +static int check_no_collision(struct task_struct *curr, + struct held_lock *hlock, + struct lock_chain *chain) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + int i, j, id; + + i = get_first_held_lock(curr, hlock); + + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) + return 0; + + for (j = 0; j < chain->depth - 1; j++, i++) { + id = curr->held_locks[i].class_idx - 1; + + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) + return 0; + } +#endif + return 1; +} + +/* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is * validated. If the key is already hashed, return 0. @@ -2017,9 +2038,8 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr; int i, j; /* @@ -2033,10 +2053,13 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); + if (!check_no_collision(curr, hlock, chain)) + return 0; + if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -2057,7 +2080,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2074,13 +2097,7 @@ cache_hit: chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; + i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; @@ -2091,7 +2108,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -2168,7 +2185,7 @@ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; + unsigned int i; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { @@ -2185,17 +2202,16 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } - id = hlock->class_idx - 1; /* * Whoops ran out of static storage again? */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3073,7 +3089,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; - unsigned int depth, id; + unsigned int depth; int chain_head = 0; int class_idx; u64 chain_key; @@ -3176,11 +3192,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ - id = class - lock_classes; /* * Whoops, we did it again.. ran straight out of our static allocation. */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; @@ -3198,7 +3213,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = 0; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, class_idx); if (nest_lock && !__lock_is_held(nest_lock)) return print_lock_nested_lock_not_held(curr, hlock, ip); @@ -3875,7 +3890,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3909,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3932,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3945,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3975,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +4000,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4013,28 +4024,6 @@ out_restore: raw_local_irq_restore(flags); } -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); @@ -4061,14 +4050,6 @@ void __init lockdep_info(void) printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif } static void diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c219c40e..e364b424b019 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -716,6 +716,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 393d1874b9e0..ce2f75e32ae1 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) - cpu_relax(); + smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -435,7 +434,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_load_acquire() call. As the next PV queue head hasn't been + * smp_cond_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -466,7 +465,7 @@ locked: break; } /* - * The smp_load_acquire() call above has provided the necessary + * The smp_cond_acquire() call above has provided the necessary * acquire semantics required for locking. At most two * iterations of this loop may be ran. */ diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 87bb235c3448..21ede57f68b3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -55,6 +55,11 @@ struct pv_node { }; /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before * being queued. By allowing one lock stealing attempt here when the pending @@ -65,9 +70,11 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; + int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; } /* @@ -138,11 +145,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #endif /* _Q_PENDING_BITS == 8 */ /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; + /* + * Tracking # of slowpath locking operations + */ + qstat_inc(qstat_pv_lock_slowpath, true); + for (;; waitcnt++) { /* * Set correct vCPU state to be used by queue node wait-early diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 640dcecdd1df..eb2a2c9bc3fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick @@ -45,6 +46,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", @@ -279,19 +282,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -/* - * PV unfair trylock count tracking function - */ -static inline int qstat_spin_steal_lock(struct qspinlock *lock) -{ - int ret = pv_queued_spin_steal_lock(lock); - - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; -} -#undef queued_spin_trylock -#define queued_spin_trylock(l) qstat_spin_steal_lock(l) - #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd9c0..3e746607abe5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current tasks pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } diff --git a/kernel/memremap.c b/kernel/memremap.c index e517a16cb426..a6d382312e6f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } @@ -41,13 +41,15 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: either MEMREMAP_WB or MEMREMAP_WT + * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. * - * MEMREMAP_WB - matches the default mapping for "System RAM" on + * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return @@ -56,13 +58,21 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * MEMREMAP_WT - establish a mapping whereby writes either bypass the * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to - * map "System RAM" with this mapping type will fail. + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { - int is_ram = region_intersects(offset, size, "System RAM"); + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; + if (!flags) + return NULL; + if (is_ram == REGION_MIXED) { WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", &offset, (unsigned long) size); @@ -71,12 +81,11 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { - flags &= ~MEMREMAP_WB; /* * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where - * the requested range is potentially in "System RAM" + * the requested range is potentially in System RAM. */ if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); @@ -85,21 +94,22 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) } /* - * If we don't have a mapping yet and more request flags are - * pending then we will be attempting to establish a new virtual + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing - * "System RAM" + * System RAM. */ - if (!addr && is_ram == REGION_INTERSECTS && flags) { + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", &offset, (unsigned long) size); return NULL; } - if (!addr && (flags & MEMREMAP_WT)) { - flags &= ~MEMREMAP_WT; + if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); - } + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); return addr; } @@ -114,7 +124,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -136,8 +146,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } @@ -150,7 +162,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } @@ -183,7 +195,11 @@ EXPORT_SYMBOL(put_zone_device_page); static void pgmap_radix_release(struct resource *res) { - resource_size_t key; + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; mutex_lock(&pgmap_lock); for (key = res->start; key <= res->end; key += SECTION_SIZE) @@ -226,12 +242,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) percpu_ref_put(pgmap->ref); } - pgmap_radix_release(res); - /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -265,13 +280,17 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); - resource_size_t key, align_start, align_size; + resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; + int error, nid, is_ram; unsigned long pfn; - int error, nid; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -309,7 +328,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - for (key = res->start; key <= res->end; key += SECTION_SIZE) { + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; rcu_read_lock(); @@ -336,8 +356,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -345,8 +363,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, for_each_device_pfn(pfn, page_map) { struct page *page = pfn_to_page(pfn); - /* ZONE_DEVICE pages must never appear on a slab lru */ - list_force_poison(&page->lru); + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); page->pgmap = pgmap; } devres_add(dev, page_map); @@ -377,7 +400,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /* * 'memmap_start' is the virtual address for the first "struct * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple * pointer arithmetic, so we can perform this to_vmem_altmap() * conversion without concern for the initialization state of * the struct page fields. @@ -386,7 +409,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) struct dev_pagemap *pgmap; /* - * Uncoditionally retrieve a dev_pagemap associated with the + * Unconditionally retrieve a dev_pagemap associated with the * given physical address, this is only for use in the * arch_{add|remove}_memory() for setting up and tearing down * the memmap. diff --git a/kernel/module.c b/kernel/module.c index 8358f4697c0c..041200ca4a2d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,6 +53,7 @@ #include <asm/sections.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> +#include <linux/livepatch.h> #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> @@ -303,6 +304,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -981,6 +985,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -2480,10 +2487,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2492,29 +2510,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_strtab = s = mod->core_layout.base + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2654,7 +2677,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_module_from_file(NULL); + err = security_kernel_read_file(NULL, READING_MODULE); if (err) return err; @@ -2672,63 +2695,6 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return 0; } -/* Sets info->hdr and info->len. */ -static int copy_module_from_fd(int fd, struct load_info *info) -{ - struct fd f = fdget(fd); - int err; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -ENOEXEC; - - err = security_kernel_module_from_file(f.file); - if (err) - goto out; - - err = vfs_getattr(&f.file->f_path, &stat); - if (err) - goto out; - - if (stat.size > INT_MAX) { - err = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - err = -EINVAL; - goto out; - } - - info->hdr = vmalloc(stat.size); - if (!info->hdr) { - err = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(info->hdr); - err = bytes; - goto out; - } - if (bytes == 0) - break; - pos += bytes; - } - info->len = pos; - -out: - fdput(f); - return err; -} - static void free_copy(struct load_info *info) { vfree(info->hdr); @@ -3263,9 +3229,8 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); @@ -3295,6 +3260,8 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3371,8 +3338,6 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); return 0; out: @@ -3380,6 +3345,20 @@ out: return err; } +static int prepare_coming_module(struct module *mod) +{ + int err; + + ftrace_module_enable(mod); + err = klp_module_coming(mod); + if (err) + return err; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; +} + static int unknown_module_param_cb(char *param, char *val, const char *modname, void *arg) { @@ -3494,13 +3473,17 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto ddebug_cleanup; + err = prepare_coming_module(mod); + if (err) + goto bug_cleanup; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); - goto bug_cleanup; + goto coming_cleanup; } else if (after_dashes) { pr_warn("%s: parameters '%s' after `--' ignored\n", mod->name, after_dashes); @@ -3509,7 +3492,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto bug_cleanup; + goto coming_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3519,15 +3502,17 @@ static int load_module(struct load_info *info, const char __user *uargs, return do_init_module(mod); + coming_cleanup: + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + klp_module_going(mod); + bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - /* we can't deallocate the module until we clear memory protection */ module_disable_ro(mod); module_disable_nx(mod); @@ -3589,8 +3574,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; struct load_info info = { }; + loff_t size; + void *hdr; + int err; err = may_init_module(); if (err) @@ -3602,9 +3589,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = copy_module_from_fd(fd, &info); + err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + READING_MODULE); if (err) return err; + info.hdr = hdr; + info.len = size; return load_module(&info, uargs, flags); } @@ -3627,6 +3617,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3634,6 +3629,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3643,32 +3639,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3758,19 +3754,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3779,11 +3777,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3822,11 +3821,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6528a79d998d..64b9dead4a07 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -11,10 +11,17 @@ #include <linux/kernel.h> #include <linux/errno.h> +#include <linux/string.h> #include <keys/system_keyring.h> #include <crypto/public_key.h> #include "module-internal.h" +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + /* * Module signature information block. * diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..782102e59eed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -25,6 +25,7 @@ #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> +#include <linux/cgroup.h> static struct kmem_cache *nsproxy_cachep; @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_NET .net_ns = &init_net, #endif +#ifdef CONFIG_CGROUPS + .cgroup_ns = &init_cgroup_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, + tsk->nsproxy->cgroup_ns); + if (IS_ERR(new_nsp->cgroup_ns)) { + err = PTR_ERR(new_nsp->cgroup_ns); + goto out_cgroup; + } + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: + put_cgroup_ns(new_nsp->cgroup_ns); +out_cgroup: if (new_nsp->pid_ns_for_children) put_pid_ns(new_nsp->pid_ns_for_children); out_pid: @@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *new_ns; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET)))) { + CLONE_NEWPID | CLONE_NEWNET | + CLONE_NEWCGROUP)))) { get_nsproxy(old_ns); return 0; } @@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/panic.c b/kernel/panic.c index d96469de72dc..535c96510a44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,7 @@ #include <linux/init.h> #include <linux/nmi.h> #include <linux/console.h> +#include <linux/bug.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -72,6 +73,26 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. + */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + int old_cpu, cpu; + + cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + + if (old_cpu == PANIC_CPU_INVALID) + panic("%s", msg); + else if (old_cpu != cpu) + nmi_panic_self_stop(regs); +} +EXPORT_SYMBOL(nmi_panic); + /** * panic - halt the system * @fmt: The text string to print @@ -449,20 +470,25 @@ void oops_exit(void) kmsg_dump(KMSG_DUMP_OOPS); } -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { +struct warn_args { const char *fmt; va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", - raw_smp_processor_id(), current->pid, file, line, caller); + + if (file) + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", + raw_smp_processor_id(), current->pid, file, line, + caller); + else + pr_warn("WARNING: CPU: %d PID: %d at %pS\n", + raw_smp_processor_id(), current->pid, caller); if (args) vprintk(args->fmt, args->args); @@ -479,20 +505,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller, } print_modules(); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); + print_oops_end_marker(); + /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef WANT_WARN_ON_SLOWPATH void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, + &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); @@ -500,20 +533,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_fmt_taint(const char *file, int line, unsigned taint, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); + __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/kernel/pid.c b/kernel/pid.c index f4ad91b746f1..4d73a834c7e6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..68d3ebc12601 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..fca9254280ee 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; @@ -1158,6 +1159,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. + */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1166,3 +1183,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..df058bed53ce 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c963ba534a78..bfbf284e4218 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -367,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty) static int log_make_free_space(u32 msg_size) { - while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size, false)) - return 0; + while (log_first_seq < log_next_seq && + !logbuf_has_space(msg_size, false)) { /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + if (clear_seq < log_first_seq) { + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, true)) + if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) return 0; return -ENOMEM; @@ -854,6 +858,7 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(clear_idx); VMCOREINFO_SYMBOL(log_next_idx); /* * Export struct printk_log size and field offsets. User space tools can @@ -1216,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u32 idx; enum log_flags prev; - if (clear_seq < log_first_seq) { - /* messages are gone, move to first available one */ - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. @@ -1483,58 +1482,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - */ -static int console_trylock_for_printk(void) -{ - unsigned int cpu = smp_processor_id(); - - if (!console_trylock()) - return 0; - /* - * If we can't use the console, we need to release the console - * semaphore by hand to avoid flushing the buffer. We need to hold the - * console semaphore in order to do this test safely. - */ - if (!can_use_console(cpu)) { - console_locked = 0; - up_console_sem(); - return 0; - } - return 1; -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1681,7 +1628,6 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - /* This stops the holder of console_sem just where we want him */ local_irq_save(flags); this_cpu = smp_processor_id(); @@ -1705,6 +1651,7 @@ asmlinkage int vprintk_emit(int facility, int level, } lockdep_off(); + /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; @@ -1810,20 +1757,12 @@ asmlinkage int vprintk_emit(int facility, int level, if (!in_sched) { lockdep_off(); /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - - /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk()) + if (console_trylock()) console_unlock(); - preempt_enable(); lockdep_on(); } @@ -2174,7 +2113,20 @@ int console_trylock(void) return 0; } console_locked = 1; - console_may_schedule = 0; + /* + * When PREEMPT_COUNT disabled we can't reliably detect if it's + * safe to schedule (e.g. calling printk while holding a spin_lock), + * because preempt_disable()/preempt_enable() are just barriers there + * and preempt_count() is always 0. + * + * RCU read sections have a separate preemption counter when + * PREEMPT_RCU enabled thus we must take extra care and check + * rcu_preempt_depth(), otherwise RCU read sections modify + * preempt_count(). + */ + console_may_schedule = !oops_in_progress && + preemptible() && + !rcu_preempt_depth(); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2184,6 +2136,34 @@ int is_console_locked(void) return console_locked; } +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if ((con->flags & CON_ENABLED) && + (con->flags & CON_ANYTIME)) + return 1; + + return 0; +} + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. + */ +static inline int can_use_console(void) +{ + return cpu_online(raw_smp_processor_id()) || have_callable_console(); +} + static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2254,9 +2234,21 @@ void console_unlock(void) do_cond_resched = console_may_schedule; console_may_schedule = 0; +again: + /* + * We released the console_sem lock, so we need to recheck if + * cpu is online and (if not) is there at least one CON_ANYTIME + * console. + */ + if (!can_use_console()) { + console_locked = 0; + up_console_sem(); + return; + } + /* flush buffered message fragment immediately to console */ console_cont_flush(text, sizeof(text)); -again: + for (;;) { struct printk_log *msg; size_t ext_len = 0; diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..c2199e9901c9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -44,7 +44,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; @@ -201,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2341efe7fe02..d49bfa1e53e6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); - child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - + child->ptrace = 0; /* * Clear all pending traps and TRAPPING. TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. @@ -681,7 +680,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, break; #ifdef CONFIG_COMPAT - if (unlikely(is_compat_task())) { + if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info) || diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..032b2c015beb 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d047d66..250ea67c1615 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -932,12 +930,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a5253e0f..196f0302e2f4 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include <linux/kthread.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..9a535a86e732 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -1083,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* @@ -1173,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } @@ -1246,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1266,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1292,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1357,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp) if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. @@ -1595,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1614,7 +1613,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -1635,7 +1633,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -1815,7 +1813,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1840,7 +1838,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1850,7 +1848,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } @@ -1859,7 +1857,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). */ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1873,7 +1871,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1907,7 +1905,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1938,7 +1936,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -1996,7 +1994,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -2010,6 +2008,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -2025,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2046,7 +2045,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); + sq = rcu_nocb_gp_get(rnp); + raw_spin_unlock_irq_rcu_node(rnp); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); @@ -2068,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2092,7 +2093,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2122,7 +2123,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2234,19 +2235,21 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2275,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2287,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2297,7 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -2329,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2346,19 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. */ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. This must be called from the specified CPU. */ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) @@ -2383,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our @@ -2599,36 +2597,15 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -2859,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2895,12 +2872,12 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2925,7 +2902,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3016,7 +2993,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3436,14 +3413,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. */ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. */ if (oldmask) @@ -3458,7 +3435,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3481,7 +3458,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3522,19 +3499,19 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); @@ -3569,7 +3546,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3730,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ mask = 1; @@ -3747,7 +3724,7 @@ retry_ipi: raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && (rnp->expmask & mask)) @@ -3756,7 +3733,7 @@ retry_ipi: } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3780,7 +3757,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); @@ -3788,7 +3765,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } @@ -4163,7 +4140,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ } } @@ -4187,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rsp = rsp; mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4215,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4236,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4247,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4278,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -4358,7 +4364,7 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); } rcu_spawn_nocb_kthreads(); @@ -4449,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, @@ -4482,8 +4488,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4f4352..df668c0f9e64 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/swait.h> #include <linux/stop_machine.h> /* @@ -149,8 +150,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -243,7 +245,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -399,7 +401,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -478,7 +480,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -506,7 +508,7 @@ struct rcu_state { unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -621,7 +623,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); @@ -680,7 +683,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_PPC */ /* - * Wrappers for the rcu_node::lock acquire. + * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe * different lock values, this in turn means that an UNLOCK of one level @@ -689,29 +692,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * * In order to restore full ordering between tree levels, augment the regular * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. */ static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) { - raw_spin_lock(&rnp->lock); + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) { - raw_spin_lock_irq(&rnp->lock); + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&(rnp)->lock, flags); \ - smp_mb__after_unlock_lock(); \ +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) { - bool locked = raw_spin_trylock(&rnp->lock); + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); if (locked) smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b7e756..efdf7b61ce12 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. */ @@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -807,7 +807,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. @@ -991,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp) * might exit their RCU read-side critical sections on their own. */ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1028,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ @@ -1088,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1098,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1172,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return PTR_ERR(t); raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ @@ -1308,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1559,7 +1558,7 @@ static void rcu_prepare_for_idle(void) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1811,9 +1810,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swake_up_all(sq); } /* @@ -1829,10 +1828,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1857,7 +1861,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2059,7 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2069,7 +2073,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2097,7 +2101,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2172,7 +2176,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2193,7 +2197,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ @@ -2352,7 +2356,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2502,7 +2506,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2510,6 +2514,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 76b94e19430b..ca828b41c938 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void) { return READ_ONCE(rcu_normal); } +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597840b0..2e78ead30934 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -233,9 +233,9 @@ static struct resource * __request_resource(struct resource *root, struct resour } } -static int __release_resource(struct resource *old) +static int __release_resource(struct resource *old, bool release_child) { - struct resource *tmp, **p; + struct resource *tmp, **p, *chd; p = &old->parent->child; for (;;) { @@ -243,7 +243,17 @@ static int __release_resource(struct resource *old) if (!tmp) break; if (tmp == old) { - *p = tmp->sibling; + if (release_child || !(tmp->child)) { + *p = tmp->sibling; + } else { + for (chd = tmp->child;; chd = chd->sibling) { + chd->parent = tmp->parent; + if (!(chd->sibling)) + break; + } + *p = tmp->child; + chd->sibling = tmp->sibling; + } old->parent = NULL; return 0; } @@ -325,7 +335,7 @@ int release_resource(struct resource *old) int retval; write_lock(&resource_lock); - retval = __release_resource(old); + retval = __release_resource(old, true); write_unlock(&resource_lock); return retval; } @@ -333,13 +343,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem reosurce exists with-in [res->start.res->end) - * the caller must specify res->start, res->end, res->flags and "name". - * If found, returns 0, res is overwritten, if not found, returns -1. - * This walks through whole tree and not just first level children - * until and unless first_level_children_only is true. + * Finds the lowest iomem resource existing within [res->start.res->end). + * The caller must specify res->start, res->end, res->flags, and optionally + * desc. If found, returns 0, res is overwritten, if not found, returns -1. + * This function walks the whole tree and not just first level children until + * and unless first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, char *name, +static int find_next_iomem_res(struct resource *res, unsigned long desc, bool first_level_children_only) { resource_size_t start, end; @@ -358,9 +368,9 @@ static int find_next_iomem_res(struct resource *res, char *name, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if (p->flags != res->flags) + if ((p->flags & res->flags) != res->flags) continue; - if (name && strcmp(p->name, name)) + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; if (p->start > end) { p = NULL; @@ -385,15 +395,18 @@ static int find_next_iomem_res(struct resource *res, char *name, * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and - * name are valid candidates. + * desc are valid candidates. * - * @name: name of resource - * @flags: resource flags + * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. + * @flags: I/O resource flags * @start: start addr * @end: end addr + * + * NOTE: For a new descriptor search, define a new IORES_DESC in + * <linux/ioport.h> and set it in 'desc' of a target resource entry. */ -int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)) +int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, + u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; u64 orig_end; @@ -403,23 +416,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, res.end = end; res.flags = flags; orig_end = res.end; + while ((res.start < res.end) && - (!find_next_iomem_res(&res, name, false))) { + (!find_next_iomem_res(&res, desc, false))) { + ret = (*func)(res.start, res.end, arg); if (ret) break; + res.start = res.end + 1; res.end = orig_end; } + return ret; } /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". This function deals with - * full ranges and not pfn. If resources are not pfn aligned, dealing - * with pfn can truncate ranges. + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * Now, this function is only for System RAM, it deals with full ranges and + * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate + * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -430,10 +447,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.start = start; res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, "System RAM", true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -446,9 +463,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * It is to be used only for System RAM. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -460,10 +477,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, "System RAM", true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -484,7 +501,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) } /* * This generic page_is_ram() returns true if specified address is - * registered as "System RAM" in iomem_resource list. + * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { @@ -496,30 +513,34 @@ EXPORT_SYMBOL_GPL(page_is_ram); * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region - * @name: name of resource (in iomem_resource) + * @flags: flags of resource (in iomem_resource) + * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE * * Check if the specified region partially overlaps or fully eclipses a - * resource identified by @name. Return REGION_DISJOINT if the region - * does not overlap @name, return REGION_MIXED if the region overlaps - * @type and another resource, and return REGION_INTERSECTS if the - * region overlaps @type and no other defined resource. Note, that - * REGION_INTERSECTS is also returned in the case when the specified - * region overlaps RAM and undefined memory holes. + * resource identified by @flags and @desc (optional with IORES_DESC_NONE). + * Return REGION_DISJOINT if the region does not overlap @flags/@desc, + * return REGION_MIXED if the region overlaps @flags/@desc and another + * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc + * and no other defined resource. Note that REGION_INTERSECTS is also + * returned in the case when the specified region overlaps RAM and undefined + * memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. */ -int region_intersects(resource_size_t start, size_t size, const char *name) +int region_intersects(resource_size_t start, size_t size, unsigned long flags, + unsigned long desc) { - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; resource_size_t end = start + size - 1; int type = 0; int other = 0; struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); if (start >= p->start && start <= p->end) is_type ? type++ : other++; @@ -538,6 +559,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name) return REGION_DISJOINT; } +EXPORT_SYMBOL_GPL(region_intersects); void __weak arch_remove_reservations(struct resource *avail) { @@ -667,7 +689,7 @@ static int reallocate_resource(struct resource *root, struct resource *old, old->start = new.start; old->end = new.end; } else { - __release_resource(old); + __release_resource(old, true); *old = new; conflict = __request_resource(root, old); BUG_ON(conflict); @@ -813,6 +835,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { @@ -830,6 +855,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour * @new: new resource to insert * * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -838,6 +866,7 @@ int insert_resource(struct resource *parent, struct resource *new) conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } +EXPORT_SYMBOL_GPL(insert_resource); /** * insert_resource_expand_to_fit - Insert a resource into the resource tree @@ -873,6 +902,32 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } +/** + * remove_resource - Remove a resource in the resource tree + * @old: resource to remove + * + * Returns 0 on success, -EINVAL if the resource is not valid. + * + * This function removes a resource previously inserted by insert_resource() + * or insert_resource_conflict(), and moves the children (if any) up to + * where they were before. insert_resource() and insert_resource_conflict() + * insert a new resource, and move any conflicting resources down to the + * children of the new resource. + * + * insert_resource(), insert_resource_conflict() and remove_resource() are + * intended for producers of resources, such as FW modules and bus drivers. + */ +int remove_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old, false); + write_unlock(&resource_lock); + return retval; +} +EXPORT_SYMBOL_GPL(remove_resource); + static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { @@ -948,6 +1003,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->start = start; res->end = end; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; while (1) { @@ -982,6 +1038,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->start = conflict->end + 1; next_res->end = end; next_res->flags = IORESOURCE_BUSY; + next_res->desc = IORES_DESC_NONE; } } else { res->start = conflict->end + 1; @@ -1071,21 +1128,24 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent); - res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); for (;;) { struct resource *conflict; + res->flags = resource_type(parent) | resource_ext_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + res->desc = parent->desc; + conflict = __request_resource(parent, res); if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); @@ -1237,6 +1297,7 @@ int release_mem_region_adjustable(struct resource *parent, new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; + new_res->desc = res->desc; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; @@ -1412,6 +1473,7 @@ static int __init reserve_setup(char *str) res->start = io_start; res->end = io_start + io_num - 1; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; res->child = NULL; if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) reserved = x+1; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..414d9c16da42 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond @@ -13,9 +17,10 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index bc54e84675da..fedb967a9841 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,6 +61,7 @@ #include <linux/static_key.h> #include <linux/workqueue.h> #include <linux/compiler.h> +#include <linux/tick.h> /* * Scheduler clock - returns current time in nanosec units. @@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void) { if (!sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); + + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work) /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_dec(&__sched_clock_stable); + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 63d3a24e081a..d8465eeab8b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -66,14 +67,13 @@ #include <linux/pagemap.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> -#include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/frame.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -124,138 +124,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - inode_lock(inode); - i = sched_feat_set(cmp); - inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -453,20 +321,6 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ -/* - * cmpxchg based fetch_or, macro so it works for different integer types - */ -#define fetch_or(ptr, val) \ -({ typeof(*(ptr)) __old, __val = *(ptr); \ - for (;;) { \ - __old = cmpxchg((ptr), __val, __val | (val)); \ - if (__old == __val) \ - break; \ - __val = __old; \ - } \ - __old; \ -}) - #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, @@ -715,31 +569,36 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq) { + int fifo_nr_running; + + /* Deadline tasks, even if single, need the tick */ + if (rq->dl.dl_nr_running) + return false; + /* - * FIFO realtime policy runs the highest priority task. Other runnable - * tasks are of a lower priority. The scheduler tick does nothing. + * FIFO realtime policy runs the highest priority task (after DEADLINE). + * Other runnable tasks are of a lower priority. The scheduler tick + * isn't needed. */ - if (current->policy == SCHED_FIFO) + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) return true; /* * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. Is this task the only one at this priority? + * realtime priority. */ - if (current->policy == SCHED_RR) { - struct sched_rt_entity *rt_se = ¤t->rt; - - return list_is_singular(&rt_se->run_list); + if (rq->rt.rr_nr_running) { + if (rq->rt.rr_nr_running == 1) + return true; + else + return false; } - /* - * More than one running task need preemption. - * nr_running update is assumed to be visible - * after IPI is sent from wakers. - */ - if (this_rq()->nr_running > 1) + /* Normal multitasking need periodic preemption checks */ + if (rq->cfs.nr_running > 1) return false; return true; @@ -2093,7 +1952,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2141,7 +2001,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2183,7 +2044,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } @@ -2210,6 +2070,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2218,6 +2079,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2281,6 +2146,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ @@ -2762,7 +2690,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. */ -static inline struct rq * +static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -3010,16 +2938,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -3041,7 +2959,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -3068,7 +2986,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -3257,7 +3175,7 @@ static void __sched notrace __schedule(bool preempt) if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; - to_wakeup = wq_worker_sleeping(prev, cpu); + to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) try_to_wake_up_local(to_wakeup); } @@ -3280,7 +3198,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); } else { lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); @@ -3288,6 +3205,7 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { @@ -3466,7 +3384,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3494,11 +3412,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3516,7 +3438,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3524,7 +3446,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3539,7 +3461,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3895,6 +3817,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4077,17 +4000,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4097,15 +4017,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -5096,6 +5015,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, @@ -5405,183 +5326,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if (!rq->online) { @@ -5627,6 +5371,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: @@ -5692,16 +5437,6 @@ static int sched_cpu_active(struct notifier_block *nfb, set_cpu_rq_start_time(); return NOTIFY_OK; - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. - */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - case CPU_DOWN_FAILED: set_cpu_active(cpu, true); return NOTIFY_OK; @@ -6173,11 +5908,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { @@ -6840,7 +6580,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -7798,7 +7538,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -7824,7 +7564,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7844,27 +7584,24 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); @@ -7890,7 +7627,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7914,7 +7651,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -8315,31 +8052,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task) @@ -8699,14 +8431,13 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_files, - .early_init = 1, + .early_init = true, }; #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..4a811203c04a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 reset) + u64 val) { struct cpuacct *ca = css_ca(css); int err = 0; int i; - if (reset) { + /* + * Only allow '0' here to do a reset. + */ + if (val) { err = -EINVAL; goto out; } @@ -235,23 +238,10 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int cpu; - - cpu = task_cpu(tsk); rcu_read_lock(); - - ca = task_ca(tsk); - - while (true) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - - ca = parent_ca(ca); - if (!ca) - break; - } - + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) + *this_cpu_ptr(ca->cpuusage) += cputime; rcu_read_unlock(); } @@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) * * Note: it's the caller that updates the account of the root cgroup. */ -void cpuacct_account_field(struct task_struct *p, int index, u64 val) +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { - struct kernel_cpustat *kcpustat; struct cpuacct *ca; rcu_read_lock(); - ca = task_ca(p); - while (ca != &root_cpuacct) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += val; - ca = parent_ca(ca); - } + for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpustat)->cpustat[index] += val; rcu_read_unlock(); } @@ -279,5 +264,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ed605624a5e7..ba72807c73d4 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,7 +1,7 @@ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); +extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else @@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) } static inline void -cpuacct_account_field(struct task_struct *p, int index, u64 val) +cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { } diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ffb1adc..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap += delta; + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? */ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { + if (!vtime_delta(tsk)) + return; + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); write_seqcount_end(&tsk->vtime_seqcount); @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) cputime_t delta_cpu; write_seqcount_begin(&tsk->vtime_seqcount); - delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } @@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; write_seqcount_end(&tsk->vtime_seqcount); } @@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); + current->vtime_snap = jiffies; write_seqcount_end(¤t->vtime_seqcount); } @@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); + t->vtime_snap = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..affd97ec9f65 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -420,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". - */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. - */ - if (dl_se->dl_new) - goto unlock; - - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ @@ -726,6 +717,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -735,8 +730,11 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -749,8 +747,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -917,7 +917,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -994,18 +994,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* @@ -1722,6 +1718,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) @@ -1768,8 +1767,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..4fbc3bd5ff60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/mempolicy.h> +#include <linux/debugfs.h> #include "sched.h" @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; @@ -313,17 +630,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1926606ece80..0fe30e66aff1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/latencytop.h> #include <linux/cpumask.h> #include <linux/cpuidle.h> #include <linux/slab.h> @@ -755,7 +755,9 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; if (entity_is_task(se)) { p = task_of(se); @@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_sum += delta; se->statistics.wait_start = 0; } -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} -#endif /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ } +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +} +#endif + /* * We are picking a new current task - update its stats: */ @@ -907,10 +932,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. - */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1220,8 +1254,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1289,20 +1321,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1386,6 +1428,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1411,9 +1454,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -1468,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1520,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1556,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1611,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. */ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1634,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1930,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -1974,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2095,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2135,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); @@ -2808,7 +2856,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2820,6 +2869,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -3086,32 +3158,64 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool curr = cfs_rq->curr == se; + /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). + * If we're the current task, we must renormalise before calling + * update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + /* - * Update run-time statistics of the 'current'. + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. */ - update_curr(cfs_rq); + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - if (se != cfs_rq->curr) + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3177,19 +3281,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3263,7 +3356,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3276,7 +3370,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3359,9 +3453,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4443,9 +4541,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i] - tickless_load; + old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); - old_load += tickless_load; + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4468,6 +4574,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4485,22 +4610,15 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates, 0); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* @@ -4511,22 +4629,12 @@ void update_cpu_load_nohz(int active) struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; - unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - __update_cpu_load(this_rq, load, pending_updates, active); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4538,7 +4646,7 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, load, 1, 1); @@ -4971,7 +5079,19 @@ static int select_idle_sibling(struct task_struct *p, int target) return i; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. + * + * A completely idle sched group at higher domains is more + * desirable than an idle group at a lower level, because lower + * domains have smaller groups and usually share hardware + * resources which causes tasks to contend on them, e.g. x86 + * hyperthread siblings in the lowest domain (SMT) can contend + * on the shared cpu pipeline. + * + * However, while we prefer idle groups at higher domains + * finding an idle cpu at the lowest domain is still better than + * returning 'target', which we've already established, isn't + * idle. */ sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { @@ -4981,11 +5101,16 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; + /* Ensure the entire group is idle */ for_each_cpu(i, sched_group_cpus(sg)) { if (i == target || !idle_cpu(i)) goto next; } + /* + * It doesn't matter which cpu we pick, the + * whole group is idle. + */ target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); goto done; @@ -7832,7 +7957,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8218,11 +8343,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8270,21 +8392,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8366,7 +8496,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index de0e786b2667..bd12c6c714ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/cpu.h> #include <linux/cpuidle.h> +#include <linux/cpuhotplug.h> #include <linux/tick.h> #include <linux/mm.h> #include <linux/stackprotector.h> @@ -162,7 +163,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } @@ -193,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -221,10 +220,7 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..c41ea7ac1764 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a run away + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period. + */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -945,6 +953,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -1142,12 +1154,27 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) } static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct task_struct *tsk; + + if (group_rq) + return group_rq->rr_nr_running; + + tsk = rt_task_of(rt_se); + + return (tsk->policy == SCHED_RR) ? 1 : 0; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running += rt_se_nr_running(rt_se); + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1160,13 +1187,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1230,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1269,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1282,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,7 +1322,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1271,7 +1333,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..ec2e8d23527e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); @@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void) struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ @@ -909,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } @@ -1022,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { @@ -1130,18 +1144,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1278,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1289,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (!rq->rd->overload) rq->rd->overload = true; #endif - -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(rq->cpu)) { - /* - * Tick is needed if more than one task runs on a CPU. - * Send the target an IPI to kick it out of nohz mode. - * - * We assume that IPI implies full memory barrier and the - * new value of rq->nr_running is visible on reception - * from the target. - */ - tick_nohz_full_kick_cpu(rq->cpu); - } -#endif } + + sched_update_tick_dependency(rq); } static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); } static inline void rq_last_tick_reset(struct rq *rq) @@ -1738,3 +1793,64 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include <linux/sched.h> +#include <linux/swait.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. + */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..e1e5a354854e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } @@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter) struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; @@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) syscall_whitelist = mode1_syscalls_32; #endif do { diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a972fd..aa9bf00749c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2709,6 +2709,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_upper, &to->si_upper); } #endif +#ifdef SEGV_PKUERR + if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) + err |= __put_user(from->si_pkey, &to->si_pkey); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); @@ -3508,8 +3512,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } @@ -3579,6 +3585,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { + /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ + BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE + != offsetof(struct siginfo, _sifields._pad)); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..74165443c240 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -105,13 +105,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(struct call_single_data *csd) { - while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) - cpu_relax(); + smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); } -static void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(struct call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd) smp_wmb(); } -static void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(struct call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -569,6 +568,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0eb955..485b81cfab34 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); + +void __init cpuhp_threads_init(void); #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..cf8ba545c7d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd9e790..725587f10667 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -350,6 +351,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +517,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -1393,6 +1405,15 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, + { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..10a1d7dc9313 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de539299b..56ece145a814 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..fa0b983290cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? */ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); @@ -897,10 +891,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +924,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +948,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -963,7 +973,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -974,19 +984,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1075,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1220,6 +1225,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, fn = timer->function; /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock * the timer base. @@ -1529,7 +1542,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1705,7 +1718,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1773,7 +1786,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36f2ca09aa5e..6df8927c58a5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } } /* diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..1cafba860b08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) return err; } - /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. * This is called from sys_timer_create() and do_cpu_nanosleep() with the @@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer) cputime_expires->sched_exp = exp; break; } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } } @@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. - */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(tsk->signal->cputimer.running)) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } - if (!ret) - posix_cpu_timer_kick_nohz(); + return ret; } @@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) @@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig) /* Turn off cputimer->running. This is done without locking. */ WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } static u32 onecputick; @@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) arm_timer(timer); unlock_task_sighand(p, &flags); - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - goto out; + return; *newval += now; } @@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } -out: - posix_cpu_timer_kick_nohz(); + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9d7a053545f5..084b79f5917e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,7 +22,6 @@ #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> -#include <linux/perf_event.h> #include <linux/context_tracking.h> #include <asm/irq_regs.h> @@ -36,16 +35,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! */ @@ -151,59 +151,69 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; +static unsigned long tick_dep_mask; + +static void trace_tick_dependency(unsigned long dep) +{ + if (dep & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); + return; + } + + if (dep & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); + return; + } + + if (dep & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, TICK_DEP_MASK_SCHED); + return; + } + + if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); +} -static bool can_stop_full_tick(void) +static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); - if (!sched_can_stop_tick()) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + if (tick_dep_mask) { + trace_tick_dependency(tick_dep_mask); return false; } - if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, "posix timers running\n"); + if (ts->tick_dep_mask) { + trace_tick_dependency(ts->tick_dep_mask); return false; } - if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, "perf events running\n"); + if (current->tick_dep_mask) { + trace_tick_dependency(current->tick_dep_mask); return false; } - /* sched_clock_tick() needs us? */ -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - /* - * TODO: kick full dynticks CPUs when - * sched_clock_stable is set. - */ - if (!sched_clock_stable()) { - trace_tick_stop(0, "unstable sched clock\n"); - /* - * Don't allow the user to think they can get - * full NO_HZ with this machine. - */ - WARN_ONCE(tick_nohz_full_running, - "NO_HZ FULL will not work with unstable sched clock"); + if (current->signal->tick_dep_mask) { + trace_tick_dependency(current->signal->tick_dep_mask); return false; } -#endif return true; } -static void nohz_full_kick_work_func(struct irq_work *work) +static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_work_func, + .func = nohz_full_kick_func, }; /* @@ -212,7 +222,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ -void tick_nohz_full_kick(void) +static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; @@ -232,27 +242,112 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } -static void nohz_full_kick_ipi(void *info) -{ - /* Empty, the tick restart happens on tick_nohz_irq_exit() */ -} - /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick_all(void) +static void tick_nohz_full_kick_all(void) { + int cpu; + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(tick_nohz_full_mask, - nohz_full_kick_ipi, NULL, false); - tick_nohz_full_kick(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); preempt_enable(); } +static void tick_nohz_dep_set_all(unsigned long *dep, + enum tick_dep_bits bit) +{ + unsigned long prev; + + prev = fetch_or(dep, BIT_MASK(bit)); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + clear_bit(bit, &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + unsigned long prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit)); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + clear_bit(bit, &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + clear_bit(bit, &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + clear_bit(bit, &sig->tick_dep_mask); +} + /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: @@ -261,15 +356,19 @@ void tick_nohz_full_kick_all(void) void __tick_nohz_task_switch(void) { unsigned long flags; + struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; - if (tick_nohz_tick_stopped() && !can_stop_full_tick()) - tick_nohz_full_kick(); + ts = this_cpu_ptr(&tick_cpu_sched); + if (ts->tick_stopped) { + if (current->tick_dep_mask || current->signal->tick_dep_mask) + tick_nohz_full_kick(); + } out: local_irq_restore(flags); } @@ -279,7 +378,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ -347,8 +446,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -358,7 +456,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } @@ -387,20 +486,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); @@ -687,7 +780,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - trace_tick_stop(1, " "); + trace_tick_stop(1, TICK_DEP_MASK_NONE); } /* @@ -738,7 +831,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick()) + if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get(), 1); @@ -993,9 +1086,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index a4a8d4e9baa1..eb4e32566a83 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,6 +60,7 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; + unsigned long tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/kernel/time/time.c b/kernel/time/time.c index 86751c68e08d..be115b020d27 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 - )*24 + hour /* now have hours */ + )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4cedfa80d..479d25cd3d4f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", offset, name, max_cycles >> 1); printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -298,17 +299,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** @@ -857,44 +875,262 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; -#ifdef CONFIG_NTP_PPS + WARN_ON_ONCE(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); + +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} /** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - s64 nsecs_raw, nsecs_real; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; - WARN_ON_ONCE(timekeeping_suspended); + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @get_time_fn: Callback to get simultaneous device time and + * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history_begin, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq = 0; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; + unsigned long seq; + bool do_interp; + int ret; do { seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + cycles = system_counterval.cycles; - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); -#endif /* CONFIG_NTP_PPS */ + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * do_gettimeofday - Returns the time of day in a timeval diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { @@ -1698,10 +1709,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2aeb6ffc0a1e..f94e7a21f52d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1437,12 +1437,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45dd798bcd37..3e4ffb3ace5f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,8 +13,6 @@ #include <linux/ctype.h> #include "trace.h" -static DEFINE_PER_CPU(int, bpf_prog_active); - /** * trace_call_bpf - invoke BPF program * @prog: BPF program @@ -191,14 +189,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +229,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +238,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; @@ -293,6 +297,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_read_proto; case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; default: return NULL; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca592f977b2..b1870fbd2b67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1030,8 +1030,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) for_each_possible_cpu(cpu) { stat = &per_cpu(ftrace_profile_stats, cpu); - /* allocate enough for function name + cpu number */ - name = kmalloc(32, GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "function%d", cpu); if (!name) { /* * The files created are permanent, if something happens @@ -1043,7 +1042,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) return; } stat->stat = function_stats; - snprintf(name, 32, "function%d", cpu); stat->stat.name = name; ret = register_stat_tracer(&stat->stat); if (ret) { @@ -1058,8 +1056,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -1610,7 +1607,7 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void __ftrace_hash_rec_update(struct ftrace_ops *ops, +static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) { @@ -1618,12 +1615,13 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, struct ftrace_hash *other_hash; struct ftrace_page *pg; struct dyn_ftrace *rec; + bool update = false; int count = 0; int all = 0; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return; + return false; /* * In the filter_hash case: @@ -1650,7 +1648,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * then there's nothing to do. */ if (ftrace_hash_empty(hash)) - return; + return false; } do_for_each_ftrace_rec(pg, rec) { @@ -1694,7 +1692,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) - return; + return false; /* * If there's only a single callback registered to a @@ -1720,7 +1718,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) - return; + return false; rec->flags--; /* @@ -1753,22 +1751,28 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, */ } count++; + + /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ + update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) - return; + return update; } while_for_each_ftrace_rec(); + + return update; } -static void ftrace_hash_rec_disable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 0); + return __ftrace_hash_rec_update(ops, filter_hash, 0); } -static void ftrace_hash_rec_enable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 1); + return __ftrace_hash_rec_update(ops, filter_hash, 1); } static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, @@ -2314,8 +2318,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -2644,7 +2648,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; ftrace_start_up++; - command |= FTRACE_UPDATE_CALLS; /* * Note that ftrace probes uses this to start up @@ -2665,7 +2668,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; } - ftrace_hash_rec_enable(ops, 1); + if (ftrace_hash_rec_enable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; ftrace_startup_enable(command); @@ -2695,11 +2699,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* Disabling ipmodify never fails */ ftrace_hash_ipmodify_disable(ops); - ftrace_hash_rec_disable(ops, 1); - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ftrace_hash_rec_disable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; - command |= FTRACE_UPDATE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -4961,7 +4965,7 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_module_enable(struct module *mod) +void ftrace_module_enable(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -5038,38 +5042,8 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_module_enable(mod); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - default: - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5098,10 +5072,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..81b87451c0ea 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87fb9801bd9e..a2f0b9f33e9b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -74,11 +74,6 @@ static struct tracer_opt dummy_tracer_opt[] = { { } }; -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -1258,12 +1253,22 @@ int __init register_tracer(struct tracer *type) if (!type->set_flag) type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else + if (!type->flags) { + /*allocate a dummy tracer_flags*/ + type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); + if (!type->flags) { + ret = -ENOMEM; + goto out; + } + type->flags->val = 0; + type->flags->opts = dummy_tracer_opt; + } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + /* store the tracer for __set_tracer_option */ + type->flags->trace = type; + ret = run_tracer_selftest(type); if (ret < 0) goto out; @@ -1659,6 +1664,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #else TRACE_FLAG_IRQS_NOSUPPORT | #endif + ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | @@ -1751,7 +1757,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -2071,20 +2077,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! **\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -3505,7 +3511,7 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { - struct tracer *trace = tr->current_trace; + struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); @@ -4101,7 +4107,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -4949,7 +4955,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; @@ -6131,7 +6140,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6327,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6346,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -6391,11 +6400,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) return; for (i = 0; i < tr->nr_topts; i++) { - /* - * Check if these flags have already been added. - * Some tracers share flags. - */ - if (tr->topts[i].tracer->flags == tracer->flags) + /* Make sure there's no duplicate flags. */ + if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } @@ -7248,8 +7254,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8414fa40bf27..3fff4adfd431 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -125,6 +125,7 @@ enum trace_flag_type { TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, }; #define TRACE_BUF_SIZE 1024 @@ -345,6 +346,7 @@ struct tracer_opt { struct tracer_flags { u32 val; struct tracer_opt *opts; + struct tracer *trace; }; /* Makes more easy to define a tracer opt */ @@ -1111,6 +1113,18 @@ struct filter_pred { unsigned short right; }; +static inline bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static inline bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, @@ -1159,9 +1173,24 @@ struct event_trigger_data { struct event_filter __rcu *filter; char *filter_str; void *private_data; + bool paused; struct list_head list; }; +extern void trigger_data_free(struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable); +extern void update_cond_flag(struct trace_event_file *file); +extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); +extern int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file); +extern int register_event_command(struct event_command *cmd); + /** * struct event_trigger_ops - callbacks for trace event triggers * @@ -1174,7 +1203,8 @@ struct event_trigger_data { * @func: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command). + * registered the trigger (see struct event_command) along with + * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() @@ -1199,7 +1229,8 @@ struct event_trigger_data { * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data); + void (*func)(struct event_trigger_data *data, + void *rec); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, @@ -1243,27 +1274,10 @@ struct event_trigger_ops { * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * - * @post_trigger: A flag that says whether or not this command needs - * to have its action delayed until after the current event has - * been closed. Some triggers need to avoid being invoked while - * an event is currently in the process of being logged, since - * the trigger may itself log data into the trace buffer. Thus - * we make sure the current event is committed before invoking - * those triggers. To do that, the trigger invocation is split - * in two - the first part checks the filter using the current - * trace record; if a command has the @post_trigger flag set, it - * sets a bit for itself in the return value, otherwise it - * directly invokes the trigger. Once all commands have been - * either invoked or set their return flag, the current record is - * either committed or discarded. At that point, if any commands - * have deferred their triggers, those commands are finally - * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation - * itself logs to the trace buffer, this flag should be set, - * otherwise it can be left unspecified. + * @flags: See the enum event_command_flags below. * - * All the methods below, except for @set_filter(), must be - * implemented. + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. * * @func: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the @@ -1288,6 +1302,10 @@ struct event_trigger_ops { * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * * @set_filter: An optional function called to parse and set a filter * for the trigger. If no @set_filter() method is set for the * event command, filters set by the user for the command will be @@ -1301,7 +1319,7 @@ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; - bool post_trigger; + int flags; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); @@ -1313,12 +1331,56 @@ struct event_command { struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); + void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; +/** + * enum event_command_flags - flags for struct event_command + * + * @POST_TRIGGER: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * @NEEDS_REC: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + */ +enum event_command_flags { + EVENT_CMD_FL_POST_TRIGGER = 1, + EVENT_CMD_FL_NEEDS_REC = 2, +}; + +static inline bool event_command_post_trigger(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; +} + +static inline bool event_command_needs_rec(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; +} + extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); @@ -1365,8 +1427,13 @@ int perf_ftrace_event_register(struct trace_event_call *call, #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); +const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } +static inline const char *get_syscall_name(int syscall) +{ + return NULL; +} #endif #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f333e57c4614..05ddc0820771 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } @@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219b18da..b3f5051cd4e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -961,18 +961,6 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_function_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_TRACE_FN; -} - -static bool is_string_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; -} - static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && @@ -1043,13 +1031,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1061,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b38f617b6181..d67992f3bb0e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -28,8 +28,7 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); -static void -trigger_data_free(struct event_trigger_data *data) +void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); @@ -73,18 +72,20 @@ event_triggers_call(struct trace_event_file *file, void *rec) return tt; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (!rec) { - data->ops->func(data); + data->ops->func(data, rec); continue; } filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; - if (data->cmd_ops->post_trigger) { + if (event_command_post_trigger(data->cmd_ops)) { tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data); + data->ops->func(data, rec); } return tt; } @@ -94,6 +95,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -104,13 +106,16 @@ EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt) + enum event_trigger_type tt, + void *rec) { struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data); + data->ops->func(data, rec); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -188,6 +193,19 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return -ENODEV; } + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + struct trace_event_file *event_file; + struct event_command *p; + + event_file = event_file_data(file); + + list_for_each_entry(p, &trigger_commands, list) { + if (p->unreg_all) + p->unreg_all(event_file); + } + } + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &event_triggers_seq_ops); if (!ret) { @@ -306,7 +324,7 @@ const struct file_operations event_trigger_fops = { * Currently we only register event commands from __init, so mark this * __init too. */ -static __init int register_event_command(struct event_command *cmd) +__init int register_event_command(struct event_command *cmd) { struct event_command *p; int ret = 0; @@ -395,9 +413,8 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -static int -event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) { data->ref++; return 0; @@ -425,8 +442,8 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct trace_event_file *file, - int trigger_enable) +int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable) { int ret = 0; @@ -483,13 +500,14 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. */ -static void update_cond_flag(struct trace_event_file *file) +void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger) { + if (data->filter || event_command_post_trigger(data->cmd_ops) || + event_command_needs_rec(data->cmd_ops)) { set_cond = true; break; } @@ -560,9 +578,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -696,9 +714,9 @@ event_trigger_callback(struct event_command *cmd_ops, * * Return: 0 on success, errno otherwise */ -static int set_trigger_filter(char *filter_str, - struct event_trigger_data *trigger_data, - struct trace_event_file *file) +int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; @@ -747,7 +765,7 @@ static int set_trigger_filter(char *filter_str, } static void -traceon_trigger(struct event_trigger_data *data) +traceon_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -756,7 +774,7 @@ traceon_trigger(struct event_trigger_data *data) } static void -traceon_count_trigger(struct event_trigger_data *data) +traceon_count_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -771,7 +789,7 @@ traceon_count_trigger(struct event_trigger_data *data) } static void -traceoff_trigger(struct event_trigger_data *data) +traceoff_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -780,7 +798,7 @@ traceoff_trigger(struct event_trigger_data *data) } static void -traceoff_count_trigger(struct event_trigger_data *data) +traceoff_count_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -876,13 +894,13 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data) +snapshot_trigger(struct event_trigger_data *data, void *rec) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data) +snapshot_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -890,7 +908,7 @@ snapshot_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - snapshot_trigger(data); + snapshot_trigger(data, rec); } static int @@ -969,13 +987,13 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #define STACK_SKIP 3 static void -stacktrace_trigger(struct event_trigger_data *data) +stacktrace_trigger(struct event_trigger_data *data, void *rec) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data) +stacktrace_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -983,7 +1001,7 @@ stacktrace_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - stacktrace_trigger(data); + stacktrace_trigger(data, rec); } static int @@ -1017,7 +1035,7 @@ stacktrace_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, - .post_trigger = true, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, @@ -1054,7 +1072,7 @@ struct enable_trigger_data { }; static void -event_enable_trigger(struct event_trigger_data *data) +event_enable_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1065,7 +1083,7 @@ event_enable_trigger(struct event_trigger_data *data) } static void -event_enable_count_trigger(struct event_trigger_data *data) +event_enable_count_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1079,7 +1097,7 @@ event_enable_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - event_enable_trigger(data); + event_enable_trigger(data, rec); } static int diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fcd41a166405..5a095c2e4b69 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -219,6 +219,8 @@ static void tracing_stop_function_trace(struct trace_array *tr) unregister_ftrace_function(tr->ops); } +static struct tracer function_trace; + static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -228,6 +230,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + /* We can change this flag when not running. */ + if (tr->current_trace != &function_trace) + break; + unregister_ftrace_function(tr->ops); if (set) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/fs.h> @@ -1350,7 +1351,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1468,12 +1469,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index e4e56589ec1d..03cdff84d026 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. + */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; *data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -622,7 +626,6 @@ static int __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(&tr->trace_buffer); ftrace_init_array_ops(tr, irqsoff_tracer_call); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9956440d0e6..919e0ddd8fcc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; + unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; @@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk) return ERR_PTR(ret); + tk->nhit = alloc_percpu(unsigned long); + if (!tk->nhit) + goto error; + if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) @@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, error: kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); return ERR_PTR(ret); } @@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk) kfree(tk->tp.call.class->system); kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); } @@ -453,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -523,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -558,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -874,9 +877,14 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), nhit, tk->rp.kp.nmissed); return 0; @@ -1225,7 +1233,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); @@ -1242,7 +1250,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); @@ -1325,16 +1333,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. */ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 8bb2071474dd..49f61fe96a6b 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -56,7 +56,7 @@ static void nop_trace_reset(struct trace_array *tr) } /* It only serves as a signal handler and a callback to - * accept or refuse tthe setting of a flag. + * accept or refuse the setting of a flag. * If you don't implement it, then the flag setting will be * automatically accepted. */ @@ -75,7 +75,7 @@ static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_NOP_OPT_REFUSE) { printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." - "Now cat trace_options to see the result\n", + " Now cat trace_options to see the result\n", set); return -EINVAL; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..0bb9cf2d53e6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,7 +389,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int nmi; + nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -415,10 +417,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) } hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : (hardirq && softirq) ? 'H' : - hardirq ? 'h' : - softirq ? 's' : - '.'; + hardirq ? 'h' : + softirq ? 's' : + '.' ; trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 060df67dbdd1..f96f0383f6c6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -296,6 +296,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dda9e6742950..2a1abbaca10e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack) } /* + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. + */ + if (i == stack_trace_max.nr_entries) + i = 0; + + /* * Now find where in the stack these are. */ x = 0; @@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..e78f364cc192 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,6 +106,17 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +const char *get_syscall_name(int syscall) +{ + struct syscall_metadata *entry; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + return NULL; + + return entry->name; +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -186,11 +197,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, name) \ - sizeof(type) != sizeof(trace.name) ? \ +#define SYSCALL_FIELD(type, field, name) \ + sizeof(type) != sizeof(trace.field) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), \ - sizeof(trace.name), is_signed_type(type) + #type, #name, offsetof(typeof(trace), field), \ + sizeof(trace.field), is_signed_type(type) static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -261,7 +272,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; @@ -281,11 +293,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), FILTER_OTHER); return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..7915142c89e4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..f8e26ab963ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + u64 delta; + + if (!likely(tsk->mm)) + return; + + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + return; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; } /** @@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b3ace6ebbba3..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. */ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61a0264e28f9..2232ae3e3ad6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,11 +301,26 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -570,6 +585,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -832,7 +857,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number * * This function is called during schedule() when a busy worker is * going to sleep. Worker on the same cpu can be woken up by @@ -844,7 +868,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * Return: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -860,7 +884,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) + if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) return NULL; /* @@ -1298,6 +1322,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. + */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1323,7 +1380,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1464,13 +1521,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** @@ -2355,7 +2412,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", current->pid, current->comm, target_wq->name, target_func); - WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); @@ -4636,7 +4694,7 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in user context on a particular cpu + * work_on_cpu - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg @@ -5162,8 +5220,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; wq_dev->dev.release = wq_device_release; + dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..8635417c587b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void) * sched/core.c and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |