diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/arraymap.c | 3 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 99 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 23 | ||||
-rw-r--r-- | kernel/compat.c | 1 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 7 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 7 | ||||
-rw-r--r-- | kernel/kthread.c | 50 | ||||
-rw-r--r-- | kernel/module.c | 5 | ||||
-rw-r--r-- | kernel/sched/autogroup.c | 7 | ||||
-rw-r--r-- | kernel/sched/core.c | 56 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 16 | ||||
-rw-r--r-- | kernel/sched/fair.c | 59 | ||||
-rw-r--r-- | kernel/signal.c | 17 | ||||
-rw-r--r-- | kernel/stop_machine.c | 19 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 63 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 3 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 12 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 35 | ||||
-rw-r--r-- | kernel/tracepoint.c | 4 |
21 files changed, 272 insertions, 220 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 14750e7c5ee4..027107f4be53 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -476,7 +476,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -495,6 +495,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, + .map_release_uref = bpf_fd_array_map_clear, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a3b21385e947..098eca568c2b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -43,6 +43,7 @@ #include <net/tcp.h> #include <linux/ptr_ring.h> #include <net/inet_common.h> +#include <linux/sched/signal.h> #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -325,6 +326,9 @@ retry: if (ret > 0) { if (apply) apply_bytes -= ret; + + sg->offset += ret; + sg->length -= ret; size -= ret; offset += ret; if (uncharge) @@ -332,8 +336,6 @@ retry: goto retry; } - sg->length = size; - sg->offset = offset; return ret; } @@ -391,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) } while (i != md->sg_end); } -static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +static void free_bytes_sg(struct sock *sk, int bytes, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = md->sg_start, free; @@ -401,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (bytes < free) { sg[i].length -= bytes; sg[i].offset += bytes; - sk_mem_uncharge(sk, bytes); + if (charge) + sk_mem_uncharge(sk, bytes); break; } - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); put_page(sg_page(&sg[i])); bytes -= sg[i].length; sg[i].length = 0; @@ -416,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + md->sg_start = i; } static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) @@ -523,8 +529,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, i = md->sg_start; do { - r->sg_data[i] = md->sg_data[i]; - size = (apply && apply_bytes < md->sg_data[i].length) ? apply_bytes : md->sg_data[i].length; @@ -535,6 +539,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, } sk_mem_charge(sk, size); + r->sg_data[i] = md->sg_data[i]; r->sg_data[i].length = size; md->sg_data[i].length -= size; md->sg_data[i].offset += size; @@ -575,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) { + bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; struct scatterlist *sg; - int i, err, free = 0; - bool ingress = !!(md->flags & BPF_F_INGRESS); + int err = 0; sg = md->sg_data; @@ -606,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, out_rcu: rcu_read_unlock(); out: - i = md->sg_start; - while (sg[i].length) { - free += sg[i].length; - put_page(sg_page(&sg[i])); - sg[i].length = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } - return free; + free_bytes_sg(NULL, send, md, false); + return err; } static inline void bpf_md_init(struct smap_psock *psock) @@ -700,19 +697,26 @@ more_data: err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); lock_sock(sk); + if (unlikely(err < 0)) { + free_start_sg(sk, m); + psock->sg_size = 0; + if (!cork) + *copied -= send; + } else { + psock->sg_size -= send; + } + if (cork) { free_start_sg(sk, m); + psock->sg_size = 0; kfree(m); m = NULL; + err = 0; } - if (unlikely(err)) - *copied -= err; - else - psock->sg_size -= send; break; case __SK_DROP: default: - free_bytes_sg(sk, send, m); + free_bytes_sg(sk, send, m, true); apply_bytes_dec(psock, send); *copied -= send; psock->sg_size -= send; @@ -732,6 +736,26 @@ out_err: return err; } +static int bpf_wait_data(struct sock *sk, + struct smap_psock *psk, int flags, + long timeo, int *err) +{ + int rc; + + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, &timeo, + !list_empty(&psk->ingress) || + !skb_queue_empty(&sk->sk_receive_queue), + &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + return rc; +} + static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -755,6 +779,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); +bytes_ready: while (copied != len) { struct scatterlist *sg; struct sk_msg_buff *md; @@ -809,6 +834,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + if (!copied) { + long timeo; + int data; + int err = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + data = bpf_wait_data(sk, psock, flags, timeo, &err); + + if (data) { + if (!skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + smap_release_sock(psock, sk); + copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + return copied; + } + goto bytes_ready; + } + + if (err) + copied = err; + } + release_sock(sk); smap_release_sock(psock, sk); return copied; @@ -1831,7 +1878,7 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } -static void sock_map_release(struct bpf_map *map, struct file *map_file) +static void sock_map_release(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; @@ -1855,7 +1902,7 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_release = sock_map_release, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ca46df19c9a..016ef9025827 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -26,6 +26,7 @@ #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> +#include <linux/nospec.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -102,12 +103,14 @@ const struct bpf_map_ops bpf_map_offload_ops = { static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; + u32 type = attr->map_type; struct bpf_map *map; int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + if (type >= ARRAY_SIZE(bpf_map_types)) return ERR_PTR(-EINVAL); - ops = bpf_map_types[attr->map_type]; + type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[type]; if (!ops) return ERR_PTR(-EINVAL); @@ -122,7 +125,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (IS_ERR(map)) return map; map->ops = ops; - map->map_type = attr->map_type; + map->map_type = type; return map; } @@ -257,8 +260,8 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { if (atomic_dec_and_test(&map->usercnt)) { - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - bpf_fd_array_map_clear(map); + if (map->ops->map_release_uref) + map->ops->map_release_uref(map); } } @@ -871,11 +874,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + const struct bpf_prog_ops *ops; + + if (type >= ARRAY_SIZE(bpf_prog_types)) + return -EINVAL; + type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); + ops = bpf_prog_types[type]; + if (!ops) return -EINVAL; if (!bpf_prog_is_dev_bound(prog->aux)) - prog->aux->ops = bpf_prog_types[type]; + prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; diff --git a/kernel/compat.c b/kernel/compat.c index 6d21894806b4..92d8c98c0f57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -34,6 +34,7 @@ int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) { struct compat_timex tx32; + memset(txc, 0, sizeof(struct timex)); if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) return -EFAULT; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6c6b3c48db71..1d8ca9ea9979 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/circ_buf.h> #include <linux/poll.h> +#include <linux/nospec.h> #include "internal.h" @@ -867,8 +868,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) return NULL; /* AUX space */ - if (pgoff >= rb->aux_pgoff) - return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + if (pgoff >= rb->aux_pgoff) { + int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); + return virt_to_page(rb->aux_pages[aux_pgoff]); + } } return __perf_mmap_to_page(rb, pgoff); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce6848e46e94..1725b902983f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (!uprobe) return NULL; - uprobe->inode = igrab(inode); + uprobe->inode = inode; uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (cur_uprobe) { kfree(uprobe); uprobe = cur_uprobe; - iput(inode); } return uprobe; @@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe) rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - iput(uprobe->inode); put_uprobe(uprobe); } @@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. + * unregisters. Caller of uprobe_register() is required to keep @inode + * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) diff --git a/kernel/kthread.c b/kernel/kthread.c index cd50e99202b0..2017a39ab490 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -55,7 +55,6 @@ enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, - KTHREAD_IS_PARKED, }; static inline void set_kthread_struct(void *kthread) @@ -177,14 +176,12 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_PARKED); - while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { - if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) - complete(&self->parked); + for (;;) { + set_current_state(TASK_PARKED); + if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) + break; schedule(); - __set_current_state(TASK_PARKED); } - clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); } @@ -194,6 +191,11 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); +void kthread_park_complete(struct task_struct *k) +{ + complete(&to_kthread(k)->parked); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -450,22 +452,15 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. + * Newly created kthread was parked when the CPU was offline. + * The binding was lost and we need to set it again. */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - /* - * Newly created kthread was parked when the CPU was offline. - * The binding was lost and we need to set it again. - */ - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu, TASK_PARKED); - wake_up_state(k, TASK_PARKED); - } + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -488,12 +483,13 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) + return -EBUSY; + + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } return 0; diff --git a/kernel/module.c b/kernel/module.c index ce8066b88178..c9bea7f2b43e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3517,6 +3517,11 @@ static noinline int do_init_module(struct module *mod) * walking this with preempt disabled. In all the failure paths, we * call synchronize_sched(), but we don't want to slow down the success * path, so use actual RCU here. + * Note that module_alloc() on most architectures creates W+X page + * mappings which won't be cleaned up until do_free_init() runs. Any + * code such as mark_rodata_ro() which depends on those mappings to + * be cleaned up needs to sync with the queued work - ie + * rcu_barrier_sched() */ call_rcu_sched(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 6be6c575b6cd..2d4ff5353ded 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -2,6 +2,7 @@ /* * Auto-group scheduling implementation: */ +#include <linux/nospec.h> #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; @@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; unsigned long shares; - int err; + int err, idx; if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; @@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); - shares = scale_load(sched_prio_to_weight[nice + 20]); + + idx = array_index_nospec(nice + 20, 40); + shares = scale_load(sched_prio_to_weight[idx]); down_write(&ag->lock); err = sched_group_set_shares(ag->tg, shares); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e10aaeebfcc..092f7c4de903 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,9 @@ */ #include "sched.h" +#include <linux/kthread.h> +#include <linux/nospec.h> + #include <asm/switch_to.h> #include <asm/tlb.h> @@ -2718,20 +2721,28 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state == TASK_DEAD)) { - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { + switch (prev_state) { + case TASK_DEAD: + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); - /* Task is done with its stack. */ - put_task_stack(prev); + put_task_struct(prev); + break; - put_task_struct(prev); + case TASK_PARKED: + kthread_park_complete(prev); + break; + } } tick_nohz_task_switch(); @@ -3498,23 +3509,8 @@ static void __sched notrace __schedule(bool preempt) void __noreturn do_task_dead(void) { - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - raw_spin_lock_irq(¤t->pi_lock); - raw_spin_unlock_irq(¤t->pi_lock); - /* Causes final put_task_struct in finish_task_switch(): */ - __set_current_state(TASK_DEAD); + set_special_state(TASK_DEAD); /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; @@ -6928,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; + int idx; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; - weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); } #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d2c6083304b4..e13df951aca7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -305,7 +305,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq) { + if (busy && next_f < sg_policy->next_freq && + sg_policy->next_freq != UINT_MAX) { next_f = sg_policy->next_freq; /* Reset cached freq as next_freq has changed */ @@ -396,19 +397,6 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - /* - * For RT tasks, the schedutil governor shoots the frequency to maximum. - * Special care must be taken to ensure that this kthread doesn't result - * in the same behavior. - * - * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() function and before that - * the schedutil governor rejects all other frequency scaling requests. - * - * There is a very rare case though, where the RT thread yields right - * after the work_in_progress flag is cleared. The effects of that are - * neglected for now. - */ kthread_queue_work(&sg_policy->worker, &sg_policy->work); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54dc31e7ab9b..79f574dba096 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1854,7 +1854,6 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; - unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1862,18 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - numa_migrate_retry = jiffies + interval; - - /* - * Check that the new retry threshold is after the current one. If - * the retry is in the future, it implies that wake_affine has - * temporarily asked NUMA balancing to backoff from placement. - */ - if (numa_migrate_retry > p->numa_migrate_retry) - return; - - /* Safe to try placing the task on the preferred node */ - p->numa_migrate_retry = numa_migrate_retry; + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -5922,48 +5910,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; } -#ifdef CONFIG_NUMA_BALANCING -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ - unsigned long interval; - - if (!static_branch_likely(&sched_numa_balancing)) - return; - - /* If balancing has no preference then continue gathering data */ - if (p->numa_preferred_nid == -1) - return; - - /* - * If the wakeup is not affecting locality then it is neutral from - * the perspective of NUMA balacing so continue gathering data. - */ - if (cpu_to_node(prev_cpu) == cpu_to_node(target)) - return; - - /* - * Temporarily prevent NUMA balancing trying to place waker/wakee after - * wakee has been moved by wake_affine. This will potentially allow - * related tasks to converge and update their data placement. The - * 4 * numa_scan_period is to allow the two-pass filter to migrate - * hot data to the wakers node. - */ - interval = max(sysctl_numa_balancing_scan_delay, - p->numa_scan_period << 2); - p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); - - interval = max(sysctl_numa_balancing_scan_delay, - current->numa_scan_period << 2); - current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); -} -#else -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ -} -#endif - static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5979,7 +5925,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (target == nr_cpumask_bits) return prev_cpu; - update_wa_numa_placement(p, prev_cpu, target); schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; @@ -9847,6 +9792,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; +out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, @@ -9855,7 +9801,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; -out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; diff --git a/kernel/signal.c b/kernel/signal.c index d4ccea599692..9c33163a6165 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1961,14 +1961,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) return; } + set_special_state(TASK_TRACED); + /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it. + * + * TRACER TRACEE + * + * ptrace_attach() + * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) + * do_wait() + * set_current_state() smp_wmb(); + * ptrace_do_wait() + * wait_task_stopped() + * task_stopped_code() + * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ - set_current_state(TASK_TRACED); + smp_wmb(); current->last_siginfo = info; current->exit_code = exit_code; @@ -2176,7 +2189,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; - __set_current_state(TASK_STOPPED); + set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7591261652d..64c0291b579c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@ #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/nmi.h> +#include <linux/sched/wake_q.h> /* * Structure to determine completion condition and record errors. May @@ -65,27 +66,31 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work) + struct cpu_stop_work *work, + struct wake_q_head *wakeq) { list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); + wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work); + __cpu_stop_queue_work(stopper, work, &wakeq); else if (work->done) cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); + wake_up_q(&wakeq); + return enabled; } @@ -229,6 +234,7 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + DEFINE_WAKE_Q(wakeq); int err; retry: spin_lock_irq(&stopper1->lock); @@ -252,8 +258,8 @@ retry: goto unlock; err = 0; - __cpu_stop_queue_work(stopper1, work1); - __cpu_stop_queue_work(stopper2, work2); + __cpu_stop_queue_work(stopper1, work1, &wakeq); + __cpu_stop_queue_work(stopper2, work2, &wakeq); unlock: spin_unlock(&stopper2->lock); spin_unlock_irq(&stopper1->lock); @@ -263,6 +269,9 @@ unlock: cpu_relax(); goto retry; } + + wake_up_q(&wakeq); + return err; } /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0e974cface0b..84f37420fcf5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -119,6 +119,16 @@ static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; +static void inline clocksource_watchdog_lock(unsigned long *flags) +{ + spin_lock_irqsave(&watchdog_lock, *flags); +} + +static void inline clocksource_watchdog_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&watchdog_lock, *flags); +} + static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); @@ -142,9 +152,19 @@ static void __clocksource_unstable(struct clocksource *cs) cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + /* + * If the clocksource is registered clocksource_watchdog_kthread() will + * re-rate and re-select. + */ + if (list_empty(&cs->list)) { + cs->rating = 0; + return; + } + if (cs->mark_unstable) cs->mark_unstable(cs); + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(&watchdog_work); } @@ -153,10 +173,8 @@ static void __clocksource_unstable(struct clocksource *cs) * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable * - * This function is called instead of clocksource_change_rating from - * cpu hotplug code to avoid a deadlock between the clocksource mutex - * and the cpu hotplug mutex. It defers the update of the clocksource - * to the watchdog thread. + * This function is called by the x86 TSC code to mark clocksources as unstable; + * it defers demotion and re-selection to a kthread. */ void clocksource_mark_unstable(struct clocksource *cs) { @@ -164,7 +182,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_lock_irqsave(&watchdog_lock, flags); if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { - if (list_empty(&cs->wd_list)) + if (!list_empty(&cs->list) && list_empty(&cs->wd_list)) list_add(&cs->wd_list, &watchdog_list); __clocksource_unstable(cs); } @@ -319,9 +337,8 @@ static void clocksource_resume_watchdog(void) static void clocksource_enqueue_watchdog(struct clocksource *cs) { - unsigned long flags; + INIT_LIST_HEAD(&cs->wd_list); - spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); @@ -331,7 +348,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } - spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_select_watchdog(bool fallback) @@ -373,9 +389,6 @@ static void clocksource_select_watchdog(bool fallback) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); if (cs != watchdog) { if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a watched clocksource. */ @@ -384,21 +397,19 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) clocksource_stop_watchdog(); } } - spin_unlock_irqrestore(&watchdog_lock, flags); } static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; - LIST_HEAD(unstable); int select = 0; spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - list_add(&cs->wd_list, &unstable); + __clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -410,11 +421,6 @@ static int __clocksource_watchdog_kthread(void) clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); - /* Needs to be done outside of watchdog lock */ - list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { - list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); - } return select; } @@ -447,6 +453,9 @@ static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } void clocksource_mark_unstable(struct clocksource *cs) { } +static void inline clocksource_watchdog_lock(unsigned long *flags) { } +static void inline clocksource_watchdog_unlock(unsigned long *flags) { } + #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** @@ -779,14 +788,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); */ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { + unsigned long flags; /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); + + clocksource_watchdog_lock(&flags); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); + clocksource_watchdog_unlock(&flags); + clocksource_select(); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); @@ -808,8 +822,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) */ void clocksource_change_rating(struct clocksource *cs, int rating) { + unsigned long flags; + mutex_lock(&clocksource_mutex); + clocksource_watchdog_lock(&flags); __clocksource_change_rating(cs, rating); + clocksource_watchdog_unlock(&flags); + clocksource_select(); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); @@ -821,6 +840,8 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { + unsigned long flags; + if (clocksource_is_watchdog(cs)) { /* Select and try to install a replacement watchdog. */ clocksource_select_watchdog(true); @@ -834,8 +855,12 @@ static int clocksource_unbind(struct clocksource *cs) if (curr_clocksource == cs) return -EBUSY; } + + clocksource_watchdog_lock(&flags); clocksource_dequeue_watchdog(cs); list_del_init(&cs->list); + clocksource_watchdog_unlock(&flags); + return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 16bbf062018f..8d83bcf9ef69 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5514,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0644, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0444, d_tracer, + trace_create_file("set_graph_notrace", 0644, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1f951b3df60c..7d306b74230f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -762,6 +762,9 @@ static int regex_match_full(char *str, struct regex *r, int len) static int regex_match_front(char *str, struct regex *r, int len) { + if (len < r->len) + return 0; + if (strncmp(str, r->pattern, r->len) == 0) return 1; return 0; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0d7b3ffbecc2..b9061ed59bbd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2466,6 +2466,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { + hist_err("Invalid field modifier: ", modifier); field = ERR_PTR(-EINVAL); goto out; } @@ -2481,6 +2482,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { + hist_err("Couldn't find field: ", field_name); field = ERR_PTR(-EINVAL); goto out; } @@ -4913,6 +4915,16 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); + + if (hist_field->flags) { + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && + !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + const char *flags = get_hist_field_flags(hist_field); + + if (flags) + seq_printf(m, ".%s", flags); + } + } } static int event_hist_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3c7bfc4bf5e9..4237eba4ef20 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -472,7 +472,7 @@ static __init int stack_trace_init(void) NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0444, d_tracer, + trace_create_file("stack_trace_filter", 0644, d_tracer, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34fd0e0ec51d..ac892878dbe6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -55,6 +55,7 @@ struct trace_uprobe { struct list_head list; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; + struct path path; struct inode *inode; char *filename; unsigned long offset; @@ -289,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) for (i = 0; i < tu->tp.nr_args; i++) traceprobe_free_probe_arg(&tu->tp.args[i]); - iput(tu->inode); + path_put(&tu->path); kfree(tu->tp.call.class->system); kfree(tu->tp.call.name); kfree(tu->filename); @@ -363,7 +364,6 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - struct inode *inode; char *arg, *event, *group, *filename; char buf[MAX_EVENT_NAME_LEN]; struct path path; @@ -371,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv) bool is_delete, is_return; int i, ret; - inode = NULL; ret = 0; is_delete = false; is_return = false; @@ -437,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv) } /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(argv[1], ':'); - if (!arg) { - ret = -EINVAL; - goto fail_address_parse; - } + if (!arg) + return -EINVAL; *arg++ = '\0'; filename = argv[1]; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) - goto fail_address_parse; - - inode = igrab(d_real_inode(path.dentry)); - path_put(&path); + return ret; - if (!inode || !S_ISREG(inode->i_mode)) { + if (!d_is_reg(path.dentry)) { ret = -EINVAL; goto fail_address_parse; } @@ -490,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); if (!tu->filename) { @@ -558,7 +552,7 @@ error: return ret; fail_address_parse: - iput(inode); + path_put(&path); pr_info("Failed to parse address or file.\n"); @@ -922,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, goto err_flags; tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) goto err_buffer; @@ -967,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); @@ -1337,7 +1333,6 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) { struct trace_uprobe *tu; - struct inode *inode; struct path path; int ret; @@ -1345,11 +1340,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (ret) return ERR_PTR(ret); - inode = igrab(d_inode(path.dentry)); - path_put(&path); - - if (!inode || !S_ISREG(inode->i_mode)) { - iput(inode); + if (!d_is_reg(path.dentry)) { + path_put(&path); return ERR_PTR(-EINVAL); } @@ -1364,11 +1356,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + path_put(&path); return ERR_CAST(tu); } tu->offset = offs; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 671b13457387..1e37da2e0c25 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } |