aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c110
-rw-r--r--kernel/events/core.c114
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/msi.c6
-rw-r--r--kernel/irq/proc.c19
-rw-r--r--kernel/kmod.c8
-rw-r--r--kernel/locking/lockdep.c10
-rw-r--r--kernel/memremap.c14
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/rcu/tree.c5
-rw-r--r--kernel/sched/core.c36
-rw-r--r--kernel/sched/deadline.c17
-rw-r--r--kernel/sched/fair.c9
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/sched.h5
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/trace/trace_stack.c11
-rw-r--r--kernel/workqueue.c8
21 files changed, 289 insertions, 110 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cf0f79f1fc9..2c9eae6ad970 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,7 +46,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
-#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
@@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(release_agent_path_lock);
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
-
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
@@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return cset;
}
+void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+ down_read(&tsk->signal->group_rwsem);
+}
+
+void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+ up_read(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_lock - lock threadgroup
+ * @tsk: member task of the threadgroup to lock
+ *
+ * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
+ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
+ * change ->group_leader/pid. This is useful for cases where the threadgroup
+ * needs to stay stable across blockable operations.
+ *
+ * fork and exit explicitly call threadgroup_change_{begin|end}() for
+ * synchronization. While held, no new task will be added to threadgroup
+ * and no existing live task will have its PF_EXITING set.
+ *
+ * de_thread() does threadgroup_change_{begin|end}() when a non-leader
+ * sub-thread becomes a new leader.
+ */
+static void threadgroup_lock(struct task_struct *tsk)
+{
+ down_write(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_unlock - unlock threadgroup
+ * @tsk: member task of the threadgroup to unlock
+ *
+ * Reverse threadgroup_lock().
+ */
+static inline void threadgroup_unlock(struct task_struct *tsk)
+{
+ up_write(&tsk->signal->group_rwsem);
+}
+
static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
lockdep_assert_held(&css_set_rwsem);
/*
- * We are synchronized through cgroup_threadgroup_rwsem against
- * PF_EXITING setting such that we can't race against cgroup_exit()
- * changing the css_set to init_css_set and dropping the old one.
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
*/
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
@@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* @src_cset and add it to @preloaded_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
- * This function may be called without holding cgroup_threadgroup_rwsem
- * even if the target is a process. Threads may be created and destroyed
- * but as long as cgroup_mutex is not dropped, no new css_set can be put
- * into play and the preloaded css_sets are guaranteed to cover all
- * migrations.
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
*/
static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
@@ -2240,7 +2278,7 @@ err:
* @threadgroup: whether @leader points to the whole process or a single task
*
* Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding cgroup_threadgroup_rwsem. The
+ * process, the caller must be holding threadgroup_lock of @leader. The
* caller is also responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
@@ -2368,7 +2406,7 @@ out_release_tset:
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
- * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
*/
static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *leader, bool threadgroup)
@@ -2460,13 +2498,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
if (!cgrp)
return -ENODEV;
- percpu_down_write(&cgroup_threadgroup_rwsem);
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
+ rcu_read_unlock();
ret = -ESRCH;
- goto out_unlock_rcu;
+ goto out_unlock_cgroup;
}
} else {
tsk = current;
@@ -2482,23 +2521,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
*/
if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
- goto out_unlock_rcu;
+ rcu_read_unlock();
+ goto out_unlock_cgroup;
}
get_task_struct(tsk);
rcu_read_unlock();
+ threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
ret = cgroup_procs_write_permission(tsk, cgrp, of);
if (!ret)
ret = cgroup_attach_task(cgrp, tsk, threadgroup);
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ threadgroup_unlock(tsk);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ put_task_struct(tsk);
+out_unlock_cgroup:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
@@ -2643,8 +2696,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
down_read(&css_set_rwsem);
css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2700,8 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
goto out_finish;
last_task = task;
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+ threadgroup_unlock(task);
put_task_struct(task);
if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2711,7 +2771,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
out_finish:
cgroup_migrate_finish(&preloaded_csets);
- percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -5024,7 +5083,6 @@ int __init cgroup_init(void)
unsigned long key;
int ssid, err;
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..b11756f9b6dc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
+ nr += nr_siblings;
size += sizeof(u64);
}
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
event->read_size = size;
}
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
struct perf_sample_data *data;
- u64 sample_type = event->attr.sample_type;
u16 size = 0;
- perf_event__read_size(event);
-
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
event->header_size = size;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+ __perf_event_read_size(event,
+ event->group_leader->nr_siblings);
+ __perf_event_header_size(event, event->attr.sample_type);
+}
+
static void perf_event__id_header_size(struct perf_event *event)
{
struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+static bool perf_event_validate_size(struct perf_event *event)
+{
+ /*
+ * The values computed here will be over-written when we actually
+ * attach the event.
+ */
+ __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+ __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+ perf_event__id_header_size(event);
+
+ /*
+ * Sum the lot; should not exceed the 64k limit we have on records.
+ * Conservative limit to allow for callchains and other variable fields.
+ */
+ if (event->read_size + event->header_size +
+ event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+ return false;
+
+ return true;
+}
+
static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
@@ -8297,13 +8322,35 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
gctx = group_leader->ctx;
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ } else {
+ mutex_lock(&ctx->mutex);
+ }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_locked;
+ }
+
+ /*
+ * Must be under the same ctx::mutex as perf_install_in_context(),
+ * because we need to serialize with concurrent event creation.
+ */
+ if (!exclusive_event_installable(event, ctx)) {
+ /* exclusive and group stuff are assumed mutually exclusive */
+ WARN_ON_ONCE(move_group);
+
+ err = -EBUSY;
+ goto err_locked;
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+
+ if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
perf_remove_from_context(group_leader, false);
list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8358,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_remove_from_context(sibling, false);
put_ctx(gctx);
}
- } else {
- mutex_lock(&ctx->mutex);
- }
- WARN_ON_ONCE(ctx->parent_ctx);
-
- if (move_group) {
/*
* Wait for everybody to stop referencing the events through
* the old lists, before installing it on new lists.
@@ -8349,22 +8390,29 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
- }
- if (!exclusive_event_installable(event, ctx)) {
- err = -EBUSY;
- mutex_unlock(&ctx->mutex);
- fput(event_file);
- goto err_context;
+ /*
+ * Now that all events are installed in @ctx, nothing
+ * references @gctx anymore, so drop the last reference we have
+ * on it.
+ */
+ put_ctx(gctx);
}
+ /*
+ * Precalculate sample_data sizes; do while holding ctx::mutex such
+ * that we're serialized against further additions and before
+ * perf_install_in_context() which is the point the event is active and
+ * can use these values.
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group) {
+ if (move_group)
mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- }
mutex_unlock(&ctx->mutex);
put_online_cpus();
@@ -8376,12 +8424,6 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&current->perf_event_mutex);
/*
- * Precalculate sample_data sizes
- */
- perf_event__header_size(event);
- perf_event__id_header_size(event);
-
- /*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
* of the group leader will find the pointer to itself in
@@ -8391,6 +8433,12 @@ SYSCALL_DEFINE5(perf_event_open,
fd_install(event_fd, event_file);
return event_fd;
+err_locked:
+ if (move_group)
+ mutex_unlock(&gctx->mutex);
+ mutex_unlock(&ctx->mutex);
+/* err_file: */
+ fput(event_file);
err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..2845623fb582 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->group_rwsem);
+#endif
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index de41a68fc038..e25a83b67cce 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
/**
* handle_bad_irq - handle spurious and unhandled irqs
- * @irq: the interrupt number
* @desc: description of the interrupt
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
@@ -35,6 +34,7 @@ void handle_bad_irq(struct irq_desc *desc)
kstat_incr_irqs_this_cpu(desc);
ack_bad_irq(irq);
}
+EXPORT_SYMBOL_GPL(handle_bad_irq);
/*
* Special, empty irq handler:
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7e6512b9dc1f..be9149f62eb8 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -228,11 +228,7 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
{
struct irq_chip *chip = info->chip;
- BUG_ON(!chip);
- if (!chip->irq_mask)
- chip->irq_mask = pci_msi_mask_irq;
- if (!chip->irq_unmask)
- chip->irq_unmask = pci_msi_unmask_irq;
+ BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
if (!chip->irq_set_affinity)
chip->irq_set_affinity = msi_domain_set_affinity;
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e3a8c9577ba6..a50ddc9417ff 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/mutex.h>
#include "internals.h"
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_MUTEX(register_lock);
char name [MAX_NAMELEN];
- if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+ if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
return;
+ /*
+ * irq directories are registered only when a handler is
+ * added, not when the descriptor is created, so multiple
+ * tasks might try to register at the same time.
+ */
+ mutex_lock(&register_lock);
+
+ if (desc->dir)
+ goto out_unlock;
+
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
- return;
+ goto out_unlock;
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_data("spurious", 0444, desc->dir,
&irq_spurious_proc_fops, (void *)(long)irq);
+
+out_unlock:
+ mutex_unlock(&register_lock);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index da98d0593de2..0277d1216f80 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
call_usermodehelper_exec_sync(sub_info);
} else {
pid_t pid;
-
+ /*
+ * Use CLONE_PARENT to reparent it to kthreadd; we do not
+ * want to pollute current->children, and we need a parent
+ * that always ignores SIGCHLD to ensure auto-reaping.
+ */
pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- SIGCHLD);
+ CLONE_PARENT | SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..4e49cc4c9952 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references)
+ int references, int pin_count)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = lockstat_clock();
#endif
- hlock->pin_count = 0;
+ hlock->pin_count = pin_count;
if (check && !mark_irqflags(curr, hlock))
return 0;
@@ -3343,7 +3343,7 @@ found_it:
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references))
+ hlock->references, hlock->pin_count))
return 0;
}
@@ -3433,7 +3433,7 @@ found_it:
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references))
+ hlock->references, hlock->pin_count))
return 0;
}
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
current->lockdep_recursion = 1;
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 72b0c66628b6..9d6b55587eaa 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -24,6 +24,16 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
}
#endif
+static void *try_ram_remap(resource_size_t offset, size_t size)
+{
+ struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+
+ /* In the simple case just return the existing linear address */
+ if (!PageHighMem(page))
+ return __va(offset);
+ return NULL; /* fallback to ioremap_cache */
+}
+
/**
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
@@ -66,8 +76,8 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* the requested range is potentially in "System RAM"
*/
if (is_ram == REGION_INTERSECTS)
- addr = __va(offset);
- else
+ addr = try_ram_remap(offset, size);
+ if (!addr)
addr = ioremap_cache(offset, size);
}
diff --git a/kernel/module.c b/kernel/module.c
index b86b7bf1be38..8f051a106676 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr)
if (core_kernel_text(a))
return;
- /* module_text_address is safe here: we're supposed to have reference
- * to module from symbol_get, so it can't go away. */
+ /*
+ * Even though we hold a reference on the module; we still need to
+ * disable preemption in order to safely traverse the data structure.
+ */
+ preempt_disable();
modaddr = __module_text_address(a);
BUG_ON(!modaddr);
module_put(modaddr);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9f75f25cc5d9..775d36cc0050 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
+ static struct lock_class_key rcu_exp_sched_rdp_class;
unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (rsp == &rcu_sched_state)
+ lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
+ &rcu_exp_sched_rdp_class,
+ "rcu_data_exp_sched");
}
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f9c92884817..bcd214e4b4d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p)
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Nothing relies on rq->lock after this, so its fine to
+ * drop it.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
task_rq_unlock(rq, p, &flags);
}
@@ -2517,11 +2524,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
* schedule one last time. The schedule call will never return, and
* the scheduled task must drop that reference.
- * The test for TASK_DEAD must occur while the runqueue locks are
- * still held, otherwise prev could be scheduled on another cpu, die
- * there before we look at prev->state, and then the reference would
- * be dropped twice.
- * Manfred Spraul <manfred@colorfullife.com>
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * running on another CPU and we could rave with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
*/
prev_state = prev->state;
vtime_task_switch(prev);
@@ -4934,7 +4941,15 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+ /*
+ * Its possible that init_idle() gets called multiple times on a task,
+ * in that case do_set_cpus_allowed() will not do the right thing.
+ *
+ * And since this is boot we can forgo the serialization.
+ */
+ set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4951,7 +4966,7 @@ void init_idle(struct task_struct *idle, int cpu)
rq->curr = rq->idle = idle;
idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
raw_spin_unlock(&rq->lock);
@@ -4966,7 +4981,7 @@ void init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
@@ -7230,9 +7245,6 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- /* nohz_full won't take effect without isolating the cpus. */
- tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
sched_init_numa();
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc8f01083527..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
*/
- if (has_pushable_dl_tasks(rq))
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so its safe to drop
+ * rq->lock.
+ */
+ lockdep_unpin_lock(&rq->lock);
push_dl_task(rq);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
unlock:
@@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
int target = find_later_rq(p);
if (target != -1 &&
- dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr))
+ (dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr) ||
+ (cpu_rq(target)->dl.dl_nr_running == 0)))
cpu = target;
}
rcu_read_unlock();
@@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (!dl_time_before(task->dl.deadline,
+ if (later_rq->dl.dl_nr_running &&
+ !dl_time_before(task->dl.deadline,
later_rq->dl.earliest_dl.curr)) {
/*
* Target rq has tasks of equal or earlier deadline,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e2e3483b1ec..9a5e60fe721a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2363,7 +2363,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
*/
tg_weight = atomic_long_read(&tg->load_avg);
tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq_load_avg(cfs_rq);
+ tg_weight += cfs_rq->load.weight;
return tg_weight;
}
@@ -2373,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq_load_avg(cfs_rq);
+ load = cfs_rq->load.weight;
shares = (tg->shares * load);
if (tg_weight)
@@ -2664,13 +2664,14 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- int decayed;
struct sched_avg *sa = &cfs_rq->avg;
+ int decayed, removed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ removed = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -2688,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- return decayed;
+ return decayed || removed;
}
/* Update task and its cfs_rq load average */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f177c73ae19..4a2ef5a02fd3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
+ stop_critical_timings();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68cda117574c..6d2a119c7ad9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
+ *
+ * Pairs with the control dependency and rmb in try_to_wake_up().
*/
- smp_wmb();
- prev->on_cpu = 0;
+ smp_store_release(&prev->on_cpu, 0);
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 272d9322bc5d..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
- void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
- __wake_up_common(q, mode, nr, 0, key);
+ __wake_up_common(q, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, 1, key);
+ __wake_up_locked_key(q, mode, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 841b72f720e8..3a38775b50c2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
continue;
/* Check the deviation from the watchdog clocksource. */
- if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
+ if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
cs->name);
pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3739ac6aa473..44d2cc0436f4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1251,7 +1251,7 @@ void __init timekeeping_init(void)
set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);
- timekeeping_update(tk, TK_MIRROR);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b746399ab59c..8abf1ba18085 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -85,9 +85,19 @@ check_stack(unsigned long ip, unsigned long *stack)
if (!object_is_on_stack(stack))
return;
+ /* Can't do this from NMI context (can cause deadlocks) */
+ if (in_nmi())
+ return;
+
local_irq_save(flags);
arch_spin_lock(&max_stack_lock);
+ /*
+ * RCU may not be watching, make it see us.
+ * The stack trace code uses rcu_sched.
+ */
+ rcu_irq_enter();
+
/* In case another CPU set the tracer_frame on us */
if (unlikely(!frame_size))
this_size -= tracer_frame;
@@ -169,6 +179,7 @@ check_stack(unsigned long ip, unsigned long *stack)
}
out:
+ rcu_irq_exit();
arch_spin_unlock(&max_stack_lock);
local_irq_restore(flags);
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca71582fcfab..bcb14cafe007 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
timer_stats_timer_set_start_info(&dwork->timer);
dwork->wq = wq;
+ /* timer isn't guaranteed to run in this cpu, record earlier */
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
dwork->cpu = cpu;
timer->expires = jiffies + delay;
- if (unlikely(cpu != WORK_CPU_UNBOUND))
- add_timer_on(timer, cpu);
- else
- add_timer(timer);
+ add_timer_on(timer, cpu);
}
/**