Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/btf.c                 |  20
-rw-r--r--  kernel/bpf/syscall.c             |   5
-rw-r--r--  kernel/bpf/verifier.c            |  10
-rw-r--r--  kernel/cgroup/cgroup.c           |  20
-rw-r--r--  kernel/cgroup/legacy_freezer.c   |   8
-rw-r--r--  kernel/cpu.c                     | 402
-rw-r--r--  kernel/irq/chip.c                |  17
-rw-r--r--  kernel/irq/debugfs.c             |   2
-rw-r--r--  kernel/irq/internals.h           |  13
-rw-r--r--  kernel/irq/irqdesc.c             |  77
-rw-r--r--  kernel/irq/irqdomain.c           |   2
-rw-r--r--  kernel/irq/resend.c              |  47
-rw-r--r--  kernel/power/hibernate.c         | 179
-rw-r--r--  kernel/power/power.h             |  10
-rw-r--r--  kernel/power/swap.c              |  30
-rw-r--r--  kernel/smp.c                     |  43
-rw-r--r--  kernel/smpboot.c                 | 163
-rw-r--r--  kernel/softirq.c                 |  22
-rw-r--r--  kernel/time/alarmtimer.c         |   4
-rw-r--r--  kernel/time/hrtimer.c            |   3
-rw-r--r--  kernel/time/posix-timers.c       | 525
-rw-r--r--  kernel/time/tick-common.c        |  13
-rw-r--r--  kernel/time/tick-sched.c         |  15
-rw-r--r--  kernel/trace/trace.c             |   2
-rw-r--r--  kernel/trace/trace_events_user.c | 290
-rw-r--r--  kernel/trace/trace_output.c      |   2
-rw-r--r--  kernel/watch_queue.c             |  12
-rw-r--r--  kernel/workqueue.c               |  13
28 files changed, 1231 insertions, 718 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6b682b8e4b50..72b32b7cd9cd 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -744,13 +744,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
return offset < btf->hdr.str_len;
}
-static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
+static bool __btf_name_char_ok(char c, bool first)
{
if ((first ? !isalpha(c) :
!isalnum(c)) &&
c != '_' &&
- ((c == '.' && !dot_ok) ||
- c != '.'))
+ c != '.')
return false;
return true;
}
@@ -767,20 +766,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
+static bool __btf_name_valid(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
- if (!__btf_name_char_ok(*src, true, dot_ok))
+ if (!__btf_name_char_ok(*src, true))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
- if (!__btf_name_char_ok(*src, false, dot_ok))
+ if (!__btf_name_char_ok(*src, false))
return false;
src++;
}
@@ -788,17 +787,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
return !*src;
}
-/* Only C-style identifier is permitted. This can be relaxed if
- * necessary.
- */
static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, false);
+ return __btf_name_valid(btf, offset);
}
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
- return __btf_name_valid(btf, offset, true);
+ return __btf_name_valid(btf, offset);
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@ -4422,7 +4418,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env,
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off, true)) {
+ !__btf_name_valid(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
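The simplification above means '.' is now accepted in BTF identifiers as well as section names, including as the first character. A minimal userspace sketch of the resulting rule (name_char_ok(), name_valid() and the KSYM_NAME_LEN value are local stand-ins, not the kernel code):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

#define KSYM_NAME_LEN 512	/* stand-in for the kernel constant */

static bool name_char_ok(char c, bool first)
{
	/* Post-patch rule: '.' is acceptable everywhere. */
	if ((first ? !isalpha((unsigned char)c) : !isalnum((unsigned char)c)) &&
	    c != '_' && c != '.')
		return false;
	return true;
}

static bool name_valid(const char *src)
{
	const char *limit = src + KSYM_NAME_LEN;

	if (!name_char_ok(*src, true))
		return false;
	for (src++; *src && src < limit; src++)
		if (!name_char_ok(*src, false))
			return false;
	return !*src;
}

int main(void)
{
	printf("%d\n", name_valid(".rodata.cst32"));	/* 1: dots allowed */
	printf("%d\n", name_valid("0name"));		/* 0: leading digit */
	return 0;
}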
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0c21d0d8efe4..f1c8733f76b8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3440,6 +3440,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ return 0;
default:
return 0;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5871aa78d01a..cf5f230360f5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3868,6 +3868,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
}
save_register_state(state, spi, reg, size);
+ /* Break the relation on a narrowing spill. */
+ if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
+ state->stack[spi].spilled_ptr.id = 0;
} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
insn->imm != 0 && env->bpf_capable) {
struct bpf_reg_state fake_reg = {};
@@ -17214,9 +17217,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
/* finally lock prog and jit images for all functions and
- * populate kallsysm
+ * populate kallsyms. Begin at the first subprogram, since
+ * bpf_prog_load will add the kallsyms for the main program.
*/
- for (i = 0; i < env->subprog_cnt; i++) {
+ for (i = 1; i < env->subprog_cnt; i++) {
bpf_prog_lock_ro(func[i]);
bpf_prog_kallsyms_add(func[i]);
}
@@ -17242,6 +17246,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
prog->jited_len = func[0]->jited_len;
+ prog->aux->extable = func[0]->aux->extable;
+ prog->aux->num_exentries = func[0]->aux->num_exentries;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
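The narrowing-spill hunk is easier to see with concrete numbers: if a register's 64-bit bound needs more bits than the spill slot provides, the stored copy is truncated and must not stay ID-linked to the source register. A small sketch of the check (fls64() here is a local stand-in mirroring the kernel helper's semantics):

#include <stdint.h>
#include <stdio.h>

static int fls64(uint64_t x)	/* stand-in: index of highest set bit, 1-based */
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
	uint64_t umax = 0x1ffff;	/* bound needs 17 bits */
	int size = 2;			/* 2-byte spill holds only 16 */

	if (fls64(umax) > 8 * size)
		puts("narrowing spill: break the scalar ID relation");
	return 0;
}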
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 245cf62ce85a..4d42f0cbc11e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- int ssid, i, ret;
+ int ssid, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- struct css_set *cset;
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
WARN_ON(!css || cgroup_css(dcgrp, ss));
@@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
+ /*
+ * All css_sets of scgrp move to dcgrp in the same order, so
+ * patch in-flight iterators to preserve correct iteration.
+ * Since an iterator is always advanced right away and finishes
+ * when it->cset_pos meets it->cset_head, updating it->cset_head
+ * is enough here.
+ */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
+ }
spin_unlock_irq(&css_set_lock);
if (ss->css_rstat_flush) {
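The subtle part of the rebind hunk is the iterator patch-up: a css_task_iter terminates by pointer comparison against its remembered list head, so once every entry has migrated to a new head the old sentinel can never be reached again unless it is rewritten. A tiny userspace model of that property, with a hand-rolled circular list rather than <linux/list.h>:

#include <stdio.h>

struct node { struct node *next, *prev; };

static void list_init(struct node *h)
{
	h->next = h->prev = h;
}

static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_move_tail(struct node *n, struct node *h)
{
	n->prev->next = n->next;	/* unlink from the old list */
	n->next->prev = n->prev;
	list_add_tail(n, h);		/* append behind the new head */
}

int main(void)
{
	struct node src, dst, a;
	struct node *cset_head = &src;	/* iterator stops when pos == cset_head */

	list_init(&src);
	list_init(&dst);
	list_add_tail(&a, &src);

	list_move_tail(&a, &dst);	/* "rebind": entry now lives on dst */

	/* Without this the iterator would chase &src forever: */
	if (cset_head == &src)
		cset_head = &dst;

	printf("terminates at dst head: %d\n", cset_head == &dst);
	return 0;
}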
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 936473203a6b..122dacb3a443 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- static_branch_inc(&freezer_active);
+ static_branch_inc_cpuslocked(&freezer_active);
}
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- static_branch_dec(&freezer_active);
+ static_branch_dec_cpuslocked(&freezer_active);
freezer->state = 0;
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static void freezer_css_free(struct cgroup_subsys_state *css)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f4a2c5845bcb..88a7ede322bd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
@@ -59,6 +60,7 @@
* @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
+ * @ap_sync_state: State for AP synchronization
* @done_up: Signal completion to the issuer of the task for cpu-up
* @done_down: Signal completion to the issuer of the task for cpu-down
*/
@@ -76,6 +78,7 @@ struct cpuhp_cpu_state {
struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
+ atomic_t ap_sync_state;
struct completion done_up;
struct completion done_down;
#endif
@@ -276,6 +279,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state)
return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
}
+/* Synchronization state management */
+enum cpuhp_sync_state {
+ SYNC_STATE_DEAD,
+ SYNC_STATE_KICKED,
+ SYNC_STATE_SHOULD_DIE,
+ SYNC_STATE_ALIVE,
+ SYNC_STATE_SHOULD_ONLINE,
+ SYNC_STATE_ONLINE,
+};
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC
+/**
+ * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
+ * @state: The synchronization state to set
+ *
+ * No synchronization point. Just update of the synchronization state, but implies
+ * a full barrier so that the AP changes are visible before the control CPU proceeds.
+ */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ (void)atomic_xchg(st, state);
+}
+
+void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }
+
+static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
+ enum cpuhp_sync_state next_state)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ ktime_t now, end, start = ktime_get();
+ int sync;
+
+ end = start + 10ULL * NSEC_PER_SEC;
+
+ sync = atomic_read(st);
+ while (1) {
+ if (sync == state) {
+ if (!atomic_try_cmpxchg(st, &sync, next_state))
+ continue;
+ return true;
+ }
+
+ now = ktime_get();
+ if (now > end) {
+ /* Timeout. Leave the state unchanged */
+ return false;
+ } else if (now - start < NSEC_PER_MSEC) {
+ /* Poll for one millisecond */
+ arch_cpuhp_sync_state_poll();
+ } else {
+ usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
+ }
+ sync = atomic_read(st);
+ }
+ return true;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
+/**
+ * cpuhp_ap_report_dead - Update synchronization state to DEAD
+ *
+ * No synchronization point. Just update of the synchronization state.
+ */
+void cpuhp_ap_report_dead(void)
+{
+ cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
+}
+
+void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }
+
+/*
+ * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
+ * because the AP cannot issue complete() at this stage.
+ */
+static void cpuhp_bp_sync_dead(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+ do {
+ /* CPU can have reported dead already. Don't overwrite that! */
+ if (sync == SYNC_STATE_DEAD)
+ break;
+ } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));
+
+ if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
+ /* CPU reached dead state. Invoke the cleanup function */
+ arch_cpuhp_cleanup_dead_cpu(cpu);
+ return;
+ }
+
+ /* No further action possible. Emit message and give up. */
+ pr_err("CPU%u failed to report dead state\n", cpu);
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
+/**
+ * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
+ *
+ * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
+ * for the BP to release it.
+ */
+void cpuhp_ap_sync_alive(void)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);
+
+ /* Wait for the control CPU to release it. */
+ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
+ cpu_relax();
+}
+
+static bool cpuhp_can_boot_ap(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+again:
+ switch (sync) {
+ case SYNC_STATE_DEAD:
+ /* CPU is properly dead */
+ break;
+ case SYNC_STATE_KICKED:
+ /* CPU did not come up in previous attempt */
+ break;
+ case SYNC_STATE_ALIVE:
+ /* CPU is stuck in cpuhp_ap_sync_alive(). */
+ break;
+ default:
+ /* CPU failed to report online or dead and is in limbo state. */
+ return false;
+ }
+
+ /* Prepare for booting */
+ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
+ goto again;
+
+ return true;
+}
+
+void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }
+
+/*
+ * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
+ * because the AP cannot issue complete() so early in the bringup.
+ */
+static int cpuhp_bp_sync_alive(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
+ return 0;
+
+ if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
+ pr_err("CPU%u failed to report alive state\n", cpu);
+ ret = -EIO;
+ }
+
+ /* Let the architecture cleanup the kick alive mechanics. */
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
+static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
+static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */
+
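To see the shape of the BP/AP handshake in isolation, here is a single-threaded userspace model of cpuhp_wait_for_sync_state(), using plain C11 atomics and omitting the timeout and poll/sleep backoff of the real code:

#include <stdatomic.h>
#include <stdio.h>

enum { DEAD, KICKED, SHOULD_DIE, ALIVE, SHOULD_ONLINE, ONLINE };

static _Atomic int sync_state = DEAD;

/* BP side: wait until the AP publishes 'want', then advance to 'next'. */
static void wait_and_advance(int want, int next)
{
	int cur = atomic_load(&sync_state);

	for (;;) {
		if (cur == want &&
		    atomic_compare_exchange_weak(&sync_state, &cur, next))
			return;
		cur = atomic_load(&sync_state);	/* kernel polls, then sleeps */
	}
}

int main(void)
{
	atomic_store(&sync_state, ALIVE);	/* AP: cpuhp_ap_sync_alive() */
	wait_and_advance(ALIVE, SHOULD_ONLINE);	/* BP: cpuhp_bp_sync_alive() */
	printf("state now %d (SHOULD_ONLINE=%d)\n",
	       (int)atomic_load(&sync_state), SHOULD_ONLINE);
	return 0;
}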
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -470,8 +649,23 @@ bool cpu_smt_possible(void)
cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
+
+static inline bool cpuhp_smt_aware(void)
+{
+ return topology_smt_supported();
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_primary_thread_mask;
+}
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpuhp_smt_aware(void) { return false; }
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_present_mask;
+}
#endif
static inline enum cpuhp_state
@@ -558,7 +752,7 @@ static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
return ret;
}
-static int bringup_wait_for_ap(unsigned int cpu)
+static int bringup_wait_for_ap_online(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -579,38 +773,94 @@ static int bringup_wait_for_ap(unsigned int cpu)
*/
if (!cpu_smt_allowed(cpu))
return -ECANCELED;
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+static int cpuhp_kick_ap_alive(unsigned int cpu)
+{
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
+ return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));
+}
+
+static int cpuhp_bringup_ap(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
+
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
return cpuhp_kick_ap(cpu, st, st->target);
-}
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
+}
+#else
static int bringup_cpu(unsigned int cpu)
{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle = idle_thread_get(cpu);
int ret;
- /*
- * Reset stale stack state from the last time this CPU was online.
- */
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
/*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
- * Prevent irq alloc/free across the bringup.
+ *
+ * Prevent irq alloc/free across the bringup by acquiring the
+ * sparse irq lock. Hold it until the upcoming CPU completes the
+ * startup in cpuhp_online_idle(), which avoids intermediate
+ * synchronization points in the architecture code.
*/
irq_lock_sparse();
- /* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
- irq_unlock_sparse();
if (ret)
- return ret;
- return bringup_wait_for_ap(cpu);
+ goto out_unlock;
+
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
}
+#endif
static int finish_cpu(unsigned int cpu)
{
@@ -1099,6 +1349,8 @@ static int takedown_cpu(unsigned int cpu)
/* This actually kills the CPU. */
__cpu_die(cpu);
+ cpuhp_bp_sync_dead(cpu);
+
tick_cleanup_dead_cpu(cpu);
rcutree_migrate_callbacks(cpu);
return 0;
@@ -1345,8 +1597,10 @@ void cpuhp_online_idle(enum cpuhp_state state)
if (state != CPUHP_AP_ONLINE_IDLE)
return;
+ cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);
+
/*
- * Unpart the stopper thread before we start the idle loop (and start
+ * Unpark the stopper thread before we start the idle loop (and start
* scheduling); this ensures the stopper task is always available.
*/
stop_machine_unpark(smp_processor_id());
@@ -1383,6 +1637,12 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = PTR_ERR(idle);
goto out;
}
+
+ /*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
}
cpuhp_tasks_frozen = tasks_frozen;
@@ -1502,18 +1762,96 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu)
return 0;
}
-void bringup_nonboot_cpus(unsigned int setup_max_cpus)
+static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
+ enum cpuhp_state target)
{
unsigned int cpu;
- for_each_present_cpu(cpu) {
- if (num_online_cpus() >= setup_max_cpus)
+ for_each_cpu(cpu, mask) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ if (cpu_up(cpu, target) && can_rollback_cpu(st)) {
+ /*
+ * If this failed then cpu_up() might have only
+ * rolled back to CPUHP_BP_KICK_AP for the final
+ * online. Clean it up. NOOP if already rolled back.
+ */
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));
+ }
+
+ if (!--ncpus)
break;
- if (!cpu_online(cpu))
- cpu_up(cpu, CPUHP_ONLINE);
}
}
+#ifdef CONFIG_HOTPLUG_PARALLEL
+static bool __cpuhp_parallel_bringup __ro_after_init = true;
+
+static int __init parallel_bringup_parse_param(char *arg)
+{
+ return kstrtobool(arg, &__cpuhp_parallel_bringup);
+}
+early_param("cpuhp.parallel", parallel_bringup_parse_param);
+
+/*
+ * On architectures which have enabled parallel bringup this invokes all BP
+ * prepare states for each of the to be onlined APs first. The last state
+ * sends the startup IPI to the APs. The APs proceed through the low level
+ * bringup code in parallel and then wait for the control CPU to release
+ * them one by one for the final onlining procedure.
+ *
+ * This avoids waiting for each AP to respond to the startup IPI in
+ * CPUHP_BRINGUP_CPU.
+ */
+static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
+{
+ const struct cpumask *mask = cpu_present_mask;
+
+ if (__cpuhp_parallel_bringup)
+ __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
+ if (!__cpuhp_parallel_bringup)
+ return false;
+
+ if (cpuhp_smt_aware()) {
+ const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
+ static struct cpumask tmp_mask __initdata;
+
+ /*
+ * For various reasons, X86 must not start SMT siblings while
+ * the primary thread is doing a microcode update. Bring the
+ * primary threads up first.
+ */
+ cpumask_and(&tmp_mask, mask, pmask);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
+ /* Account for the online CPUs */
+ ncpus -= num_online_cpus();
+ if (!ncpus)
+ return true;
+ /* Create the mask for secondary CPUs */
+ cpumask_andnot(&tmp_mask, mask, pmask);
+ mask = &tmp_mask;
+ }
+
+ /* Bring the not-yet started CPUs up */
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
+ return true;
+}
+#else
+static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
+#endif /* CONFIG_HOTPLUG_PARALLEL */
+
+void __init bringup_nonboot_cpus(unsigned int setup_max_cpus)
+{
+ /* Try parallel bringup optimization if enabled */
+ if (cpuhp_bringup_cpus_parallel(setup_max_cpus))
+ return;
+
+ /* Full per CPU serialized bringup */
+ cpuhp_bringup_mask(cpu_present_mask, setup_max_cpus, CPUHP_ONLINE);
+}
+
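Given the early_param() above, parallel bringup defaults to on for architectures that opt in via arch_cpuhp_init_parallel_bringup(), and can be disabled from the kernel command line:

	cpuhp.parallel=0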
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
@@ -1740,13 +2078,38 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
- /* Kicks the plugged cpu into life */
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+ /*
+ * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
+ * the next step will release it.
+ */
+ [CPUHP_BP_KICK_AP] = {
+ .name = "cpu:kick_ap",
+ .startup.single = cpuhp_kick_ap_alive,
+ },
+
+ /*
+ * Waits for the AP to reach cpuhp_ap_sync_alive() and then
+ * releases it for the complete bringup.
+ */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup.single = cpuhp_bringup_ap,
+ .teardown.single = finish_cpu,
+ .cant_stop = true,
+ },
+#else
+ /*
+ * All-in-one CPU bringup state which includes the kick alive.
+ */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = finish_cpu,
.cant_stop = true,
},
+#endif
/* Final state before CPU kills itself */
[CPUHP_AP_IDLE_DEAD] = {
.name = "idle:dead",
@@ -2723,6 +3086,7 @@ void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
+ atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 49e7bc871fec..ee8c0acf39df 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -306,6 +306,7 @@ static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
{
if (irqd_is_started(&desc->irq_data)) {
+ clear_irq_resend(desc);
desc->depth = 1;
if (desc->irq_data.chip->irq_shutdown) {
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
@@ -692,8 +693,16 @@ void handle_fasteoi_irq(struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (!irq_may_run(desc))
+ /*
+ * When an affinity change races with IRQ handling, the next interrupt
+ * can arrive on the new CPU before the original CPU has completed
+ * handling the previous one - it may need to be resent.
+ */
+ if (!irq_may_run(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
goto out;
+ }
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
@@ -715,6 +724,12 @@ void handle_fasteoi_irq(struct irq_desc *desc)
cond_unmask_eoi_irq(desc, chip);
+ /*
+ * When the race described above happens this will resend the interrupt.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING))
+ check_irq_resend(desc, false);
+
raw_spin_unlock(&desc->lock);
return;
out:
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index bbcaac64038e..5971a66be034 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -133,6 +133,8 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
+
+ BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS),
};
static const struct irq_bit_descr irqdesc_states[] = {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5fdc0b557579..bdd35bb9c735 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,9 +12,9 @@
#include <linux/sched/clock.h>
#ifdef CONFIG_SPARSE_IRQ
-# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+# define MAX_SPARSE_IRQS INT_MAX
#else
-# define IRQ_BITMAP_BITS NR_IRQS
+# define MAX_SPARSE_IRQS NR_IRQS
#endif
#define istate core_internal_state__do_not_mess_with_it
@@ -47,9 +47,12 @@ enum {
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
- * IRQS_REPLAY - irq is replayed
+ * IRQS_REPLAY - irq has been resent and will not be resent
+ * again until the handler has run and cleared
+ * this flag.
* IRQS_WAITING - irq is waiting
- * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_PENDING - irq needs to be resent and should be resent
+ * at the next available opportunity.
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
* IRQS_SYSFS - descriptor has been added to sysfs
@@ -113,6 +116,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
int check_irq_resend(struct irq_desc *desc, bool inject);
+void clear_irq_resend(struct irq_desc *desc);
+void irq_resend_init(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 240e145e969f..27ca1c866f29 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -12,8 +12,7 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
-#include <linux/radix-tree.h>
-#include <linux/bitmap.h>
+#include <linux/maple_tree.h>
#include <linux/irqdomain.h>
#include <linux/sysfs.h>
@@ -131,7 +130,40 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
+static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
+ MT_FLAGS_ALLOC_RANGE |
+ MT_FLAGS_LOCK_EXTERN |
+ MT_FLAGS_USE_RCU,
+ sparse_irq_lock);
+
+static int irq_find_free_area(unsigned int from, unsigned int cnt)
+{
+ MA_STATE(mas, &sparse_irqs, 0, 0);
+
+ if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt))
+ return -ENOSPC;
+ return mas.index;
+}
+
+static unsigned int irq_find_at_or_after(unsigned int offset)
+{
+ unsigned long index = offset;
+ struct irq_desc *desc = mt_find(&sparse_irqs, &index, nr_irqs);
+
+ return desc ? irq_desc_get_irq(desc) : nr_irqs;
+}
+
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0);
+}
+
+static void delete_irq_desc(unsigned int irq)
+{
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ mas_erase(&mas);
+}
#ifdef CONFIG_SPARSE_IRQ
@@ -344,26 +376,14 @@ static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
-static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
-
-static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
-{
- radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return radix_tree_lookup(&irq_desc_tree, irq);
+ return mtree_load(&sparse_irqs, irq);
}
#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
-static void delete_irq_desc(unsigned int irq)
-{
- radix_tree_delete(&irq_desc_tree, irq);
-}
-
#ifdef CONFIG_SMP
static void free_masks(struct irq_desc *desc)
{
@@ -415,6 +435,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
kobject_init(&desc->kobj, &irq_kobj_type);
+ irq_resend_init(desc);
return desc;
@@ -505,7 +526,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
irq_sysfs_add(start + i, desc);
irq_add_debugfs_entry(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
err:
@@ -516,7 +536,7 @@ err:
static int irq_expand_nr_irqs(unsigned int nr)
{
- if (nr > IRQ_BITMAP_BITS)
+ if (nr > MAX_SPARSE_IRQS)
return -ENOMEM;
nr_irqs = nr;
return 0;
@@ -534,18 +554,17 @@ int __init early_irq_init(void)
printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
NR_IRQS, nr_irqs, initcnt);
- if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
- nr_irqs = IRQ_BITMAP_BITS;
+ if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
+ nr_irqs = MAX_SPARSE_IRQS;
- if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
- initcnt = IRQ_BITMAP_BITS;
+ if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
+ initcnt = MAX_SPARSE_IRQS;
if (initcnt > nr_irqs)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node, 0, NULL, NULL);
- set_bit(i, allocated_irqs);
irq_insert_desc(i, desc);
}
return arch_early_irq_init();
@@ -581,6 +600,7 @@ int __init early_irq_init(void)
mutex_init(&desc[i].request_mutex);
init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
+ irq_resend_init(desc);
}
return arch_early_irq_init();
}
@@ -599,6 +619,7 @@ static void free_desc(unsigned int irq)
raw_spin_lock_irqsave(&desc->lock, flags);
desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ delete_irq_desc(irq);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -611,8 +632,8 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct irq_desc *desc = irq_to_desc(start + i);
desc->owner = owner;
+ irq_insert_desc(start + i, desc);
}
- bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -624,7 +645,7 @@ static int irq_expand_nr_irqs(unsigned int nr)
void irq_mark_irq(unsigned int irq)
{
mutex_lock(&sparse_irq_lock);
- bitmap_set(allocated_irqs, irq, 1);
+ irq_insert_desc(irq, irq_desc + irq);
mutex_unlock(&sparse_irq_lock);
}
@@ -768,7 +789,6 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
for (i = 0; i < cnt; i++)
free_desc(from + i);
- bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
@@ -810,8 +830,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
- from, cnt, 0);
+ start = irq_find_free_area(from, cnt);
ret = -EEXIST;
if (irq >= 0 && start != irq)
goto unlock;
@@ -836,7 +855,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
- return find_next_bit(allocated_irqs, nr_irqs, offset);
+ return irq_find_at_or_after(offset);
}
struct irq_desc *
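The conversion above replaces two structures (an allocation bitmap plus a radix tree) with one maple tree that both finds free irq ranges and maps irq numbers to descriptors. The patch drives the tree through MA_STATE()/mas_*() calls under the external sparse_irq_lock; a reduced sketch of the same store/load/erase idea, using the self-locking mtree_*() wrappers instead (demo_* names are illustrative):

#include <linux/gfp.h>
#include <linux/maple_tree.h>

static DEFINE_MTREE(demo_tree);

/* Rough analogue of irq_insert_desc(). */
static int demo_insert(unsigned long index, void *obj)
{
	return mtree_store(&demo_tree, index, obj, GFP_KERNEL);
}

/* Rough analogue of irq_to_desc(): RCU-safe, lockless lookup. */
static void *demo_lookup(unsigned long index)
{
	return mtree_load(&demo_tree, index);
}

/* Rough analogue of delete_irq_desc(). */
static void demo_delete(unsigned long index)
{
	mtree_erase(&demo_tree, index);
}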
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f34760a1e222..5bd01624e447 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1915,6 +1915,8 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include "internals.h"
+
static struct dentry *domain_dir;
static void
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 0c46e9fe3a89..edec335c0a7a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -21,8 +21,9 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
-/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
+/* hlist_head to handle software resend of interrupts: */
+static HLIST_HEAD(irq_resend_list);
+static DEFINE_RAW_SPINLOCK(irq_resend_lock);
/*
* Run software resends of IRQ's
@@ -30,18 +31,17 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
static void resend_irqs(struct tasklet_struct *unused)
{
struct irq_desc *desc;
- int irq;
-
- while (!bitmap_empty(irqs_resend, nr_irqs)) {
- irq = find_first_bit(irqs_resend, nr_irqs);
- clear_bit(irq, irqs_resend);
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
- local_irq_disable();
+
+ raw_spin_lock_irq(&irq_resend_lock);
+ while (!hlist_empty(&irq_resend_list)) {
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc,
+ resend_node);
+ hlist_del_init(&desc->resend_node);
+ raw_spin_unlock(&irq_resend_lock);
desc->handle_irq(desc);
- local_irq_enable();
+ raw_spin_lock(&irq_resend_lock);
}
+ raw_spin_unlock_irq(&irq_resend_lock);
}
/* Tasklet to handle resend: */
@@ -49,8 +49,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs);
static int irq_sw_resend(struct irq_desc *desc)
{
- unsigned int irq = irq_desc_get_irq(desc);
-
/*
* Validate whether this interrupt can be safely injected from
* non interrupt context
@@ -70,16 +68,31 @@ static int irq_sw_resend(struct irq_desc *desc)
*/
if (!desc->parent_irq)
return -EINVAL;
- irq = desc->parent_irq;
}
- /* Set it pending and activate the softirq: */
- set_bit(irq, irqs_resend);
+ /* Add to resend_list and activate the softirq: */
+ raw_spin_lock(&irq_resend_lock);
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ raw_spin_unlock(&irq_resend_lock);
tasklet_schedule(&resend_tasklet);
return 0;
}
+void clear_irq_resend(struct irq_desc *desc)
+{
+ raw_spin_lock(&irq_resend_lock);
+ hlist_del_init(&desc->resend_node);
+ raw_spin_unlock(&irq_resend_lock);
+}
+
+void irq_resend_init(struct irq_desc *desc)
+{
+ INIT_HLIST_NODE(&desc->resend_node);
+}
#else
+void clear_irq_resend(struct irq_desc *desc) {}
+void irq_resend_init(struct irq_desc *desc) {}
+
static int irq_sw_resend(struct irq_desc *desc)
{
return -EINVAL;
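Both resend_irqs() and clear_irq_resend() remove descriptors with hlist_del_init(), which re-initializes the node so that deleting an unqueued descriptor is a harmless no-op. A userspace model of that property (hand-rolled stand-ins mirroring <linux/list.h> hlist semantics):

#include <stddef.h>
#include <stdio.h>

struct hnode { struct hnode *next, **pprev; };
struct hhead { struct hnode *first; };

static void node_init(struct hnode *n) { n->next = NULL; n->pprev = NULL; }

static void hlist_add(struct hhead *h, struct hnode *n)
{
	n->next = h->first;
	if (h->first)
		h->first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

static void hlist_del_init(struct hnode *n)
{
	if (!n->pprev)		/* not hashed: deleting again is a no-op */
		return;
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
	node_init(n);
}

int main(void)
{
	struct hhead resend_list = { NULL };
	struct hnode desc;

	node_init(&desc);		/* irq_resend_init() */
	hlist_add(&resend_list, &desc);	/* irq_sw_resend() queues it */
	hlist_del_init(&desc);		/* resend_irqs() dequeues it */
	hlist_del_init(&desc);		/* clear_irq_resend(): safe no-op */
	printf("list empty: %d\n", resend_list.first == NULL);
	return 0;
}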
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 30d1274f03f6..f62e89d0d906 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "PM: hibernation: " fmt
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/reboot.h>
@@ -64,7 +65,6 @@ enum {
static int hibernation_mode = HIBERNATION_SHUTDOWN;
bool freezer_test_done;
-bool snapshot_test;
static const struct platform_hibernation_ops *hibernation_ops;
@@ -684,26 +684,22 @@ static void power_down(void)
cpu_relax();
}
-static int load_image_and_restore(void)
+static int load_image_and_restore(bool snapshot_test)
{
int error;
unsigned int flags;
- fmode_t mode = FMODE_READ;
-
- if (snapshot_test)
- mode |= FMODE_EXCL;
pm_pr_dbg("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
if (error) {
- swsusp_close(mode);
+ swsusp_close(snapshot_test);
goto Unlock;
}
error = swsusp_read(&flags);
- swsusp_close(mode);
+ swsusp_close(snapshot_test);
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -721,6 +717,7 @@ static int load_image_and_restore(void)
*/
int hibernate(void)
{
+ bool snapshot_test = false;
unsigned int sleep_flags;
int error;
@@ -748,9 +745,6 @@ int hibernate(void)
if (error)
goto Exit;
- /* protected by system_transition_mutex */
- snapshot_test = false;
-
lock_device_hotplug();
/* Allocate memory management structures */
error = create_basic_memory_bitmaps();
@@ -792,9 +786,9 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
- error = swsusp_check();
+ error = swsusp_check(snapshot_test);
if (!error)
- error = load_image_and_restore();
+ error = load_image_and_restore(snapshot_test);
}
thaw_processes();
@@ -910,52 +904,10 @@ unlock:
}
EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
-/**
- * software_resume - Resume from a saved hibernation image.
- *
- * This routine is called as a late initcall, when all devices have been
- * discovered and initialized already.
- *
- * The image reading code is called to see if there is a hibernation image
- * available for reading. If that is the case, devices are quiesced and the
- * contents of memory is restored from the saved image.
- *
- * If this is successful, control reappears in the restored target kernel in
- * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
- * attempts to recover gracefully and make the kernel return to the normal mode
- * of operation.
- */
-static int software_resume(void)
+static int __init find_resume_device(void)
{
- int error;
-
- /*
- * If the user said "noresume".. bail out early.
- */
- if (noresume || !hibernation_available())
- return 0;
-
- /*
- * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
- * is configured into the kernel. Since the regular hibernate
- * trigger path is via sysfs which takes a buffer mutex before
- * calling hibernate functions (which take system_transition_mutex)
- * this can cause lockdep to complain about a possible ABBA deadlock
- * which cannot happen since we're in the boot code here and
- * sysfs can't be invoked yet. Therefore, we use a subclass
- * here to avoid lockdep complaining.
- */
- mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING);
-
- snapshot_test = false;
-
- if (swsusp_resume_device)
- goto Check_image;
-
- if (!strlen(resume_file)) {
- error = -ENOENT;
- goto Unlock;
- }
+ if (!strlen(resume_file))
+ return -ENOENT;
pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
@@ -966,40 +918,41 @@ static int software_resume(void)
}
/* Check if the device is there */
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- /*
- * Some device discovery might still be in progress; we need
- * to wait for this to finish.
- */
- wait_for_device_probe();
-
- if (resume_wait) {
- while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
- msleep(10);
- async_synchronize_full();
- }
+ if (!early_lookup_bdev(resume_file, &swsusp_resume_device))
+ return 0;
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- error = -ENODEV;
- goto Unlock;
- }
+ /*
+ * Some device discovery might still be in progress; we need to wait for
+ * this to finish.
+ */
+ wait_for_device_probe();
+ if (resume_wait) {
+ while (early_lookup_bdev(resume_file, &swsusp_resume_device))
+ msleep(10);
+ async_synchronize_full();
}
- Check_image:
+ return early_lookup_bdev(resume_file, &swsusp_resume_device);
+}
+
+static int software_resume(void)
+{
+ int error;
+
pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
pm_pr_dbg("Looking for hibernation image.\n");
- error = swsusp_check();
+
+ mutex_lock(&system_transition_mutex);
+ error = swsusp_check(false);
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close(false);
goto Unlock;
}
@@ -1020,7 +973,7 @@ static int software_resume(void)
goto Close_Finish;
}
- error = load_image_and_restore();
+ error = load_image_and_restore(false);
thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
@@ -1034,11 +987,43 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ | FMODE_EXCL);
+ swsusp_close(false);
goto Finish;
}
-late_initcall_sync(software_resume);
+/**
+ * software_resume_initcall - Resume from a saved hibernation image.
+ *
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
+ *
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
+ */
+static int __init software_resume_initcall(void)
+{
+ /*
+ * If the user said "noresume", bail out early.
+ */
+ if (noresume || !hibernation_available())
+ return 0;
+
+ if (!swsusp_resume_device) {
+ int error = find_resume_device();
+
+ if (error)
+ return error;
+ }
+
+ return software_resume();
+}
+late_initcall_sync(software_resume_initcall);
static const char * const hibernation_modes[] = {
@@ -1177,7 +1162,11 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
unsigned int sleep_flags;
int len = n;
char *name;
- dev_t res;
+ dev_t dev;
+ int error;
+
+ if (!hibernation_available())
+ return 0;
if (len && buf[len-1] == '\n')
len--;
@@ -1185,13 +1174,29 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!name)
return -ENOMEM;
- res = name_to_dev_t(name);
+ error = lookup_bdev(name, &dev);
+ if (error) {
+ unsigned maj, min, offset;
+ char *p, dummy;
+
+ if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
+ sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset,
+ &dummy) == 3) {
+ dev = MKDEV(maj, min);
+ if (maj != MAJOR(dev) || min != MINOR(dev))
+ error = -EINVAL;
+ } else {
+ dev = new_decode_dev(simple_strtoul(name, &p, 16));
+ if (*p)
+ error = -EINVAL;
+ }
+ }
kfree(name);
- if (!res)
- return -EINVAL;
+ if (error)
+ return error;
sleep_flags = lock_system_sleep();
- swsusp_resume_device = res;
+ swsusp_resume_device = dev;
unlock_system_sleep(sleep_flags);
pm_pr_dbg("Configured hibernation resume from disk to %u\n",
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b83c8d5e188d..f4a380b1aa00 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -26,9 +26,6 @@ extern void __init hibernate_image_size_init(void);
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
-extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
-extern int arch_hibernation_header_restore(void *addr);
-
static inline int init_header_complete(struct swsusp_info *info)
{
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
@@ -41,8 +38,6 @@ static inline const char *check_image_kernel(struct swsusp_info *info)
}
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int hibernate_resume_nonboot_cpu_disable(void);
-
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -59,7 +54,6 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
-extern bool snapshot_test;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
@@ -174,11 +168,11 @@ extern int swsusp_swap_in_use(void);
#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
-extern int swsusp_check(void);
+int swsusp_check(bool snapshot_test);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(fmode_t);
+void swsusp_close(bool snapshot_test);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
#endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 92e41ed292ad..f6ebcd00c410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -356,14 +356,14 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE,
- NULL);
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+ BLK_OPEN_WRITE, NULL, NULL);
if (IS_ERR(hib_resume_bdev))
return PTR_ERR(hib_resume_bdev);
res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
if (res < 0)
- blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ blkdev_put(hib_resume_bdev, NULL);
return res;
}
@@ -443,7 +443,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
err_rel:
release_swap_writer(handle);
err_close:
- swsusp_close(FMODE_WRITE);
+ swsusp_close(false);
return ret;
}
@@ -508,7 +508,7 @@ static int swap_writer_finish(struct swap_map_handle *handle,
if (error)
free_all_swap_pages(root_swap);
release_swap_writer(handle);
- swsusp_close(FMODE_WRITE);
+ swsusp_close(false);
return error;
}
@@ -1510,21 +1510,19 @@ end:
return error;
}
+static void *swsusp_holder;
+
/**
* swsusp_check - Check for swsusp signature in the resume device
*/
-int swsusp_check(void)
+int swsusp_check(bool snapshot_test)
{
+ void *holder = snapshot_test ? &swsusp_holder : NULL;
int error;
- void *holder;
- fmode_t mode = FMODE_READ;
- if (snapshot_test)
- mode |= FMODE_EXCL;
-
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- mode, &holder);
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
+ holder, NULL);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
@@ -1551,7 +1549,7 @@ int swsusp_check(void)
put:
if (error)
- blkdev_put(hib_resume_bdev, mode);
+ blkdev_put(hib_resume_bdev, holder);
else
pr_debug("Image signature found, resuming\n");
} else {
@@ -1568,14 +1566,14 @@ put:
* swsusp_close - close swap device.
*/
-void swsusp_close(fmode_t mode)
+void swsusp_close(bool snapshot_test)
{
if (IS_ERR(hib_resume_bdev)) {
pr_debug("Image device not initialised\n");
return;
}
- blkdev_put(hib_resume_bdev, mode);
+ blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL);
}
/**
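The swap.c changes track the reworked block-device open API: open modes become BLK_OPEN_* flags, and exclusivity is expressed by passing a non-NULL holder cookie that must be handed back to blkdev_put(). A reduced sketch of the pattern (demo_* names are illustrative, not kernel API):

#include <linux/blkdev.h>
#include <linux/err.h>

static struct block_device *demo_bdev;
static void *demo_holder = &demo_holder;	/* any unique cookie */

static int demo_open(dev_t dev, bool excl)
{
	/* A non-NULL holder makes the open exclusive; the same pointer
	 * must be passed to blkdev_put() to release the claim. */
	demo_bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ,
				      excl ? demo_holder : NULL, NULL);
	return PTR_ERR_OR_ZERO(demo_bdev);
}

static void demo_close(bool excl)
{
	blkdev_put(demo_bdev, excl ? demo_holder : NULL);
}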
diff --git a/kernel/smp.c b/kernel/smp.c
index ab3e5dad6cfe..385179dae360 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -27,6 +27,9 @@
#include <linux/jump_label.h>
#include <trace/events/ipi.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/csd.h>
+#undef CREATE_TRACE_POINTS
#include "smpboot.h"
#include "sched/smp.h"
@@ -121,6 +124,14 @@ send_call_function_ipi_mask(struct cpumask *mask)
arch_send_call_function_ipi_mask(mask);
}
+static __always_inline void
+csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
+{
+ trace_csd_function_entry(func, csd);
+ func(info);
+ trace_csd_function_exit(func, csd);
+}
+
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
@@ -329,7 +340,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
* even if we haven't sent the smp_call IPI yet (e.g. the stopper
* executes migration_cpu_stop() on the remote CPU).
*/
- if (trace_ipi_send_cpu_enabled()) {
+ if (trace_csd_queue_cpu_enabled()) {
call_single_data_t *csd;
smp_call_func_t func;
@@ -337,7 +348,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
sched_ttwu_pending : csd->func;
- trace_ipi_send_cpu(cpu, _RET_IP_, func);
+ trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
}
/*
@@ -375,7 +386,7 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd)
csd_lock_record(csd);
csd_unlock(csd);
local_irq_save(flags);
- func(info);
+ csd_do_func(func, info, NULL);
csd_lock_record(NULL);
local_irq_restore(flags);
return 0;
@@ -477,7 +488,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
}
csd_lock_record(csd);
- func(info);
+ csd_do_func(func, info, csd);
csd_unlock(csd);
csd_lock_record(NULL);
} else {
@@ -508,7 +519,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
csd_lock_record(csd);
csd_unlock(csd);
- func(info);
+ csd_do_func(func, info, csd);
csd_lock_record(NULL);
} else if (type == CSD_TYPE_IRQ_WORK) {
irq_work_single(csd);
@@ -522,8 +533,10 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
/*
* Third; only CSD_TYPE_TTWU is left, issue those.
*/
- if (entry)
- sched_ttwu_pending(entry);
+ if (entry) {
+ csd = llist_entry(entry, typeof(*csd), node.llist);
+ csd_do_func(sched_ttwu_pending, entry, csd);
+ }
}
@@ -728,7 +741,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
bool wait = scf_flags & SCF_WAIT;
- int nr_cpus = 0, nr_queued = 0;
+ int nr_cpus = 0;
bool run_remote = false;
bool run_local = false;
@@ -786,22 +799,16 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
+ trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
last_cpu = cpu;
}
- nr_queued++;
}
/*
- * Trace each smp_function_call_*() as an IPI, actual IPIs
- * will be traced with func==generic_smp_call_function_single_ipi().
- */
- if (nr_queued)
- trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func);
-
- /*
* Choose the most efficient way to send an IPI. Note that the
* number of CPUs might be zero due to concurrent changes to the
* provided mask.
@@ -816,7 +823,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
unsigned long flags;
local_irq_save(flags);
- func(info);
+ csd_do_func(func, info, NULL);
local_irq_restore(flags);
}
@@ -892,7 +899,7 @@ EXPORT_SYMBOL(setup_max_cpus);
* SMP mode to <NUM>.
*/
-void __weak arch_disable_smp_support(void) { }
+void __weak __init arch_disable_smp_support(void) { }
static int __init nosmp(char *str)
{
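The csd_do_func() helper above funnels every cross-CPU callback invocation through one place so that the new entry/exit trace events always bracket the function body. A userspace model of the pattern (the trace_* functions are stand-ins for the real tracepoints):

#include <stdio.h>

typedef void (*smp_call_func_t)(void *info);

/* Stand-ins for trace_csd_function_entry()/trace_csd_function_exit(). */
static void trace_entry(void) { puts("csd function entry"); }
static void trace_exit(void)  { puts("csd function exit"); }

static void csd_do_func(smp_call_func_t func, void *info)
{
	trace_entry();
	func(info);
	trace_exit();
}

static void work(void *info) { puts((const char *)info); }

int main(void)
{
	csd_do_func(work, "remote work body");
	return 0;
}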
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 2c7396da470c..f47d8f375946 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -325,166 +325,3 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
-
-static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
-
-/*
- * Called to poll specified CPU's state, for example, when waiting for
- * a CPU to come online.
- */
-int cpu_report_state(int cpu)
-{
- return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
-}
-
-/*
- * If CPU has died properly, set its state to CPU_UP_PREPARE and
- * return success. Otherwise, return -EBUSY if the CPU died after
- * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
- * if cpu_wait_death() timed out and the CPU still hasn't gotten around
- * to dying. In the latter two cases, the CPU might not be set up
- * properly, but it is up to the arch-specific code to decide.
- * Finally, -EIO indicates an unanticipated problem.
- *
- * Note that it is permissible to omit this call entirely, as is
- * done in architectures that do no CPU-hotplug error checking.
- */
-int cpu_check_up_prepare(int cpu)
-{
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
- return 0;
- }
-
- switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
-
- case CPU_POST_DEAD:
-
- /* The CPU died properly, so just start it up again. */
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
- return 0;
-
- case CPU_DEAD_FROZEN:
-
- /*
- * Timeout during CPU death, so let caller know.
- * The outgoing CPU completed its processing, but after
- * cpu_wait_death() timed out and reported the error. The
- * caller is free to proceed, in which case the state
- * will be reset properly by cpu_set_state_online().
- * Proceeding despite this -EBUSY return makes sense
- * for systems where the outgoing CPUs take themselves
- * offline, with no post-death manipulation required from
- * a surviving CPU.
- */
- return -EBUSY;
-
- case CPU_BROKEN:
-
- /*
- * The most likely reason we got here is that there was
- * a timeout during CPU death, and the outgoing CPU never
- * did complete its processing. This could happen on
- * a virtualized system if the outgoing VCPU gets preempted
- * for more than five seconds, and the user attempts to
- * immediately online that same CPU. Trying again later
- * might return -EBUSY above, hence -EAGAIN.
- */
- return -EAGAIN;
-
- case CPU_UP_PREPARE:
- /*
- * Timeout while waiting for the CPU to show up. Allow to try
- * again later.
- */
- return 0;
-
- default:
-
- /* Should not happen. Famous last words. */
- return -EIO;
- }
-}
-
-/*
- * Mark the specified CPU online.
- *
- * Note that it is permissible to omit this call entirely, as is
- * done in architectures that do no CPU-hotplug error checking.
- */
-void cpu_set_state_online(int cpu)
-{
- (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Wait for the specified CPU to exit the idle loop and die.
- */
-bool cpu_wait_death(unsigned int cpu, int seconds)
-{
- int jf_left = seconds * HZ;
- int oldstate;
- bool ret = true;
- int sleep_jf = 1;
-
- might_sleep();
-
- /* The outgoing CPU will normally get done quite quickly. */
- if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
- goto update_state_early;
- udelay(5);
-
- /* But if the outgoing CPU dawdles, wait increasingly long times. */
- while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
- schedule_timeout_uninterruptible(sleep_jf);
- jf_left -= sleep_jf;
- if (jf_left <= 0)
- break;
- sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
- }
-update_state_early:
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
-update_state:
- if (oldstate == CPU_DEAD) {
- /* Outgoing CPU died normally, update state. */
- smp_mb(); /* atomic_read() before update. */
- atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
- } else {
- /* Outgoing CPU still hasn't died, set state accordingly. */
- if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- &oldstate, CPU_BROKEN))
- goto update_state;
- ret = false;
- }
- return ret;
-}
-
-/*
- * Called by the outgoing CPU to report its successful death. Return
- * false if this report follows the surviving CPU's timing out.
- *
- * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
- * timed out. This approach allows architectures to omit calls to
- * cpu_check_up_prepare() and cpu_set_state_online() without defeating
- * the next cpu_wait_death()'s polling loop.
- */
-bool cpu_report_death(void)
-{
- int oldstate;
- int newstate;
- int cpu = smp_processor_id();
-
- oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
- do {
- if (oldstate != CPU_BROKEN)
- newstate = CPU_DEAD;
- else
- newstate = CPU_DEAD_FROZEN;
- } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
- &oldstate, newstate));
- return newstate == CPU_DEAD;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1b725510dd0f..807b34ccd797 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -80,21 +80,6 @@ static void wakeup_softirqd(void)
wake_up_process(tsk);
}
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness,
- * unless we're doing some of the synchronous softirqs.
- */
-#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
-static bool ksoftirqd_running(unsigned long pending)
-{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
- if (pending & SOFTIRQ_NOW_MASK)
- return false;
- return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
-}
-
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_PER_CPU(int, hardirqs_enabled);
DEFINE_PER_CPU(int, hardirq_context);
@@ -236,7 +221,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
goto out;
pending = local_softirq_pending();
- if (!pending || ksoftirqd_running(pending))
+ if (!pending)
goto out;
/*
@@ -432,9 +417,6 @@ static inline bool should_wake_ksoftirqd(void)
static inline void invoke_softirq(void)
{
- if (ksoftirqd_running(local_softirq_pending()))
- return;
-
if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
@@ -468,7 +450,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending && !ksoftirqd_running(pending))
+ if (pending)
do_softirq_own_stack();
local_irq_restore(flags);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 82b28ab0f328..8d9f13d847f0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -751,7 +751,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
{
- struct task_struct *task = (struct task_struct *)alarm->data;
+ struct task_struct *task = alarm->data;
alarm->data = NULL;
if (task)
@@ -847,7 +847,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
struct restart_block *restart = &current->restart_block;
struct alarm alarm;
ktime_t exp;
- int ret = 0;
+ int ret;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e8c08292defc..238262e4aba7 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -164,6 +164,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
unsigned long *flags)
+ __acquires(&timer->base->lock)
{
struct hrtimer_clock_base *base;
@@ -280,6 +281,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
@@ -1013,6 +1015,7 @@ void hrtimers_resume_local(void)
*/
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+ __releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 808a247205a9..b924f0f096fa 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -35,20 +35,17 @@
#include "timekeeping.h"
#include "posix-timers.h"
-/*
- * Management arrays for POSIX timers. Timers are now kept in static hash table
- * with 512 entries.
- * Timer ids are allocated by local routine, which selects proper hash head by
- * key, constructed from current->signal address and per signal struct counter.
- * This keeps timer ids unique per process, but now they can intersect between
- * processes.
- */
+static struct kmem_cache *posix_timers_cache;
/*
- * Lets keep our timers in a slab cache :-)
+ * Timers are managed in a hash table for lockless lookup. The hash key is
+ * constructed from current::signal and the timer ID, and the timer is
+ * matched against current::signal and the timer ID when walking the hash
+ * bucket list.
+ *
+ * This allows checkpoint/restore to reconstruct the exact timer IDs for
+ * a process.
*/
-static struct kmem_cache *posix_timers_cache;
-
static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
static DEFINE_SPINLOCK(hash_lock);
@@ -56,52 +53,12 @@ static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
-/*
- * we assume that the new SIGEV_THREAD_ID shares no bits with the other
- * SIGEV values. Here we put out an error if this assumption fails.
- */
+/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
- ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-/*
- * The timer ID is turned into a timer address by idr_find().
- * Verifying a valid ID consists of:
- *
- * a) checking that idr_find() returns other than -1.
- * b) checking that the timer id matches the one in the timer itself.
- * c) that the timer owner is in the callers thread group.
- */
-
-/*
- * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
- * to implement others. This structure defines the various
- * clocks.
- *
- * RESOLUTION: Clock resolution is used to round up timer and interval
- * times, NOT to report clock times, which are reported with as
- * much resolution as the system can muster. In some cases this
- * resolution may depend on the underlying clock hardware and
- * may not be quantifiable until run time, and only then is the
- * necessary code is written. The standard says we should say
- * something about this issue in the documentation...
- *
- * FUNCTIONS: The CLOCKs structure defines possible functions to
- * handle various clock functions.
- *
- * The standard POSIX timer management code assumes the
- * following: 1.) The k_itimer struct (sched.h) is used for
- * the timer. 2.) The list, it_lock, it_clock, it_id and
- * it_pid fields are not modified by timer code.
- *
- * Permissions: It is assumed that the clock_settime() function defined
- * for each clock will take care of permission checks. Some
- * clocks may be set able by any user (i.e. local process
- * clocks) others not. Currently the only set able clock we
- * have is CLOCK_REALTIME and its high res counter part, both of
- * which we beg off on and pass to do_sys_settimeofday().
- */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
#define lock_timer(tid, flags) \
@@ -121,9 +78,9 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
{
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash,
- lockdep_is_held(&hash_lock)) {
- if ((timer->it_signal == sig) && (timer->it_id == id))
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ /* timer->it_signal can be set concurrently */
+ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
}
return NULL;
@@ -140,25 +97,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
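
The new posix_timer_add() hands out IDs from a per-process counter that wraps within the positive int space and skips IDs still present in the hash. A hedged standalone sketch of the same strategy; id_in_use() stands in for the kernel's __posix_timers_find() bucket walk and the names are illustrative:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned int next_id;

static bool id_in_use(unsigned int id)
{
        return id == 0; /* pretend ID 0 is still allocated */
}

static int timer_id_alloc(void)
{
        for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
                unsigned int id = next_id;

                /* Write the next candidate back, clamped to positive space */
                next_id = (id + 1) & INT_MAX;

                if (!id_in_use(id))
                        return (int)id;
        }
        return -1; /* every ID is in use; the kernel returns -EAGAIN */
}

int main(void)
{
        printf("first free id: %d\n", timer_id_alloc()); /* skips 0, prints 1 */
        return 0;
}

Because the counter only ever advances, released IDs are not reused immediately, which is what lets checkpoint/restore replay timer creation to reach a given ID.
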
@@ -166,7 +128,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
spin_unlock_irqrestore(&timr->it_lock, flags);
}
-/* Get clock_realtime */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
@@ -178,7 +139,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
return ktime_get_real();
}
-/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
{
@@ -191,9 +151,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
return do_adjtimex(t);
}
-/*
- * Get monotonic time for posix timers
- */
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
@@ -206,9 +163,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
return ktime_get();
}
-/*
- * Get monotonic-raw time for posix timers
- */
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
@@ -216,7 +170,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_coarse_real_ts64(tp);
@@ -267,9 +220,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-/*
- * Initialize everything, well, just everything in Posix clocks/timers ;)
- */
static __init int init_posix_timers(void)
{
posix_timers_cache = kmem_cache_create("posix_timers_cache",
@@ -300,15 +250,9 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
}
/*
- * This function is exported for use by the signal deliver code. It is
- * called just prior to the info block being released and passes that
- * block to us. It's function is to update the overrun entry AND to
- * restart the timer. It should only be called if the timer is to be
- * restarted (i.e. we have flagged this in the sys_private entry of the
- * info block).
- *
- * To protect against the timer going away while the interrupt is queued,
- * we require that the it_requeue_pending flag be set.
+ * This function is called from the signal delivery code if
+ * info->si_sys_private is not zero, which indicates that the timer has to
+ * be rearmed. Restart the timer and update info::si_overrun.
*/
void posixtimer_rearm(struct kernel_siginfo *info)
{
@@ -357,18 +301,18 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
}
/*
- * This function gets called when a POSIX.1b interval timer expires. It
- * is used as a callback from the kernel internal timer. The
- * run_timer_list code ALWAYS calls with interrupts on.
-
- * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ * This function gets called when a POSIX.1b interval timer expires from
+ * the HRTIMER interrupt (soft interrupt on RT kernels).
+ *
+ * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI
+ * based timers.
*/
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
struct k_itimer *timr;
unsigned long flags;
int si_private = 0;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
@@ -379,9 +323,10 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
if (posix_timer_event(timr, si_private)) {
/*
- * signal was not sent because of sig_ignor
- * we will not get a call back to restart it AND
- * it should be restarted.
+ * The signal was not queued due to SIG_IGN. As a
+ * consequence the timer is not going to be rearmed from
+ * the signal delivery path. But as a real signal handler
+ * can be installed later the timer must be rearmed here.
*/
if (timr->it_interval != 0) {
ktime_t now = hrtimer_cb_get_time(timer);
@@ -390,34 +335,35 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
* FIXME: What we really want, is to stop this
* timer completely and restart it in case the
* SIG_IGN is removed. This is a non trivial
- * change which involves sighand locking
- * (sigh !), which we don't want to do late in
- * the release cycle.
+ * change to the signal handling code.
+ *
+ * For now let timers with an interval less than a
+ * jiffy expire every jiffy and recheck for a
+ * valid signal handler.
+ *
+ * This avoids interrupt starvation in case of a
+ * very small interval, which would expire the
+ * timer immediately again.
+ *
+ * Moving now ahead of time by one jiffy tricks
+ * hrtimer_forward() to expire the timer later,
+ * while it still maintains the overrun accuracy
+ * for the price of a slight inconsistency in the
+ * timer_gettime() case. This is at least better
+ * than a timer storm.
*
- * For now we just let timers with an interval
- * less than a jiffie expire every jiffie to
- * avoid softirq starvation in case of SIG_IGN
- * and a very small interval, which would put
- * the timer right back on the softirq pending
- * list. By moving now ahead of time we trick
- * hrtimer_forward() to expire the timer
- * later, while we still maintain the overrun
- * accuracy, but have some inconsistency in
- * the timer_gettime() case. This is at least
- * better than a starved softirq. A more
- * complex fix which solves also another related
- * inconsistency is already in the pipeline.
+ * Only required when high resolution timers are
+ * enabled as the periodic tick based timers are
+ * automatically aligned to the next tick.
*/
-#ifdef CONFIG_HIGH_RES_TIMERS
- {
- ktime_t kj = NSEC_PER_SEC / HZ;
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) {
+ ktime_t kj = TICK_NSEC;
if (timr->it_interval < kj)
now = ktime_add(now, kj);
}
-#endif
- timr->it_overrun += hrtimer_forward(timer, now,
- timr->it_interval);
+
+ timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
timr->it_active = 1;
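
The rearm path above nudges @now one tick into the future before forwarding, so a sub-tick interval cannot expire immediately again while the overrun count stays interval-accurate. A hedged arithmetic sketch of that trick with a simplified hrtimer_forward() analog; TICK_NSEC is assumed to be 4ms (HZ=250) purely for the demo:

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 4000000LL /* assumption: HZ == 250 */

/* Move expiry forward in whole intervals until it is past @now;
 * return the number of skipped intervals (the overrun contribution). */
static int64_t forward(int64_t *expiry, int64_t now, int64_t interval)
{
        int64_t overrun = 0;

        while (*expiry <= now) {
                *expiry += interval;
                overrun++;
        }
        return overrun;
}

int main(void)
{
        int64_t interval = 100000;              /* 100us, far below one tick */
        int64_t expiry = 1000000, now = expiry; /* callback runs at expiry */

        /* Pretend a full tick has passed: forward past now + TICK_NSEC */
        if (interval < TICK_NSEC)
                now += TICK_NSEC;

        int64_t ov = forward(&expiry, now, interval);
        printf("skipped %lld intervals, next expiry %lld\n",
               (long long)ov, (long long)expiry);
        return 0;
}
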
@@ -454,8 +400,8 @@ static struct pid *good_sigevent(sigevent_t * event)
static struct k_itimer * alloc_posix_timer(void)
{
- struct k_itimer *tmr;
- tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+
if (!tmr)
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
@@ -473,21 +419,21 @@ static void k_itimer_rcu_free(struct rcu_head *head)
kmem_cache_free(posix_timers_cache, tmr);
}
-#define IT_ID_SET 1
-#define IT_ID_NOT_SET 0
-static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+static void posix_timer_free(struct k_itimer *tmr)
{
- if (it_id_set) {
- unsigned long flags;
- spin_lock_irqsave(&hash_lock, flags);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock_irqrestore(&hash_lock, flags);
- }
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
call_rcu(&tmr->rcu, k_itimer_rcu_free);
}
+static void posix_timer_unhash_and_free(struct k_itimer *tmr)
+{
+ spin_lock(&hash_lock);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock(&hash_lock);
+ posix_timer_free(tmr);
+}
+
static int common_timer_create(struct k_itimer *new_timer)
{
hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -501,7 +447,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
- int it_id_set = IT_ID_NOT_SET;
if (!kc)
return -EINVAL;
@@ -513,13 +458,18 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
+
+ /*
+ * Add the timer to the hash table. The timer is not yet valid
+ * because new_timer::it_signal is still NULL. The timer id is also
+ * not yet visible to user space.
+ */
new_timer_id = posix_timer_add(new_timer);
if (new_timer_id < 0) {
- error = new_timer_id;
- goto out;
+ posix_timer_free(new_timer);
+ return new_timer_id;
}
- it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
@@ -547,30 +497,33 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
- if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
+ if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
-
+ /*
+ * After successful copy out, the timer ID is visible to user space
+ * now but not yet valid because new_timer::it_signal is still NULL.
+ *
+ * Complete the initialization with the clock specific create
+ * callback.
+ */
error = kc->timer_create(new_timer);
if (error)
goto out;
spin_lock_irq(&current->sighand->siglock);
- new_timer->it_signal = current->signal;
+ /* This makes the timer valid in the hash table */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
list_add(&new_timer->list, &current->signal->posix_timers);
spin_unlock_irq(&current->sighand->siglock);
-
- return 0;
/*
- * In the case of the timer belonging to another task, after
- * the task is unlocked, the timer is owned by the other task
- * and may cease to exist at any time. Don't use or modify
- * new_timer after the unlock call.
+ * After unlocking sighand::siglock, @new_timer is subject to
+ * concurrent removal and cannot be touched anymore.
*/
+ return 0;
out:
- release_posix_timer(new_timer, it_id_set);
+ posix_timer_unhash_and_free(new_timer);
return error;
}
@@ -604,13 +557,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-/*
- * Locking issues: We need to protect the result of the id look up until
- * we get the timer locked down so it is not deleted under us. The
- * removal is done under the idr spinlock so we use that here to bridge
- * the find to the timer lock. To avoid a dead lock, the timer id MUST
- * be release with out holding the timer lock.
- */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
@@ -622,10 +568,35 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
if ((unsigned long long)timer_id > INT_MAX)
return NULL;
+ /*
+ * The hash lookup and the timers are RCU protected.
+ *
+ * Timers are added to the hash in invalid state where
+ * timr::it_signal == NULL. timer::it_signal is only set after the
+ * rest of the initialization succeeded.
+ *
+ * Timer destruction happens in steps:
+ * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 2) Release timr::it_lock
+ * 3) Remove from the hash under hash_lock
+ * 4) Call RCU for removal after the grace period
+ *
+ * Holding rcu_read_lock() across the lookup ensures that
+ * the timer cannot be freed.
+ *
+ * The lookup validates locklessly that timr::it_signal ==
+ * current::signal and timr::it_id == @timer_id. timr::it_id
+ * can't change, but timr::it_signal becomes NULL during
+ * destruction.
+ */
rcu_read_lock();
timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
+ /*
+ * Validate under timr::it_lock that timr::it_signal is
+ * still valid. Pairs with #1 above.
+ */
if (timr->it_signal == current->signal) {
rcu_read_unlock();
return timr;
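
The protocol spelled out in the comment boils down to: find the object without its lock, take the lock, then recheck the ownership field that the delete path clears first. A hedged userspace analog with a pthread mutex in place of timr::it_lock; RCU is approximated here by simply never freeing the object:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct ktimer {
        pthread_mutex_t lock;   /* stand-in for timr::it_lock */
        void *owner;            /* stand-in for timr::it_signal */
};

static struct ktimer demo = { PTHREAD_MUTEX_INITIALIZER, &demo };

/* Lock the timer only if it still belongs to @self. The delete path
 * clears ->owner under ->lock before unhashing, so rechecking here
 * catches a concurrent deletion. Returns the timer locked, or NULL. */
static struct ktimer *lock_timer(struct ktimer *t, void *self)
{
        if (!t)
                return NULL;

        pthread_mutex_lock(&t->lock);
        if (t->owner == self)
                return t;

        pthread_mutex_unlock(&t->lock);
        return NULL;
}

int main(void)
{
        struct ktimer *t = lock_timer(&demo, &demo);

        printf("timer valid: %s\n", t ? "yes" : "no");
        if (t)
                pthread_mutex_unlock(&t->lock);
        return 0;
}
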
@@ -652,20 +623,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
}
/*
- * Get the time remaining on a POSIX.1b interval timer. This function
- * is ALWAYS called with spin_lock_irq on the timer, thus it must not
- * mess with irq.
+ * Get the time remaining on a POSIX.1b interval timer.
*
- * We have a couple of messes to clean up here. First there is the case
- * of a timer that has a requeue pending. These timers should appear to
- * be in the timer list with an expiry as if we were to requeue them
- * now.
+ * Two issues to handle here:
*
- * The second issue is the SIGEV_NONE timer which may be active but is
- * not really ever put in the timer list (to save system resources).
- * This timer may be expired, and if so, we will do it here. Otherwise
- * it is the same as a requeue pending timer WRT to what we should
- * report.
+ * 1) The timer has a requeue pending. The return value must appear as
+ * if the timer has been requeued right now.
+ *
+ * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued
+ * into the hrtimer queue and therefore never expired. Emulate expiry
+ * here taking #1 into account.
*/
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
@@ -681,8 +648,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
cur_setting->it_interval = ktime_to_timespec64(iv);
} else if (!timr->it_active) {
/*
- * SIGEV_NONE oneshot timers are never queued. Check them
- * below.
+ * SIGEV_NONE oneshot timers are never queued and therefore
+ * timr->it_active is always false. The check below
+ * vs. remaining time will handle this case.
+ *
+ * For all other timers there is nothing to update here, so
+ * return.
*/
if (!sig_none)
return;
@@ -691,18 +662,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
now = kc->clock_get_ktime(timr->it_clock);
/*
- * When a requeue is pending or this is a SIGEV_NONE timer move the
- * expiry time forward by intervals, so expiry is > now.
+ * If this is an interval timer and either has requeue pending or
+ * is a SIGEV_NONE timer move the expiry time forward by intervals,
+ * so expiry is > now.
*/
if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
timr->it_overrun += kc->timer_forward(timr, now);
remaining = kc->timer_remaining(timr, now);
- /* Return 0 only, when the timer is expired and not pending */
+ /*
+ * As @now is retrieved before a possible timer_forward() and
+ * cannot be reevaluated by the compiler, @remaining is based on the
+ * same @now value. Therefore @remaining is consistent vs. @now.
+ *
+ * Consequently all interval timers, i.e. @iv > 0, cannot have a
+ * remaining time <= 0 because timer_forward() guarantees to move
+ * them forward so that the next timer expiry is > @now.
+ */
if (remaining <= 0) {
/*
- * A single shot SIGEV_NONE timer must return 0, when
- * it is expired !
+ * A single shot SIGEV_NONE timer must return 0 when it is
+ * expired! Timers which have a real signal delivery mode
+ * must return a remaining time greater than 0 because the
+ * signal has not yet been delivered.
*/
if (!sig_none)
cur_setting->it_value.tv_nsec = 1;
@@ -711,11 +693,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
}
}
-/* Get the time remaining on a POSIX.1b interval timer. */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- struct k_itimer *timr;
const struct k_clock *kc;
+ struct k_itimer *timr;
unsigned long flags;
int ret = 0;
@@ -765,20 +746,29 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
#endif
-/*
- * Get the number of overruns of a POSIX.1b interval timer. This is to
- * be the overrun of the timer last delivered. At the same time we are
- * accumulating overruns on the next timer. The overrun is frozen when
- * the signal is delivered, either at the notify time (if the info block
- * is not queued) or at the actual delivery time (as we are informed by
- * the call back to posixtimer_rearm(). So all we need to do is
- * to pick up the frozen overrun.
+/**
+ * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer
+ * @timer_id: The timer ID which identifies the timer
+ *
+ * The "overrun count" of a timer is one plus the number of expiration
+ * intervals which have elapsed between the first expiry, which queues the
+ * signal, and the actual signal delivery. On signal delivery the "overrun
+ * count" is calculated and cached, so it can be returned directly here.
+ *
+ * As this is relative to the last queued signal, the returned overrun count
+ * is meaningless outside of the signal delivery path and even there it
+ * does not accurately reflect the current state when user space evaluates
+ * it.
+ *
+ * Returns:
+ * -EINVAL @timer_id is invalid
+ * 1..INT_MAX The number of overruns related to the last delivered signal
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
struct k_itimer *timr;
- int overrun;
unsigned long flags;
+ int overrun;
timr = lock_timer(timer_id, &flags);
if (!timr)
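
A hedged usage sketch for the semantics documented above: with the signal blocked, expiries past the first one only bump the overrun count, which is frozen when the queued signal is finally accepted. Names and timings are illustrative; link with -lrt where required:

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
                                .sigev_signo = SIGRTMIN };
        struct itimerspec its = {
                .it_value    = { .tv_sec = 0, .tv_nsec = 1000000 }, /* 1ms */
                .it_interval = { .tv_sec = 0, .tv_nsec = 1000000 },
        };
        sigset_t set;
        siginfo_t si;
        timer_t tid;

        sigemptyset(&set);
        sigaddset(&set, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &set, NULL);     /* queue, don't deliver */

        if (timer_create(CLOCK_MONOTONIC, &sev, &tid))
                return 1;
        timer_settime(tid, 0, &its, NULL);

        usleep(50000);                  /* let ~50 expiries pile up */
        sigwaitinfo(&set, &si);         /* accept the queued signal */

        /* Overruns are frozen at delivery; both views should match. */
        printf("si_overrun=%d timer_getoverrun=%d\n",
               si.si_overrun, timer_getoverrun(tid));

        timer_delete(tid);
        return 0;
}
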
@@ -831,10 +821,18 @@ static void common_timer_wait_running(struct k_itimer *timer)
}
/*
- * On PREEMPT_RT this prevent priority inversion against softirq kthread in
- * case it gets preempted while executing a timer callback. See comments in
- * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a
- * cpu_relax().
+ * On PREEMPT_RT this prevents priority inversion and a potential livelock
+ * against the ksoftirqd thread in case that ksoftirqd gets preempted while
+ * executing a hrtimer callback.
+ *
+ * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this
+ * just results in a cpu_relax().
+ *
+ * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is
+ * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this
+ * prevents spinning on a possibly scheduled-out task and a livelock
+ * when the task which tries to delete or disarm the timer has preempted
+ * the task which runs the expiry in task work context.
*/
static struct k_itimer *timer_wait_running(struct k_itimer *timer,
unsigned long *flags)
@@ -943,8 +941,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct __kernel_itimerspec __user *, new_setting,
struct __kernel_itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec, old_spec;
- struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
+ struct itimerspec64 new_spec, old_spec, *rtn;
int error = 0;
if (!new_setting)
@@ -953,6 +950,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
+ rtn = old_setting ? &old_spec : NULL;
error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
if (put_itimerspec64(&old_spec, old_setting))
@@ -1026,38 +1024,71 @@ retry_delete:
list_del(&timer->list);
spin_unlock(&current->sighand->siglock);
/*
- * This keeps any tasks waiting on the spin lock from thinking
- * they got something (see the lock code above).
+ * A concurrent lookup could check timer::it_signal locklessly. It
+ * will reevaluate with timer::it_lock held and observe the NULL.
*/
- timer->it_signal = NULL;
+ WRITE_ONCE(timer->it_signal, NULL);
unlock_timer(timer, flags);
- release_posix_timer(timer, IT_ID_SET);
+ posix_timer_unhash_and_free(timer);
return 0;
}
/*
- * return timer owned by the process, used by exit_itimers
+ * Delete a timer if it is armed, remove it from the hash and schedule it
+ * for RCU freeing.
*/
static void itimer_delete(struct k_itimer *timer)
{
-retry_delete:
- spin_lock_irq(&timer->it_lock);
+ unsigned long flags;
+ /*
+ * irqsave is required to make timer_wait_running() work.
+ */
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+retry_delete:
+ /*
+ * Even if the timer is no longer accessible from other tasks
+ * it still might be armed and queued in the underlying timer
+ * mechanism. Worse, that timer mechanism might run the expiry
+ * function concurrently.
+ */
if (timer_delete_hook(timer) == TIMER_RETRY) {
- spin_unlock_irq(&timer->it_lock);
+ /*
+ * Timer is expired concurrently, prevent livelocks
+ * and pointless spinning on RT.
+ *
+ * timer_wait_running() drops timer::it_lock, which opens
+ * the possibility for another task to delete the timer.
+ *
+ * That's not possible here because this is invoked from
+ * do_exit() only for the last thread of the thread group.
+ * So no other task can access and delete that timer.
+ */
+ if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
+ return;
+
goto retry_delete;
}
list_del(&timer->list);
- spin_unlock_irq(&timer->it_lock);
- release_posix_timer(timer, IT_ID_SET);
+ /*
+ * Setting timer::it_signal to NULL is technically not required
+ * here as nothing can access the timer anymore legitimately via
+ * the hash table. Set it to NULL nevertheless so that all deletion
+ * paths are consistent.
+ */
+ WRITE_ONCE(timer->it_signal, NULL);
+
+ spin_unlock_irqrestore(&timer->it_lock, flags);
+ posix_timer_unhash_and_free(timer);
}
/*
- * This is called by do_exit or de_thread, only when nobody else can
- * modify the signal->posix_timers list. Yet we need sighand->siglock
- * to prevent the race with /proc/pid/timers.
+ * Invoked from do_exit() when the last thread of a thread group exits.
+ * At that point no other task can access the timers of the dying
+ * task anymore.
*/
void exit_itimers(struct task_struct *tsk)
{
@@ -1067,10 +1098,12 @@ void exit_itimers(struct task_struct *tsk)
if (list_empty(&tsk->signal->posix_timers))
return;
+ /* Protect against concurrent read via /proc/$PID/timers */
spin_lock_irq(&tsk->sighand->siglock);
list_replace_init(&tsk->signal->posix_timers, &timers);
spin_unlock_irq(&tsk->sighand->siglock);
+ /* The timers are no longer accessible via tsk::signal */
while (!list_empty(&timers)) {
tmr = list_first_entry(&timers, struct k_itimer, list);
itimer_delete(tmr);
@@ -1089,6 +1122,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
if (get_timespec64(&new_tp, tp))
return -EFAULT;
+ /*
+ * Permission checks have to be done inside the clock specific
+ * setter callback.
+ */
return kc->clock_set(which_clock, &new_tp);
}
@@ -1139,6 +1176,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
return err;
}
+/**
+ * sys_clock_getres - Get the resolution of a clock
+ * @which_clock: The clock to get the resolution for
+ * @tp: Pointer to a a user space timespec64 for storage
+ *
+ * POSIX defines:
+ *
+ * "The clock_getres() function shall return the resolution of any
+ * clock. Clock resolutions are implementation-defined and cannot be set by
+ * a process. If the argument res is not NULL, the resolution of the
+ * specified clock shall be stored in the location pointed to by res. If
+ * res is NULL, the clock resolution is not returned. If the time argument
+ * of clock_settime() is not a multiple of res, then the value is truncated
+ * to a multiple of res."
+ *
+ * Due to the various hardware constraints the real resolution can vary
+ * wildly and even change during runtime when the underlying devices are
+ * replaced. The kernel can also use hardware devices with different
+ * resolutions for reading the time and for arming timers.
+ *
+ * The kernel therefore deviates from the POSIX spec in various aspects:
+ *
+ * 1) The resolution returned to user space
+ *
+ * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI,
+ * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALARM and CLOCK_MONOTONIC_RAW
+ * the kernel differentiates only two cases:
+ *
+ * I) Low resolution mode:
+ *
+ * When high resolution timers are disabled at compile time or runtime
+ * the resolution returned is nanoseconds per tick, which represents
+ * the precision at which timers expire.
+ *
+ * II) High resolution mode:
+ *
+ * When high resolution timers are enabled the resolution returned
+ * is always one nanosecond independent of the actual resolution of
+ * the underlying hardware devices.
+ *
+ * For CLOCK_*_ALARM the actual resolution depends on system
+ * state. When the system is running the resolution is the same as the
+ * resolution of the other clocks. During suspend the actual
+ * resolution is the resolution of the underlying RTC device which
+ * might be way less precise than the clockevent device used during
+ * running state.
+ *
+ * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution
+ * returned is always nanoseconds per tick.
+ *
+ * For CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID the resolution
+ * returned is always one nanosecond under the assumption that the
+ * underlying scheduler clock has a better resolution than nanoseconds
+ * per tick.
+ *
+ * For dynamic POSIX clocks (PTP devices) the resolution returned is
+ * always one nanosecond.
+ *
+ * 2) Effect on sys_clock_settime()
+ *
+ * The kernel does not truncate the time which is handed in to
+ * sys_clock_settime(). The kernel internal timekeeping always uses
+ * nanosecond precision independent of the clocksource device which is
+ * used to read the time from. The resolution of that device only
+ * affects the precision of the time returned by sys_clock_gettime().
+ *
+ * Returns:
+ * 0 Success. @tp contains the resolution
+ * -EINVAL @which_clock is not a valid clock ID
+ * -EFAULT Copying the resolution to @tp faulted
+ * -ENODEV Dynamic POSIX clock is not backed by a device
+ * -EOPNOTSUPP Dynamic POSIX clock does not support getres()
+ */
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
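
A hedged usage sketch that makes the documented behavior observable from user space. On a kernel with high resolution timers enabled the first two clocks report 1ns, while the coarse clock reports nanoseconds per tick (e.g. 4000000 at HZ=250):

#include <stdio.h>
#include <time.h>

static void show(const char *name, clockid_t id)
{
        struct timespec res;

        if (clock_getres(id, &res) == 0)
                printf("%-24s %ld.%09ld\n", name, (long)res.tv_sec, res.tv_nsec);
}

int main(void)
{
        show("CLOCK_REALTIME", CLOCK_REALTIME);
        show("CLOCK_MONOTONIC", CLOCK_MONOTONIC);
        show("CLOCK_MONOTONIC_COARSE", CLOCK_MONOTONIC_COARSE);
        show("CLOCK_THREAD_CPUTIME_ID", CLOCK_THREAD_CPUTIME_ID);
        return 0;
}
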
@@ -1230,7 +1340,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
#endif
/*
- * nanosleep for monotonic and realtime clocks
+ * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI
*/
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
@@ -1242,8 +1352,13 @@ static int common_nsleep(const clockid_t which_clock, int flags,
which_clock);
}
+/*
+ * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME
+ *
+ * Absolute nanosleeps for these clocks are time-namespace adjusted.
+ */
static int common_nsleep_timens(const clockid_t which_clock, int flags,
- const struct timespec64 *rqtp)
+ const struct timespec64 *rqtp)
{
ktime_t texp = timespec64_to_ktime(*rqtp);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 65b8658da829..e9138cd7a0f5 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- ktime_t next_p;
- u32 rem;
-
tick_do_timer_cpu = cpu;
-
- next_p = ktime_get();
- div_u64_rem(next_p, TICK_NSEC, &rem);
- if (rem) {
- next_p -= rem;
- next_p += TICK_NSEC;
- }
-
- tick_next_period = next_p;
+ tick_next_period = ktime_get();
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 52254679ec48..4df14db4da49 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void)
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
- if (last_jiffies_update == 0)
+ if (last_jiffies_update == 0) {
+ u32 rem;
+
+ /*
+ * Ensure that the tick is aligned to a multiple of
+ * TICK_NSEC.
+ */
+ div_u64_rem(tick_next_period, TICK_NSEC, &rem);
+ if (rem)
+ tick_next_period += TICK_NSEC - rem;
+
last_jiffies_update = tick_next_period;
+ }
period = last_jiffies_update;
write_seqcount_end(&jiffies_seq);
raw_spin_unlock(&jiffies_lock);
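
With tick_setup_device() now storing the raw ktime_get() value, the alignment to a TICK_NSEC boundary happens exactly once here. A hedged sketch of that rounding; div_u64_rem() is replaced by plain division and TICK_NSEC is assumed to be 4ms (HZ=250):

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 4000000ULL /* assumption: HZ == 250 */

/* Round a nanosecond timestamp up to the next multiple of TICK_NSEC. */
static uint64_t align_to_tick(uint64_t t)
{
        uint64_t rem = t % TICK_NSEC;   /* div_u64_rem() in the kernel */

        if (rem)
                t += TICK_NSEC - rem;
        return t;
}

int main(void)
{
        printf("%llu -> %llu\n", 12345678ULL,
               (unsigned long long)align_to_tick(12345678ULL));
        return 0;
}
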
@@ -1030,7 +1041,7 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit < 10)
+ if (ratelimit >= 10)
return false;
/* On RT, softirqs handling may be waiting on some lock */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64a4dde073ef..5d2c5678b66f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5199,7 +5199,7 @@ static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.read_iter = seq_read_iter,
- .splice_read = generic_file_splice_read,
+ .splice_read = copy_splice_read,
.write = tracing_write_stub,
.llseek = tracing_lseek,
.release = tracing_release,
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index dbb14705d0d3..8df0550415e7 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -50,6 +50,18 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
+ * User register flags are not allowed yet, keep them here until we are
+ * ready to expose them to the user ABI.
+ */
+enum user_reg_flag {
+ /* Event will not delete upon last reference closing */
+ USER_EVENT_REG_PERSIST = 1U << 0,
+
+ /* This value or above is currently non-ABI */
+ USER_EVENT_REG_MAX = 1U << 1,
+};
+
+/*
* Stores the system name, tables, and locks for a group of events. This
* allows isolation for events by various means.
*/
@@ -85,8 +97,10 @@ struct user_event {
struct hlist_node node;
struct list_head fields;
struct list_head validators;
+ struct work_struct put_work;
refcount_t refcnt;
int min_size;
+ int reg_flags;
char status;
};
@@ -165,76 +179,151 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
- struct user_event **newuser);
+ struct user_event **newuser, int reg_flags);
static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
+static int destroy_user_event(struct user_event *user);
static u32 user_event_key(char *name)
{
return jhash(name, strlen(name), 0);
}
-static void user_event_group_destroy(struct user_event_group *group)
+static struct user_event *user_event_get(struct user_event *user)
{
- kfree(group->system_name);
- kfree(group);
+ refcount_inc(&user->refcnt);
+
+ return user;
}
-static char *user_event_group_system_name(struct user_namespace *user_ns)
+static void delayed_destroy_user_event(struct work_struct *work)
{
- char *system_name;
- int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+ struct user_event *user = container_of(
+ work, struct user_event, put_work);
- if (user_ns != &init_user_ns) {
+ mutex_lock(&event_mutex);
+
+ if (!refcount_dec_and_test(&user->refcnt))
+ goto out;
+
+ if (destroy_user_event(user)) {
/*
- * Unexpected at this point:
- * We only currently support init_user_ns.
- * When we enable more, this will trigger a failure so log.
+ * The only reason this would fail here is if we cannot
+ * update the visibility of the event. In this case the
+ * event stays in the hashtable, waiting for someone to
+ * attempt to delete it later.
*/
- pr_warn("user_events: Namespace other than init_user_ns!\n");
- return NULL;
+ pr_warn("user_events: Unable to delete event\n");
+ refcount_set(&user->refcnt, 1);
}
+out:
+ mutex_unlock(&event_mutex);
+}
- system_name = kmalloc(len, GFP_KERNEL);
+static void user_event_put(struct user_event *user, bool locked)
+{
+ bool delete;
- if (!system_name)
- return NULL;
+ if (unlikely(!user))
+ return;
- snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+ /*
+ * When the event is not enabled for auto-delete there will always
+ * be at least 1 reference to the event. During the event creation
+ * we initially set the refcnt to 2 to achieve this. In those cases
+ * the caller must acquire event_mutex and, after the decrement, check
+ * if the refcnt is 1, meaning this is the last reference. When auto
+ * delete is enabled, there will only be 1 ref, i.e. the refcnt will
+ * only be set to 1 during creation to allow the below checks to go
+ * through upon the last put. The last put must always be done with
+ * the event mutex held.
+ */
+ if (!locked) {
+ lockdep_assert_not_held(&event_mutex);
+ delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
+ } else {
+ lockdep_assert_held(&event_mutex);
+ delete = refcount_dec_and_test(&user->refcnt);
+ }
- return system_name;
+ if (!delete)
+ return;
+
+ /*
+ * We now have the event_mutex in all cases, which ensures that
+ * no new references will be taken until event_mutex is released.
+ * New references come through find_user_event(), which requires
+ * the event_mutex to be held.
+ */
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* We should not get here when persist flag is set */
+ pr_alert("BUG: Auto-delete engaged on persistent event\n");
+ goto out;
+ }
+
+ /*
+ * Unfortunately we have to attempt the actual destroy in a work
+ * queue. This is because not all cases handle a trace_event_call
+ * being removed within the class->reg() operation for unregister.
+ */
+ INIT_WORK(&user->put_work, delayed_destroy_user_event);
+
+ /*
+ * Since the event is still in the hashtable, we have to re-inc
+ * the ref count to 1. This count will be decremented and checked
+ * in the work queue to ensure it's still the last ref. This is
+ * needed because a user-process could register the same event in
+ * between the time of event_mutex release and the work queue
+ * running the delayed destroy. If we removed the item now from
+ * the hashtable, this would result in a timing window where a
+ * user process would fail a register because the trace_event_call
+ * register would fail in the tracing layers.
+ */
+ refcount_set(&user->refcnt, 1);
+
+ if (WARN_ON_ONCE(!schedule_work(&user->put_work))) {
+ /*
+ * If we fail we must wait for an admin to attempt delete or
+ * another register/close of the event, whichever is first.
+ */
+ pr_warn("user_events: Unable to queue delayed destroy\n");
+ }
+out:
+ /* If we didn't hold event_mutex on entry, drop it now */
+ if (!locked)
+ mutex_unlock(&event_mutex);
}
-static inline struct user_event_group
-*user_event_group_from_user_ns(struct user_namespace *user_ns)
+static void user_event_group_destroy(struct user_event_group *group)
{
- if (user_ns == &init_user_ns)
- return init_group;
-
- return NULL;
+ kfree(group->system_name);
+ kfree(group);
}
-static struct user_event_group *current_user_event_group(void)
+static char *user_event_group_system_name(void)
{
- struct user_namespace *user_ns = current_user_ns();
- struct user_event_group *group = NULL;
+ char *system_name;
+ int len = sizeof(USER_EVENTS_SYSTEM) + 1;
- while (user_ns) {
- group = user_event_group_from_user_ns(user_ns);
+ system_name = kmalloc(len, GFP_KERNEL);
- if (group)
- break;
+ if (!system_name)
+ return NULL;
- user_ns = user_ns->parent;
- }
+ snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
- return group;
+ return system_name;
}
-static struct user_event_group
-*user_event_group_create(struct user_namespace *user_ns)
+static struct user_event_group *current_user_event_group(void)
+{
+ return init_group;
+}
+
+static struct user_event_group *user_event_group_create(void)
{
struct user_event_group *group;
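
user_event_put() above pairs the final reference drop with taking event_mutex, so no new reference can be handed out concurrently. A hedged userspace analog of that refcount_dec_and_mutex_lock() pattern, built from C11 atomics and a pthread mutex (a sketch of the idea, not the kernel's refcount implementation):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;

static bool ref_dec_and_mutex_lock(_Atomic int *ref, pthread_mutex_t *m)
{
        int old = atomic_load(ref);

        /* Fast path: drop a reference that is provably not the last one. */
        while (old > 1) {
                if (atomic_compare_exchange_weak(ref, &old, old - 1))
                        return false;
        }

        /* Possibly the last reference: serialize against new lookups,
         * which take references under the same mutex, then re-check. */
        pthread_mutex_lock(m);
        if (atomic_fetch_sub(ref, 1) != 1) {
                pthread_mutex_unlock(m);
                return false;
        }
        return true;    /* caller owns teardown, mutex held */
}

int main(void)
{
        _Atomic int refcnt = 2;

        printf("last? %d\n", ref_dec_and_mutex_lock(&refcnt, &event_mutex));
        printf("last? %d\n", ref_dec_and_mutex_lock(&refcnt, &event_mutex));
        pthread_mutex_unlock(&event_mutex);
        return 0;
}
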
@@ -243,7 +332,7 @@ static struct user_event_group
if (!group)
return NULL;
- group->system_name = user_event_group_system_name(user_ns);
+ group->system_name = user_event_group_system_name();
if (!group->system_name)
goto error;
@@ -259,12 +348,13 @@ error:
return NULL;
};
-static void user_event_enabler_destroy(struct user_event_enabler *enabler)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler,
+ bool locked)
{
list_del_rcu(&enabler->mm_enablers_link);
/* No longer tracking the event via the enabler */
- refcount_dec(&enabler->event->refcnt);
+ user_event_put(enabler->event, locked);
kfree(enabler);
}
@@ -326,7 +416,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
/* User asked for enabler to be removed during fault */
if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, true);
goto out;
}
@@ -501,14 +591,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig,
if (!enabler)
return false;
- enabler->event = orig->event;
+ enabler->event = user_event_get(orig->event);
enabler->addr = orig->addr;
/* Only dup part of value (ignore future flags, etc) */
enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
- refcount_inc(&enabler->event->refcnt);
-
/* Enablers not exposed yet, RCU not required */
list_add(&enabler->mm_enablers_link, &mm->enablers);
@@ -625,7 +713,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm)
struct user_event_enabler *enabler, *next;
list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link)
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, false);
mmdrop(mm->mm);
kfree(mm);
@@ -780,7 +868,7 @@ retry:
* exit or run exec(), which includes forks and clones.
*/
if (!*write_result) {
- refcount_inc(&enabler->event->refcnt);
+ user_event_get(user);
list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers);
}
@@ -803,7 +891,12 @@ out:
static __always_inline __must_check
bool user_event_last_ref(struct user_event *user)
{
- return refcount_read(&user->refcnt) == 1;
+ int last = 0;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST)
+ last = 1;
+
+ return refcount_read(&user->refcnt) == last;
}
static __always_inline __must_check
@@ -842,7 +935,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
* Upon success user_event has its ref count increased by 1.
*/
static int user_event_parse_cmd(struct user_event_group *group,
- char *raw_command, struct user_event **newuser)
+ char *raw_command, struct user_event **newuser,
+ int reg_flags)
{
char *name = raw_command;
char *args = strpbrk(name, " ");
@@ -856,7 +950,7 @@ static int user_event_parse_cmd(struct user_event_group *group,
if (flags)
*flags++ = '\0';
- return user_event_parse(group, name, args, flags, newuser);
+ return user_event_parse(group, name, args, flags, newuser, reg_flags);
}
static int user_field_array_size(const char *type)
@@ -1367,10 +1461,8 @@ static struct user_event *find_user_event(struct user_event_group *group,
*outkey = key;
hash_for_each_possible(group->register_table, user, node, key)
- if (!strcmp(EVENT_NAME(user), name)) {
- refcount_inc(&user->refcnt);
- return user;
- }
+ if (!strcmp(EVENT_NAME(user), name))
+ return user_event_get(user);
return NULL;
}
@@ -1432,7 +1524,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
if (unlikely(!entry))
return;
- if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i)))
goto discard;
if (!list_empty(&user->validators) &&
@@ -1473,7 +1565,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i,
perf_fetch_caller_regs(regs);
- if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i)))
goto discard;
if (!list_empty(&user->validators) &&
@@ -1584,12 +1676,12 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
- refcount_inc(&user->refcnt);
+ user_event_get(user);
update_enable_bit_for(user);
return 0;
dec:
update_enable_bit_for(user);
- refcount_dec(&user->refcnt);
+ user_event_put(user, true);
return 0;
}
@@ -1620,10 +1712,11 @@ static int user_event_create(const char *raw_command)
mutex_lock(&group->reg_mutex);
- ret = user_event_parse_cmd(group, name, &user);
+ /* Dyn events persist, otherwise they would be cleaned up immediately */
+ ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
if (!ret)
- refcount_dec(&user->refcnt);
+ user_event_put(user, false);
mutex_unlock(&group->reg_mutex);
@@ -1745,6 +1838,8 @@ static bool user_event_match(const char *system, const char *event,
if (match && argc > 0)
match = user_fields_match(user, argc, argv);
+ else if (match && argc == 0)
+ match = list_empty(&user->fields);
return match;
}
@@ -1781,11 +1876,17 @@ static int user_event_trace_register(struct user_event *user)
*/
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
- struct user_event **newuser)
+ struct user_event **newuser, int reg_flags)
{
int ret;
u32 key;
struct user_event *user;
+ int argc = 0;
+ char **argv;
+
+ /* User register flags are not ready yet */
+ if (reg_flags != 0 || flags != NULL)
+ return -EINVAL;
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
@@ -1793,13 +1894,35 @@ static int user_event_parse(struct user_event_group *group, char *name,
mutex_unlock(&event_mutex);
if (user) {
- *newuser = user;
- /*
- * Name is allocated by caller, free it since it already exists.
- * Caller only worries about failure cases for freeing.
- */
- kfree(name);
+ if (args) {
+ argv = argv_split(GFP_KERNEL, args, &argc);
+ if (!argv) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = user_fields_match(user, argc, (const char **)argv);
+ argv_free(argv);
+
+ } else {
+ ret = list_empty(&user->fields);
+ }
+
+ if (ret) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ } else {
+ ret = -EADDRINUSE;
+ goto error;
+ }
+
return 0;
+error:
+ user_event_put(user, false);
+ return ret;
}
user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
@@ -1852,8 +1975,15 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (ret)
goto put_user_lock;
- /* Ensure we track self ref and caller ref (2) */
- refcount_set(&user->refcnt, 2);
+ user->reg_flags = reg_flags;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* Ensure we track self ref and caller ref (2) */
+ refcount_set(&user->refcnt, 2);
+ } else {
+ /* Ensure we track only caller ref (1) */
+ refcount_set(&user->refcnt, 1);
+ }
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
@@ -1885,7 +2015,7 @@ static int delete_user_event(struct user_event_group *group, char *name)
if (!user)
return -ENOENT;
- refcount_dec(&user->refcnt);
+ user_event_put(user, true);
if (!user_event_last_ref(user))
return -EBUSY;
@@ -2044,9 +2174,7 @@ static int user_events_ref_add(struct user_event_file_info *info,
for (i = 0; i < count; ++i)
new_refs->events[i] = refs->events[i];
- new_refs->events[i] = user;
-
- refcount_inc(&user->refcnt);
+ new_refs->events[i] = user_event_get(user);
rcu_assign_pointer(info->refs, new_refs);
@@ -2077,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (ret)
return ret;
- /* Ensure no flags, since we don't support any yet */
- if (kreg->flags != 0)
+ /* Ensure only valid flags */
+ if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
return -EINVAL;
/* Ensure supported size */
@@ -2150,7 +2278,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
return ret;
}
- ret = user_event_parse_cmd(info->group, name, &user);
+ ret = user_event_parse_cmd(info->group, name, &user, reg.flags);
if (ret) {
kfree(name);
@@ -2160,7 +2288,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
ret = user_events_ref_add(info, user);
/* No longer need parse ref, ref_add either worked or not */
- refcount_dec(&user->refcnt);
+ user_event_put(user, false);
/* Positive number is index and valid */
if (ret < 0)
@@ -2309,7 +2437,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, true);
/* Removed at least one */
ret = 0;
@@ -2367,7 +2495,6 @@ static int user_events_release(struct inode *node, struct file *file)
struct user_event_file_info *info = file->private_data;
struct user_event_group *group;
struct user_event_refs *refs;
- struct user_event *user;
int i;
if (!info)
@@ -2391,12 +2518,9 @@ static int user_events_release(struct inode *node, struct file *file)
* The underlying user_events are ref counted, and cannot be freed.
* After this decrement, the user_events may be freed elsewhere.
*/
- for (i = 0; i < refs->count; ++i) {
- user = refs->events[i];
+ for (i = 0; i < refs->count; ++i)
+ user_event_put(refs->events[i], false);
- if (user)
- refcount_dec(&user->refcnt);
- }
out:
file->private_data = NULL;
@@ -2577,7 +2701,7 @@ static int __init trace_events_user_init(void)
if (!fault_cache)
return -ENOMEM;
- init_group = user_event_group_create(&init_user_ns);
+ init_group = user_event_group_create();
if (!init_group) {
kmem_cache_destroy(fault_cache);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 15f05faaae44..1e33f367783e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
int ret;
void *pos;
- list_for_each_entry(field, head, link) {
+ list_for_each_entry_reverse(field, head, link) {
trace_seq_printf(&iter->seq, " %s=", field->name);
if (field->offset + field->size > iter->ent_size) {
trace_seq_puts(&iter->seq, "<OVERFLOW>");
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index e91cb4c2833f..d0b6b390ee42 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Red Hat, Inc.");
static inline bool lock_wqueue(struct watch_queue *wqueue)
{
spin_lock_bh(&wqueue->lock);
- if (unlikely(wqueue->defunct)) {
+ if (unlikely(!wqueue->pipe)) {
spin_unlock_bh(&wqueue->lock);
return false;
}
@@ -104,9 +104,6 @@ static bool post_one_notification(struct watch_queue *wqueue,
unsigned int head, tail, mask, note, offset, len;
bool done = false;
- if (!pipe)
- return false;
-
spin_lock_irq(&pipe->rd_wait.lock);
mask = pipe->ring_size - 1;
@@ -603,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new notifications from being stored. */
- wqueue->defunct = true;
+ /*
+ * This pipe can be freed by callers like free_pipe_info().
+ * Removing this reference also prevents new notifications.
+ */
+ wqueue->pipe = NULL;
while (!hlist_empty(&wqueue->watches)) {
watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4666a1a92a31..c913e333cce8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -705,12 +705,17 @@ static void clear_work_data(struct work_struct *work)
set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
+static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
+{
+ return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
+}
+
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ return work_struct_pwq(data);
else
return NULL;
}
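
work_struct_pwq() decodes a tagged pointer: flag bits live in the low bits of an aligned pool_workqueue pointer and are masked off on extraction. A hedged standalone sketch of the scheme; the mask and flag values are stand-ins, not WORK_STRUCT_WQ_DATA_MASK's real layout:

#include <stdint.h>
#include <stdio.h>

#define FLAG_PWQ        0x01UL          /* stand-in for WORK_STRUCT_PWQ */
#define DATA_MASK       (~0xffUL)       /* stand-in low-bit flag mask */

struct pool_workqueue { int id; };

/* Strip the flag bits to recover the pointer. Works because the object
 * is aligned such that its address has zeros in the masked-off bits. */
static struct pool_workqueue *decode_pwq(unsigned long data)
{
        return (struct pool_workqueue *)(data & DATA_MASK);
}

int main(void)
{
        static struct pool_workqueue pwq __attribute__((aligned(256))) = { 42 };
        unsigned long data = (unsigned long)&pwq | FLAG_PWQ;

        if (data & FLAG_PWQ)
                printf("pwq id: %d\n", decode_pwq(data)->id);
        return 0;
}
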
@@ -738,8 +743,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
assert_rcu_or_pool_mutex();
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
+ return work_struct_pwq(data)->pool;
pool_id = data >> WORK_OFFQ_POOL_SHIFT;
if (pool_id == WORK_OFFQ_POOL_NONE)
@@ -760,8 +764,7 @@ static int get_work_pool_id(struct work_struct *work)
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
- return ((struct pool_workqueue *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+ return work_struct_pwq(data)->pool->id;
return data >> WORK_OFFQ_POOL_SHIFT;
}