diff options
Diffstat (limited to 'kernel/time')
-rw-r--r-- | kernel/time/clocksource-wdtest.c | 1 | ||||
-rw-r--r-- | kernel/time/test_udelay.c | 1 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 23 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 22 | ||||
-rw-r--r-- | kernel/time/time_test.c | 1 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 131 | ||||
-rw-r--r-- | kernel/time/timer.c | 2 | ||||
-rw-r--r-- | kernel/time/timer_migration.c | 393 | ||||
-rw-r--r-- | kernel/time/timer_migration.h | 27 |
9 files changed, 376 insertions, 225 deletions
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index d06185e054ea..62e73444ffe4 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -22,6 +22,7 @@ #include "tick-internal.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 20d5df631570..783f2297111b 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -155,5 +155,6 @@ static void __exit udelay_test_exit(void) module_exit(udelay_test_exit); +MODULE_DESCRIPTION("udelay test module"); MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 771d1e040303..b4843099a8da 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,6 +1141,7 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1148,6 +1149,28 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* + * If the broadcast force bit of the current CPU is set, + * then the current CPU has not yet reprogrammed the local + * timer device to avoid a ping-pong race. See + * ___tick_broadcast_oneshot_control(). + * + * If the broadcast device is hrtimer based then + * programming the broadcast event below does not have any + * effect because the local clockevent device is not + * running and not programmed because the broadcast event + * is not earlier than the pending event of the local clock + * event device. As a consequence all CPUs waiting for a + * broadcast event are stuck forever. + * + * Detect this condition and reprogram the cpu local timer + * device to avoid the starvation. + */ + if (tick_check_broadcast_expired()) { + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); + tick_program_event(td->evtdev->next_event, 1); + } + /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 71a792cd8936..753a184c7090 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1026,10 +1026,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; - WARN_ON_ONCE(1); - printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", - basemono, ts->next_tick, dev->next_event, - hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " + "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, + dev->next_event, hrtimer_active(&ts->sched_timer), + hrtimer_get_expires(&ts->sched_timer)); } /* @@ -1385,20 +1385,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } -/** - * tick_nohz_get_idle_calls - return the current idle calls counter value - * - * Called from the schedutil frequency scaling governor in scheduler context. - * - * Return: the current idle calls counter value for the current CPU - */ -unsigned long tick_nohz_get_idle_calls(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - return ts->idle_calls; -} - static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 3e5d422dd15c..2889763165e5 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = { }; kunit_test_suite(time_test_suite); +MODULE_DESCRIPTION("time unit test suite"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4e18db1819f8..2fa87dcfeda9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1195,6 +1195,108 @@ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) return false; } +static bool convert_clock(u64 *val, u32 numerator, u32 denominator) +{ + u64 rem, res; + + if (!numerator || !denominator) + return false; + + res = div64_u64_rem(*val, denominator, &rem) * numerator; + *val = res + div_u64(rem * numerator, denominator); + return true; +} + +static bool convert_base_to_cs(struct system_counterval_t *scv) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + u32 num, den; + + /* The timestamp was taken from the time keeper clock source */ + if (cs->id == scv->cs_id) + return true; + + /* + * Check whether cs_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != scv->cs_id) + return false; + + num = scv->use_nsecs ? cs->freq_khz : base->numerator; + den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; + + if (!convert_clock(&scv->cycles, num, den)) + return false; + + scv->cycles += base->offset; + return true; +} + +static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + + /* + * Check whether base_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != base_id) + return false; + + *cycles -= base->offset; + if (!convert_clock(cycles, base->denominator, base->numerator)) + return false; + return true; +} + +static bool convert_ns_to_cs(u64 *delta) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + + if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) + return false; + + *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); + return true; +} + +/** + * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp + * @treal: CLOCK_REALTIME timestamp to convert + * @base_id: base clocksource id + * @cycles: pointer to store the converted base clock timestamp + * + * Converts a supplied, future realtime clock value to the corresponding base clock value. + * + * Return: true if the conversion is successful, false otherwise. + */ +bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 delta; + + do { + seq = read_seqcount_begin(&tk_core.seq); + if ((u64)treal < tk->tkr_mono.base_real) + return false; + delta = (u64)treal - tk->tkr_mono.base_real; + if (!convert_ns_to_cs(&delta)) + return false; + *cycles = tk->tkr_mono.cycle_last + delta; + if (!convert_cs_to_base(cycles, base_id)) + return false; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return true; +} +EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and @@ -1241,7 +1343,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || - tk->tkr_mono.clock->id != system_counterval.cs_id) + !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; @@ -1307,6 +1409,30 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * timekeeping_clocksource_has_base - Check whether the current clocksource + * is based on given a base clock + * @id: base clocksource ID + * + * Note: The return value is a snapshot which can become invalid right + * after the function returns. + * + * Return: true if the timekeeper clocksource has a base clock with @id, + * false otherwise + */ +bool timekeeping_clocksource_has_base(enum clocksource_ids id) +{ + /* + * This is a snapshot, so no point in using the sequence + * count. Just prevent the compiler from re-evaluating @base as the + * clocksource might change concurrently. + */ + struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); + + return base ? base->id == id : false; +} +EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); + +/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -2421,6 +2547,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { @@ -2489,6 +2616,8 @@ int do_adjtimex(struct __kernel_timex *txc) #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function + * @phase_ts: Pointer to timespec64 structure representing phase timestamp + * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 48288dd4a102..64b0d8a0aa0f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -289,7 +289,7 @@ static void timers_update_migration(void) } #ifdef CONFIG_SYSCTL -static int timer_migration_handler(struct ctl_table *table, int write, +static int timer_migration_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 84413114db5c..8d57f7686bb0 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -475,9 +475,54 @@ static bool tmigr_check_lonely(struct tmigr_group *group) return bitmap_weight(&active, BIT_CNT) <= 1; } -typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); +/** + * struct tmigr_walk - data required for walking the hierarchy + * @nextexp: Next CPU event expiry information which is handed into + * the timer migration code by the timer code + * (get_next_timer_interrupt()) + * @firstexp: Contains the first event expiry information when + * hierarchy is completely idle. When CPU itself was the + * last going idle, information makes sure, that CPU will + * be back in time. When using this value in the remote + * expiry case, firstexp is stored in the per CPU tmigr_cpu + * struct of CPU which expires remote timers. It is updated + * in top level group only. Be aware, there could occur a + * new top level of the hierarchy between the 'top level + * call' in tmigr_update_events() and the check for the + * parent group in walk_groups(). Then @firstexp might + * contain a value != KTIME_MAX even if it was not the + * final top level. This is not a problem, as the worst + * outcome is a CPU which might wake up a little early. + * @evt: Pointer to tmigr_event which needs to be queued (of idle + * child group) + * @childmask: groupmask of child group + * @remote: Is set, when the new timer path is executed in + * tmigr_handle_remote_cpu() + * @basej: timer base in jiffies + * @now: timer base monotonic + * @check: is set if there is the need to handle remote timers; + * required in tmigr_requires_handle_remote() only + * @tmc_active: this flag indicates, whether the CPU which triggers + * the hierarchy walk is !idle in the timer migration + * hierarchy. When the CPU is idle and the whole hierarchy is + * idle, only the first event of the top level has to be + * considered. + */ +struct tmigr_walk { + u64 nextexp; + u64 firstexp; + struct tmigr_event *evt; + u8 childmask; + bool remote; + unsigned long basej; + u64 now; + bool check; + bool tmc_active; +}; + +typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, void *data, +static void __walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { struct tmigr_group *child = NULL, *group = tmc->tmgroup; @@ -490,64 +535,17 @@ static void __walk_groups(up_f up, void *data, child = group; group = group->parent; + data->childmask = child->groupmask; } while (group); } -static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) +static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); __walk_groups(up, data, tmc); } -/** - * struct tmigr_walk - data required for walking the hierarchy - * @nextexp: Next CPU event expiry information which is handed into - * the timer migration code by the timer code - * (get_next_timer_interrupt()) - * @firstexp: Contains the first event expiry information when last - * active CPU of hierarchy is on the way to idle to make - * sure CPU will be back in time. - * @evt: Pointer to tmigr_event which needs to be queued (of idle - * child group) - * @childmask: childmask of child group - * @remote: Is set, when the new timer path is executed in - * tmigr_handle_remote_cpu() - */ -struct tmigr_walk { - u64 nextexp; - u64 firstexp; - struct tmigr_event *evt; - u8 childmask; - bool remote; -}; - -/** - * struct tmigr_remote_data - data required for remote expiry hierarchy walk - * @basej: timer base in jiffies - * @now: timer base monotonic - * @firstexp: returns expiry of the first timer in the idle timer - * migration hierarchy to make sure the timer is handled in - * time; it is stored in the per CPU tmigr_cpu struct of - * CPU which expires remote timers - * @childmask: childmask of child group - * @check: is set if there is the need to handle remote timers; - * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. - */ -struct tmigr_remote_data { - unsigned long basej; - u64 now; - u64 firstexp; - u8 childmask; - bool check; - bool tmc_active; -}; - /* * Returns the next event of the timerqueue @group->events * @@ -618,10 +616,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) static bool tmigr_active_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -649,8 +646,7 @@ static bool tmigr_active_up(struct tmigr_group *group, } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); - if ((walk_done == false) && group->parent) - data->childmask = group->childmask; + trace_tmigr_group_set_cpu_active(group, newstate, childmask); /* * The group is active (again). The group event might be still queued @@ -666,8 +662,6 @@ static bool tmigr_active_up(struct tmigr_group *group, */ group->groupevt.ignore = true; - trace_tmigr_group_set_cpu_active(group, newstate, childmask); - return walk_done; } @@ -675,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) { struct tmigr_walk data; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; trace_tmigr_cpu_active(tmc); @@ -860,10 +854,8 @@ unlock: static bool tmigr_new_timer_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_walk *data = ptr; - return tmigr_update_events(group, child, data); } @@ -995,9 +987,8 @@ unlock: static bool tmigr_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; struct tmigr_event *evt; unsigned long jif; u8 childmask; @@ -1034,12 +1025,10 @@ again: } /* - * Update of childmask for the next level and keep track of the expiry - * of the first event that needs to be handled (group->next_expiry was - * updated by tmigr_next_expired_groupevt(), next was set by - * tmigr_handle_remote_cpu()). + * Keep track of the expiry of the first event that needs to be handled + * (group->next_expiry was updated by tmigr_next_expired_groupevt(), + * next was set by tmigr_handle_remote_cpu()). */ - data->childmask = group->childmask; data->firstexp = group->next_expiry; raw_spin_unlock_irq(&group->lock); @@ -1055,12 +1044,12 @@ again: void tmigr_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; if (tmigr_is_not_available(tmc)) return; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; /* @@ -1068,7 +1057,7 @@ void tmigr_handle_remote(void) * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done. */ - if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { + if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { /* * If this CPU was an idle migrator, make sure to clear its wakeup * value so it won't chase timers that have already expired elsewhere. @@ -1097,9 +1086,8 @@ void tmigr_handle_remote(void) static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; u8 childmask; childmask = data->childmask; @@ -1118,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, * group before reading the next_expiry value. */ if (group->parent && !data->tmc_active) - goto out; + return false; /* * The lock is required on 32bit architectures to read the variable @@ -1143,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, raw_spin_unlock(&group->lock); } -out: - /* Update of childmask for the next level */ - data->childmask = group->childmask; return false; } @@ -1157,7 +1142,7 @@ out: bool tmigr_requires_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; unsigned long jif; bool ret = false; @@ -1165,7 +1150,7 @@ bool tmigr_requires_handle_remote(void) return ret; data.now = get_jiffies_update(&jif); - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; data.tmc_active = !tmc->idle; data.check = false; @@ -1230,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp) if (nextexp != tmc->cpuevt.nextevt.expires || tmc->cpuevt.ignore) { ret = tmigr_new_timer(tmc, nextexp); + /* + * Make sure the reevaluation of timers in idle path + * will not miss an event. + */ + WRITE_ONCE(tmc->wakeup, ret); } } - /* - * Make sure the reevaluation of timers in idle path will not miss an - * event. - */ - WRITE_ONCE(tmc->wakeup, ret); - trace_tmigr_cpu_new_timer_idle(tmc, nextexp); raw_spin_unlock(&tmc->lock); return ret; @@ -1245,10 +1229,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp) static bool tmigr_inactive_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate, childstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -1299,9 +1282,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group, WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); - if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, - newstate.state)) + if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { + trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); break; + } /* * The memory barrier is paired with the cmpxchg() in @@ -1317,22 +1301,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group, /* Event Handling */ tmigr_update_events(group, child, data); - if (group->parent && (walk_done == false)) - data->childmask = group->childmask; - - /* - * data->firstexp was set by tmigr_update_events() and contains the - * expiry of the first global event which needs to be handled. It - * differs from KTIME_MAX if: - * - group is the top level group and - * - group is idle (which means CPU was the last active CPU in the - * hierarchy) and - * - there is a pending event in the hierarchy - */ - WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); - - trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); - return walk_done; } @@ -1341,7 +1309,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) struct tmigr_walk data = { .nextexp = nextexp, .firstexp = KTIME_MAX, .evt = &tmc->cpuevt, - .childmask = tmc->childmask }; + .childmask = tmc->groupmask }; /* * If nextexp is KTIME_MAX, the CPU event will be ignored because the @@ -1400,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp) * the only one in the level 0 group; and if it is the * only one in level 0 group, but there are more than a * single group active on the way to top level) - * * nextevt - when CPU is offline and has to handle timer on his own + * * nextevt - when CPU is offline and has to handle timer on its own * or when on the way to top in every group only a single * child is active but @nextevt is before the lowest * next_expiry encountered while walking up to top level. @@ -1419,7 +1387,7 @@ u64 tmigr_quick_check(u64 nextevt) if (WARN_ON_ONCE(tmc->idle)) return nextevt; - if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) + if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) return KTIME_MAX; do { @@ -1442,6 +1410,66 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; } +/* + * tmigr_trigger_active() - trigger a CPU to become active again + * + * This function is executed on a CPU which is part of cpu_online_mask, when the + * last active CPU in the hierarchy is offlining. With this, it is ensured that + * the other CPU is active and takes over the migrator duty. + */ +static long tmigr_trigger_active(void *unused) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + WARN_ON_ONCE(!tmc->online || tmc->idle); + + return 0; +} + +static int tmigr_cpu_offline(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + int migrator; + u64 firstexp; + + raw_spin_lock_irq(&tmc->lock); + tmc->online = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_offline(tmc); + raw_spin_unlock_irq(&tmc->lock); + + if (firstexp != KTIME_MAX) { + migrator = cpumask_any_but(cpu_online_mask, cpu); + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } + + return 0; +} + +static int tmigr_cpu_online(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + /* Check whether CPU data was successfully initialized */ + if (WARN_ON_ONCE(!tmc->tmgroup)) + return -EINVAL; + + raw_spin_lock_irq(&tmc->lock); + trace_tmigr_cpu_online(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->online = true; + raw_spin_unlock_irq(&tmc->lock); + return 0; +} + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1514,21 +1542,25 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, } static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent) + struct tmigr_group *parent, + bool activate) { - union tmigr_state childstate; + struct tmigr_walk data; raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); child->parent = parent; - child->childmask = BIT(parent->num_children++); + child->groupmask = BIT(parent->num_children++); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); trace_tmigr_connect_child_parent(child); + if (!activate) + return; + /* * To prevent inconsistent states, active children need to be active in * the new parent as well. Inactive children are already marked inactive @@ -1544,21 +1576,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, * child to the new parent. So tmigr_connect_child_parent() is * executed with the formerly top level group (child) and the newly * created group (parent). + * + * * It is ensured that the child is active, as this setup path is + * executed in hotplug prepare callback. This is exectued by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - childstate.state = atomic_read(&child->migr_state); - if (childstate.migrator != TMIGR_NONE) { - struct tmigr_walk data; - - data.childmask = child->childmask; + data.childmask = child->groupmask; - /* - * There is only one new level per time. When connecting the - * child and the parent and set the child active when the parent - * is inactive, the parent needs to be the uppermost - * level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); - } + /* + * There is only one new level per time (which is protected by + * tmigr_mutex). When connecting the child and the parent and set the + * child active when the parent is inactive, the parent needs to be the + * uppermost level. Otherwise there went something wrong! + */ + WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } static int tmigr_setup_groups(unsigned int cpu, unsigned int node) @@ -1611,12 +1646,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * Update tmc -> group / child -> group connection */ if (i == 0) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); raw_spin_lock_irq(&group->lock); tmc->tmgroup = group; - tmc->childmask = BIT(group->num_children++); + tmc->groupmask = BIT(group->num_children++); raw_spin_unlock_irq(&group->lock); @@ -1626,7 +1661,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group); + /* Will be activated at online time */ + tmigr_connect_child_parent(child, group, false); } /* check if uppermost level was newly created */ @@ -1637,12 +1673,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) lvllist = &tmigr_level_list[top]; if (group->num_children == 1 && list_is_singular(lvllist)) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + lvllist = &tmigr_level_list[top - 1]; list_for_each_entry(child, lvllist, list) { if (child->parent) continue; - tmigr_connect_child_parent(child, group); + tmigr_connect_child_parent(child, group, true); } } } @@ -1664,80 +1709,31 @@ static int tmigr_add_cpu(unsigned int cpu) return ret; } -static int tmigr_cpu_online(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int ret; - - /* First online attempt? Initialize CPU data */ - if (!tmc->tmgroup) { - raw_spin_lock_init(&tmc->lock); - - ret = tmigr_add_cpu(cpu); - if (ret < 0) - return ret; - - if (tmc->childmask == 0) - return -EINVAL; - - timerqueue_init(&tmc->cpuevt.nextevt); - tmc->cpuevt.nextevt.expires = KTIME_MAX; - tmc->cpuevt.ignore = true; - tmc->cpuevt.cpu = cpu; - - tmc->remote = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); - } - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); - return 0; -} - -/* - * tmigr_trigger_active() - trigger a CPU to become active again - * - * This function is executed on a CPU which is part of cpu_online_mask, when the - * last active CPU in the hierarchy is offlining. With this, it is ensured that - * the other CPU is active and takes over the migrator duty. - */ -static long tmigr_trigger_active(void *unused) +static int tmigr_cpu_prepare(unsigned int cpu) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); + int ret = 0; - WARN_ON_ONCE(!tmc->online || tmc->idle); - - return 0; -} - -static int tmigr_cpu_offline(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int migrator; - u64 firstexp; + /* Not first online attempt? */ + if (tmc->tmgroup) + return ret; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; + raw_spin_lock_init(&tmc->lock); + timerqueue_init(&tmc->cpuevt.nextevt); + tmc->cpuevt.nextevt.expires = KTIME_MAX; + tmc->cpuevt.ignore = true; + tmc->cpuevt.cpu = cpu; + tmc->remote = false; WRITE_ONCE(tmc->wakeup, KTIME_MAX); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + ret = tmigr_add_cpu(cpu); + if (ret < 0) + return ret; - if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); - work_on_cpu(migrator, tmigr_trigger_active, NULL); - } + if (tmc->groupmask == 0) + return -EINVAL; - return 0; + return ret; } static int __init tmigr_init(void) @@ -1796,6 +1792,11 @@ static int __init tmigr_init(void) tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, tmigr_crossnode_level); + ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", + tmigr_cpu_prepare, NULL); + if (ret) + goto err; + ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", tmigr_cpu_online, tmigr_cpu_offline); if (ret) @@ -1807,4 +1808,4 @@ err: pr_err("Timer migration setup failed\n"); return ret; } -late_initcall(tmigr_init); +early_initcall(tmigr_init); diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 6c37d94a37d9..154accc7a543 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -22,7 +22,17 @@ struct tmigr_event { * struct tmigr_group - timer migration hierarchy group * @lock: Lock protecting the event information and group hierarchy * information during setup - * @parent: Pointer to the parent group + * @parent: Pointer to the parent group. Pointer is updated when a + * new hierarchy level is added because of a CPU coming + * online the first time. Once it is set, the pointer will + * not be removed or updated. When accessing parent pointer + * lock less to decide whether to abort a propagation or + * not, it is not a problem. The worst outcome is an + * unnecessary/early CPU wake up. But do not access parent + * pointer several times in the same 'action' (like + * activation, deactivation, check for remote expiry,...) + * without holding the lock as it is not ensured that value + * will not change. * @groupevt: Next event of the group which is only used when the * group is !active. The group event is then queued into * the parent timer queue. @@ -41,9 +51,8 @@ struct tmigr_event { * @num_children: Counter of group children to make sure the group is only * filled with TMIGR_CHILDREN_PER_GROUP; Required for setup * only - * @childmask: childmask of the group in the parent group; is set - * during setup and will never change; can be read - * lockless + * @groupmask: mask of the group in the parent group; is set during + * setup and will never change; can be read lockless * @list: List head that is added to the per level * tmigr_level_list; is required during setup when a * new group needs to be connected to the existing @@ -59,7 +68,7 @@ struct tmigr_group { unsigned int level; int numa_node; unsigned int num_children; - u8 childmask; + u8 groupmask; struct list_head list; }; @@ -79,7 +88,7 @@ struct tmigr_group { * hierarchy * @remote: Is set when timers of the CPU are expired remotely * @tmgroup: Pointer to the parent group - * @childmask: childmask of tmigr_cpu in the parent group + * @groupmask: mask of tmigr_cpu in the parent group * @wakeup: Stores the first timer when the timer migration * hierarchy is completely idle and remote expiry was done; * is returned to timer code in the idle path and is only @@ -92,7 +101,7 @@ struct tmigr_cpu { bool idle; bool remote; struct tmigr_group *tmgroup; - u8 childmask; + u8 groupmask; u64 wakeup; struct tmigr_event cpuevt; }; @@ -108,8 +117,8 @@ union tmigr_state { u32 state; /** * struct - split state of tmigr_group - * @active: Contains each childmask bit of the active children - * @migrator: Contains childmask of the child which is migrator + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator * @seq: Sequence counter needs to be increased when an update * to the tmigr_state is done. It prevents a race when * updates in the child groups are propagated in changed |