Diffstat (limited to 'kernel/sched/clock.c')
-rw-r--r--  kernel/sched/clock.c | 228
1 file changed, 152 insertions(+), 76 deletions(-)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e85a725e5c34..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -58,10 +58,13 @@
 #include <linux/percpu.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
+#include <linux/nmi.h>
+#include <linux/sched/clock.h>
 #include <linux/static_key.h>
 #include <linux/workqueue.h>
 #include <linux/compiler.h>
 #include <linux/tick.h>
+#include <linux/init.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -77,91 +80,144 @@ EXPORT_SYMBOL_GPL(sched_clock);
 
 __read_mostly int sched_clock_running;
 
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
-static int __sched_clock_stable_early;
-
-int sched_clock_stable(void)
+void sched_clock_init(void)
 {
-	return static_key_false(&__sched_clock_stable);
+	sched_clock_running = 1;
 }
 
-static void __set_sched_clock_stable(void)
-{
-	if (!sched_clock_stable())
-		static_key_slow_inc(&__sched_clock_stable);
-
-	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
-}
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
+static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
+static int __sched_clock_stable_early = 1;
 
-void set_sched_clock_stable(void)
-{
-	__sched_clock_stable_early = 1;
+/*
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
+ */
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;
 
-	smp_mb(); /* matches sched_clock_init() */
+struct sched_clock_data {
+	u64			tick_raw;
+	u64			tick_gtod;
+	u64			clock;
+};
 
-	if (!sched_clock_running)
-		return;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
 
-	__set_sched_clock_stable();
+static inline struct sched_clock_data *this_scd(void)
+{
+	return this_cpu_ptr(&sched_clock_data);
 }
 
-static void __clear_sched_clock_stable(struct work_struct *work)
+static inline struct sched_clock_data *cpu_sdc(int cpu)
 {
-	/* XXX worry about clock continuity */
-	if (sched_clock_stable())
-		static_key_slow_dec(&__sched_clock_stable);
+	return &per_cpu(sched_clock_data, cpu);
+}
 
-	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
+int sched_clock_stable(void)
+{
+	return static_branch_likely(&__sched_clock_stable);
 }
 
-static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+	scd->tick_gtod = ktime_get_ns();
+	scd->tick_raw = sched_clock();
+}
 
-void clear_sched_clock_stable(void)
+static void __set_sched_clock_stable(void)
 {
-	__sched_clock_stable_early = 0;
+	struct sched_clock_data *scd;
 
-	smp_mb(); /* matches sched_clock_init() */
+	/*
+	 * Since we're still unstable and the tick is already running, we have
+	 * to disable IRQs in order to get a consistent scd->tick* reading.
+	 */
+	local_irq_disable();
+	scd = this_scd();
+	/*
+	 * Attempt to make the (initial) unstable->stable transition continuous.
+	 */
+	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+	local_irq_enable();
 
-	if (!sched_clock_running)
-		return;
+	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
+			scd->tick_gtod, __gtod_offset,
+			scd->tick_raw,  __sched_clock_offset);
 
-	schedule_work(&sched_clock_work);
+	static_branch_enable(&__sched_clock_stable);
+	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
-struct sched_clock_data {
-	u64			tick_raw;
-	u64			tick_gtod;
-	u64			clock;
-};
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
+static void __sched_clock_work(struct work_struct *work)
+{
+	struct sched_clock_data *scd;
+	int cpu;
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+	/* take a current timestamp and set 'now' */
+	preempt_disable();
+	scd = this_scd();
+	__scd_stamp(scd);
+	scd->clock = scd->tick_gtod + __gtod_offset;
+	preempt_enable();
 
-static inline struct sched_clock_data *this_scd(void)
-{
-	return this_cpu_ptr(&sched_clock_data);
+	/* clone to all CPUs */
+	for_each_possible_cpu(cpu)
+		per_cpu(sched_clock_data, cpu) = *scd;
+
+	printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+			scd->tick_gtod, __gtod_offset,
+			scd->tick_raw,  __sched_clock_offset);
+
+	static_branch_disable(&__sched_clock_stable);
 }
 
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+static DECLARE_WORK(sched_clock_work, __sched_clock_work);
+
+static void __clear_sched_clock_stable(void)
 {
-	return &per_cpu(sched_clock_data, cpu);
+	if (!sched_clock_stable())
+		return;
+
+	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
+	schedule_work(&sched_clock_work);
 }
 
-void sched_clock_init(void)
+void clear_sched_clock_stable(void)
 {
-	u64 ktime_now = ktime_to_ns(ktime_get());
-	int cpu;
+	__sched_clock_stable_early = 0;
 
-	for_each_possible_cpu(cpu) {
-		struct sched_clock_data *scd = cpu_sdc(cpu);
+	smp_mb(); /* matches sched_clock_init_late() */
 
-		scd->tick_raw = 0;
-		scd->tick_gtod = ktime_now;
-		scd->clock = ktime_now;
-	}
-
-	sched_clock_running = 1;
+	if (sched_clock_running == 2)
+		__clear_sched_clock_stable();
+}
 
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
+{
+	sched_clock_running = 2;
 	/*
 	 * Ensure that it is impossible to not do a static_key update.
 	 *
@@ -173,9 +229,10 @@ void sched_clock_init(void)
 
 	if (__sched_clock_stable_early)
 		__set_sched_clock_stable();
-	else
-		__clear_sched_clock_stable(NULL);
+
+	return 0;
 }
+late_initcall(sched_clock_init_late);
 
 /*
  * min, max except they take wrapping into account
@@ -199,7 +256,7 @@ static inline u64 wrap_max(u64 x, u64 y)
  */
 static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-	u64 now, clock, old_clock, min_clock, max_clock;
+	u64 now, clock, old_clock, min_clock, max_clock, gtod;
 	s64 delta;
 
 again:
@@ -216,9 +273,10 @@ again:
 	 *		      scd->tick_gtod + TICK_NSEC);
 	 */
 
-	clock = scd->tick_gtod + delta;
-	min_clock = wrap_max(scd->tick_gtod, old_clock);
-	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+	gtod = scd->tick_gtod + __gtod_offset;
+	clock = gtod + delta;
+	min_clock = wrap_max(gtod, old_clock);
+	max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
 
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
@@ -302,7 +360,7 @@ u64 sched_clock_cpu(int cpu)
 	u64 clock;
 
 	if (sched_clock_stable())
-		return sched_clock();
+		return sched_clock() + __sched_clock_offset;
 
 	if (unlikely(!sched_clock_running))
 		return 0ull;
@@ -323,7 +381,6 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu);
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd;
-	u64 now, now_gtod;
 
 	if (sched_clock_stable())
 		return;
@@ -334,14 +391,31 @@ void sched_clock_tick(void)
 	WARN_ON_ONCE(!irqs_disabled());
 
 	scd = this_scd();
-	now_gtod = ktime_to_ns(ktime_get());
-	now = sched_clock();
-
-	scd->tick_raw = now;
-	scd->tick_gtod = now_gtod;
+	__scd_stamp(scd);
 	sched_clock_local(scd);
 }
 
+void sched_clock_tick_stable(void)
+{
+	u64 gtod, clock;
+
+	if (!sched_clock_stable())
+		return;
+
+	/*
+	 * Called under watchdog_lock.
+	 *
+	 * The watchdog just found this TSC to (still) be stable, so now is a
+	 * good moment to update our __gtod_offset. Because once we find the
+	 * TSC to be unstable, any computation will be computing crap.
+	 */
+	local_irq_disable();
+	gtod = ktime_get_ns();
+	clock = sched_clock();
+	__gtod_offset = (clock + __sched_clock_offset) - gtod;
+	local_irq_enable();
+}
+
 /*
  * We are going deep-idle (irqs are disabled):
  */
@@ -352,25 +426,26 @@ void sched_clock_idle_sleep_event(void)
 EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
 
 /*
- * We just idled delta nanoseconds (called with irqs disabled):
+ * We just idled; resync with ktime.
  */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
+void sched_clock_idle_wakeup_event(void)
 {
-	if (timekeeping_suspended)
+	unsigned long flags;
+
+	if (sched_clock_stable())
+		return;
+
+	if (unlikely(timekeeping_suspended))
 		return;
 
+	local_irq_save(flags);
 	sched_clock_tick();
-	touch_softlockup_watchdog_sched();
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
-void sched_clock_init(void)
-{
-	sched_clock_running = 1;
-}
-
 u64 sched_clock_cpu(int cpu)
 {
 	if (unlikely(!sched_clock_running))
@@ -378,6 +453,7 @@ u64 sched_clock_cpu(int cpu)
 
 	return sched_clock();
 }
+
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 /*
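Note on the two offsets: the patch keeps the invariant ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset, and __set_sched_clock_stable() chooses __sched_clock_offset so that the unstable->stable switch is continuous. A rough userspace sketch of that arithmetic (illustrative only, not kernel code; the timestamp values are invented):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the two clock domains (made-up values). */
static uint64_t tick_raw  = 5000000000ull;	/* last sched_clock() sample  */
static uint64_t tick_gtod = 7000000000ull;	/* last ktime_get_ns() sample */
static uint64_t gtod_offset;			/* still 0 at this point      */
static uint64_t sched_clock_offset;

int main(void)
{
	/*
	 * Same arithmetic as __set_sched_clock_stable(): pick the offset so
	 * that, at this instant, raw + sched_clock_offset == gtod + gtod_offset.
	 */
	sched_clock_offset = (tick_gtod + gtod_offset) - tick_raw;

	/* Both views now agree, so flipping the static branch causes no jump. */
	printf("gtod view: %llu\n",
	       (unsigned long long)(tick_gtod + gtod_offset));
	printf("raw  view: %llu\n",
	       (unsigned long long)(tick_raw + sched_clock_offset));
	return 0;
}

Because the offset is taken from a single consistent (tick_raw, tick_gtod) pair sampled with IRQs off, sched_clock_cpu() returns the same value immediately before and after the switch.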
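While the TSC remains stable, sched_clock_tick_stable() refreshes __gtod_offset by solving the same invariant for the other offset. A minimal sketch of that arithmetic, again with hypothetical sample values rather than kernel code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical samples taken back-to-back with IRQs off. */
	uint64_t gtod  = 9000000000ull;		/* ktime_get_ns() */
	uint64_t clock = 7100000000ull;		/* sched_clock()  */
	uint64_t sched_clock_offset = 2000000000ull;
	uint64_t gtod_offset;

	/* Same arithmetic as sched_clock_tick_stable(). */
	gtod_offset = (clock + sched_clock_offset) - gtod;

	/* The invariant holds again at this instant. */
	printf("%llu == %llu\n",
	       (unsigned long long)(gtod + gtod_offset),
	       (unsigned long long)(clock + sched_clock_offset));
	return 0;
}

Keeping __gtod_offset current matters because, once the TSC is declared unstable, the unstable path falls back to scd->tick_gtod + __gtod_offset as its baseline.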
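In the unstable path, sched_clock_local() now clamps the raw delta against gtod = scd->tick_gtod + __gtod_offset instead of scd->tick_gtod alone. A small standalone sketch of the wrap_max()/wrap_min() clamp window, using invented numbers and assuming a 1000 Hz tick (not kernel code):

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ull	/* assume a 1000 Hz tick for the example */

/* min/max that tolerate u64 wrap, as in kernel/sched/clock.c */
static uint64_t wrap_max(uint64_t x, uint64_t y)
{
	return (int64_t)(x - y) > 0 ? x : y;
}

static uint64_t wrap_min(uint64_t x, uint64_t y)
{
	return (int64_t)(x - y) < 0 ? x : y;
}

int main(void)
{
	/* Hypothetical per-CPU state. */
	uint64_t tick_gtod   = 9000000000ull;	/* gtod stamp at the last tick */
	uint64_t gtod_offset = 100000000ull;
	uint64_t old_clock   = 9050000000ull;	/* last value handed out       */
	uint64_t delta       = 5000000000ull;	/* bogus raw TSC jump          */

	uint64_t gtod      = tick_gtod + gtod_offset;
	uint64_t clock     = gtod + delta;
	uint64_t min_clock = wrap_max(gtod, old_clock);
	uint64_t max_clock = wrap_max(old_clock, gtod + TICK_NSEC);

	/* Filter out backward motion and crazy forward jumps. */
	clock = wrap_max(clock, min_clock);
	clock = wrap_min(clock, max_clock);

	printf("clamped clock: %llu\n", (unsigned long long)clock);
	return 0;
}

With these numbers the bogus five-second jump is clamped to one tick past the last GTOD stamp, which is exactly the filtering the window is meant to provide.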