From 203cbf77de59fc8f13502dcfd11350c6d4a5c95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:46 +0000 Subject: hrtimer: Handle remaining time proper for TIME_LOW_RES If CONFIG_TIME_LOW_RES is enabled we add a jiffie to the relative timeout to prevent short sleeps, but we do not account for that in interfaces which retrieve the remaining time. Helge observed that timerfd can return a remaining time larger than the relative timeout. That's not expected and breaks userland test programs. Store the information that the timer was armed relative and provide functions to adjust the remaining time. To avoid bloating the hrtimer struct make state a u8, which as a bonus results in better code on x86 at least. Reported-and-tested-by: Helge Deller Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.273328486@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 55 ++++++++++++++++++++++++++++++++---------------- kernel/time/timer_list.c | 2 +- 2 files changed, 38 insertions(+), 19 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..fa909f9fd559 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -897,10 +897,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +930,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +954,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -974,19 +990,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1081,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1219,6 +1230,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer_stats_account_hrtimer(timer); fn = timer->function; + /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); -- cgit From 572c39172684c3711e4a03c9a7380067e2b0661c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:47 +0000 Subject: posix-timers: Handle relative timers with CONFIG_TIME_LOW_RES proper As Helge reported for timerfd we have the same issue in posix timers. We return remaining time larger than the programmed relative time to user space in case of CONFIG_TIME_LOW_RES=y. Use the proper function to adjust the extra time added in hrtimer_start_range_ns(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Helge Deller Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.450510905@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* -- cgit From 51cbb5242a41700a3f250ecfb48dcfb7e4375ea4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:48 +0000 Subject: itimers: Handle relative timers with CONFIG_TIME_LOW_RES proper As Helge reported for timerfd we have the same issue in itimers. We return remaining time larger than the programmed relative time to user space in case of CONFIG_TIME_LOW_RES=y. Use the proper function to adjust the extra time added in hrtimer_start_range_ns(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Helge Deller Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.528222587@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above -- cgit From dd4e17ab704269bce71402285f5e8b9ac24b1eff Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 21 Jan 2016 15:03:34 -0800 Subject: ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO Recently, in commit 37cf4dc3370f I forgot to check if the timeval being passed was actually a timespec (as is signaled with ADJ_NANO). This resulted in that patch breaking ADJ_SETOFFSET users who set ADJ_NANO, by rejecting valid timespecs that were compared with valid timeval ranges. This patch addresses this by checking for the ADJ_NANO flag and using the timepsec check instead in that case. Reported-by: Harald Hoyer Reported-by: Kay Sievers Fixes: 37cf4dc3370f "time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow" Signed-off-by: John Stultz Cc: Sasha Levin Cc: Richard Cochran Cc: Prarit Bhargava Cc: David Herrmann Link: http://lkml.kernel.org/r/1453417415-19110-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36f2ca09aa5e..6df8927c58a5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } } /* -- cgit From 7809998ab1af22602a8463845108edc49dfb9ef0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2016 16:41:49 +0100 Subject: tick/sched: Hide unused oneshot timer code A couple of functions in kernel/time/tick-sched.c are only relevant for oneshot timer mode, i.e. when hires-timers or nohz mode are enabled. If both are disabled, we get gcc warnings about them: kernel/time/tick-sched.c:98:16: warning: 'tick_init_jiffy_update' defined but not used [-Wunused-function] static ktime_t tick_init_jiffy_update(void) ^ kernel/time/tick-sched.c:112:13: warning: 'tick_sched_do_timer' defined but not used [-Wunused-function] static void tick_sched_do_timer(ktime_t now) ^ kernel/time/tick-sched.c:134:13: warning: 'tick_sched_handle' defined but not used [-Wunused-function] static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) ^ This encloses the whole set of functions in an appropriate ifdef to avoid the warning and to make it clearer when they are used. Signed-off-by: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1453736525-1959191-1-git-send-email-arnd@arndb.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7ea28ed3109d..cbe5d8dcf15a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -36,16 +36,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! */ @@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -- cgit From 9c808765e88efb6fa6af7e2206ef89512f1840a7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:08 +0000 Subject: hrtimer: Add support for CLOCK_MONOTONIC_RAW The KVM/ARM timer implementation arms a hrtimer when a vcpu is blocked (usually because it is waiting for an interrupt) while its timer is going to kick in the future. It is essential that this timer doesn't get adjusted, or the guest will end up being woken-up at the wrong time (NTP running on the host seems to confuse the hell out of some guests). In order to allow this, let's add CLOCK_MONOTONIC_RAW support to hrtimer (it is so far only supported for posix timers). It also has the (limited) benefit of fixing de0421d53bfb ("mac80211_hwsim: shuffle code to prepare for dynamic radios"), which already uses this functionnality without realizing wasn't implemented (just being lucky...). Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + kernel/time/hrtimer.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..a6d64af5e73f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, + HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..a125f222fee2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,12 +90,18 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_RAW, + .clockid = CLOCK_MONOTONIC_RAW, + .get_time = &ktime_get_raw, + }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -1268,7 +1274,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - basenow = ktime_add(now, base->offset); + if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) + basenow = ktime_get_raw(); + else + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; -- cgit From 9006a01829a50cfd6bbd4980910ed46e895e93d7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:09 +0000 Subject: hrtimer: Catch illegal clockids It is way too easy to take any random clockid and feed it to the hrtimer subsystem. At best, it gets mapped to a monotonic base, but it would be better to just catch illegal values as early as possible. This patch does exactly that, mapping illegal clockids to an illegal base index, and panicing when we detect the illegal condition. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a125f222fee2..cb0fe70f3c51 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -99,6 +99,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, @@ -108,7 +111,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - return hrtimer_clock_to_base_table[clock_id]; + int base = hrtimer_clock_to_base_table[clock_id]; + BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); + return base; } /* -- cgit From bbf66d897adf2bb0c310db96c97e8db6369f39e1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Jan 2016 18:31:53 +0100 Subject: clocksource: Allow unregistering the watchdog Hyper-V vmbus module registers TSC page clocksource when loaded. This is the clocksource with the highest rating and thus it becomes the watchdog making unloading of the vmbus module impossible. Separate clocksource_select_watchdog() from clocksource_enqueue_watchdog() and use it on clocksource register/rating change/unregister. After all, lobotomized monkeys may need some love too. Signed-off-by: Vitaly Kuznetsov Cc: John Stultz Cc: Dexuan Cui Cc: K. Y. Srinivasan Link: http://lkml.kernel.org/r/1453483913-25672-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 52 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de539299b..56ece145a814 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ -- cgit From 1ca8ec532fc2d986f1f4a319857bb18e0c9739b4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 27 Jan 2016 19:26:07 +0800 Subject: tick/nohz: Set the correct expiry when switching to nohz/lowres mode commit 0ff53d096422 sets the next tick interrupt to the last jiffies update, i.e. in the past, because the forward operation is invoked before the set operation. There is no resulting damage (yet), but we get an extra pointless tick interrupt. Revert the order so we get the next tick interrupt in the future. Fixes: commit 0ff53d096422 "tick: sched: Force tick interrupt and get rid of softirq magic" Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1453893967-3458-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbe5d8dcf15a..de2d9fef6ea6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -995,9 +995,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -- cgit From 8537bb95a63e4be330359721f0ecd422f4a4c0ca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2015 16:55:23 +0100 Subject: nohz: Implement wide kick on top of irq work It simplifies it and allows wide kick to be performed, even when IRQs are disabled, without an asynchronous level in the middle. This comes at a cost of some more overhead on features like perf and posix cpu timers slow-paths, which is probably not much important for nohz full users. Requested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0b17424349eb..548a4e2551a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -234,24 +234,20 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } -static void nohz_full_kick_ipi(void *info) -{ - /* Empty, the tick restart happens on tick_nohz_irq_exit() */ -} - /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_all(void) { + int cpu; + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(tick_nohz_full_mask, - nohz_full_kick_ipi, NULL, false); - tick_nohz_full_kick(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); preempt_enable(); } -- cgit From fc4fa6e112c0f999fab022a4eb7f6614bb47c7ab Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 13 Dec 2015 15:26:11 +0900 Subject: treewide: Fix typo in printk This patch fix spelling typos found in printk and Kconfig. Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- drivers/atm/firestream.c | 2 +- drivers/crypto/nx/nx-842.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c | 2 +- drivers/input/touchscreen/wdt87xx_i2c.c | 2 +- drivers/net/ethernet/nuvoton/w90p910_ether.c | 2 +- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c | 2 +- drivers/usb/gadget/legacy/Kconfig | 3 +-- kernel/time/timekeeping.c | 2 +- lib/842/842_decompress.c | 2 +- net/openvswitch/vport-geneve.c | 2 +- tools/testing/selftests/timers/alarmtimer-suspend.c | 2 +- 11 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel/time') diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 82f2ae0d7cc4..a969a7e443be 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -168,7 +168,7 @@ static char *res_strings[] = { "reserved 14", "Unrecognized cell", "reserved 16", - "reassemby abort: AAL5 abort", + "reassembly abort: AAL5 abort", "packet purged", "packet ageing timeout", "channel ageing timeout", diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c index 046c1c45411b..d94e25df503b 100644 --- a/drivers/crypto/nx/nx-842.c +++ b/drivers/crypto/nx/nx-842.c @@ -308,7 +308,7 @@ int nx842_crypto_compress(struct crypto_tfm *tfm, h = !n && add_header ? hdrsize : 0; if (ignore) - pr_warn("interal error, ignore is set %x\n", ignore); + pr_warn("internal error, ignore is set %x\n", ignore); ret = compress(ctx, &p, &hdr->group[n], &c, &ignore, h); if (ret) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c index 85dc3f989ff7..704d08744116 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -64,7 +64,7 @@ const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) case IB_QPS_ERR: return "ERR"; default: - return "UNKOWN STATE"; + return "UNKNOWN STATE"; } } diff --git a/drivers/input/touchscreen/wdt87xx_i2c.c b/drivers/input/touchscreen/wdt87xx_i2c.c index 515c20a6e10f..73861ad22df4 100644 --- a/drivers/input/touchscreen/wdt87xx_i2c.c +++ b/drivers/input/touchscreen/wdt87xx_i2c.c @@ -848,7 +848,7 @@ static int wdt87xx_do_update_firmware(struct i2c_client *client, error = wdt87xx_get_sysparam(client, &wdt->param); if (error) dev_err(&client->dev, - "failed to refresh system paramaters: %d\n", error); + "failed to refresh system parameters: %d\n", error); out: enable_irq(client->irq); mutex_unlock(&wdt->fw_mutex); diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c index afa445842f3e..52d9a94aebb9 100644 --- a/drivers/net/ethernet/nuvoton/w90p910_ether.c +++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c @@ -1038,7 +1038,7 @@ static int w90p910_ether_probe(struct platform_device *pdev) error = register_netdev(dev); if (error != 0) { - dev_err(&pdev->dev, "Regiter EMC w90p910 FAILED\n"); + dev_err(&pdev->dev, "Register EMC w90p910 FAILED\n"); error = -ENODEV; goto failed_put_rmiiclk; } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c index b57cfd965196..95dcbff4673b 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c @@ -626,7 +626,7 @@ static void rtl8821ae_dm_find_minimum_rssi(struct ieee80211_hw *hw) rtl_dm_dig->min_undec_pwdb_for_dm = rtlpriv->dm.entry_min_undec_sm_pwdb; RT_TRACE(rtlpriv, COMP_BB_POWERSAVING, DBG_LOUD, - "AP Ext Port or disconnet PWDB = 0x%x\n", + "AP Ext Port or disconnect PWDB = 0x%x\n", rtl_dm_dig->min_undec_pwdb_for_dm); } RT_TRACE(rtlpriv, COMP_DIG, DBG_LOUD, diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig index 4d682ad7bf23..fb1cdd9af472 100644 --- a/drivers/usb/gadget/legacy/Kconfig +++ b/drivers/usb/gadget/legacy/Kconfig @@ -103,8 +103,7 @@ config USB_ETH - CDC Ethernet Emulation Model (EEM) is a newer standard that has a simpler interface that can be used by more USB hardware. - RNDIS support is an additional option, more demanding than than - subset. + RNDIS support is an additional option, more demanding than subset. Within the USB device, this gadget driver exposes a network device "usbX", where X depends on what other networking devices you have. diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b1356b7ae570..0d4cc7601df7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", offset, name, max_cycles >> 1); printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } diff --git a/lib/842/842_decompress.c b/lib/842/842_decompress.c index 8881dad2a6a0..a2a941f8112d 100644 --- a/lib/842/842_decompress.c +++ b/lib/842/842_decompress.c @@ -250,7 +250,7 @@ static int do_op(struct sw842_param *p, u8 o) case OP_ACTION_NOOP: break; default: - pr_err("Interal error, invalid op %x\n", op); + pr_err("Internal error, invalid op %x\n", op); return -EINVAL; } diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index efb736bb6855..69f1de58a3b4 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -133,6 +133,6 @@ static void __exit ovs_geneve_tnl_exit(void) module_init(ovs_geneve_tnl_init); module_exit(ovs_geneve_tnl_exit); -MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_DESCRIPTION("OVS: Geneve switching port"); MODULE_LICENSE("GPL"); MODULE_ALIAS("vport-type-5"); diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index 72cacf5383dd..2b361b830395 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -153,7 +153,7 @@ int main(void) alarmcount = 0; if (timer_create(alarm_clock_id, &se, &tm1) == -1) { - printf("timer_create failled, %s unspported?\n", + printf("timer_create failed, %s unsupported?\n", clockstring(alarm_clock_id)); break; } -- cgit From 232d26373d310a941ef2ab46e53ea62fe076ed13 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 26 Feb 2016 19:14:14 -0800 Subject: jiffies: Use CLOCKSOURCE_MASK instead of constant The CLOCKSOURCE_MASK(32) macro expands to the same value, but makes code more readable. Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, -- cgit From ede5147d515694e012cd958ec874b9daf8a65fec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Feb 2016 14:37:53 +0000 Subject: Handle ISO 8601 leap seconds and encodings of midnight in mktime64() Handle the following ISO 8601 features in mktime64(): (1) Leap seconds. Leap seconds are indicated by the seconds parameter being the value 60. Handle this by treating it the same as 00 of the following minute. It has been pointed out that a minute may contain two leap seconds. However, pending discussion of what that looks like and how to handle it, I'm not going to concern myself with it. (2) Alternate encodings of midnight. Two different encodings of midnight are permitted - 00:00:00 and 24:00:00 - the first is midnight today and the second is midnight tomorrow and is exactly equivalent to the first with tomorrow's date. As it happens, we don't actually need to change mktime64() to handle either of these - just comment them as valid parameters. These facility will be used by the X.509 parser. Doing it in mktime64() makes the policy common to the whole kernel and easier to find. Signed-off-by: David Howells Acked-by: Arnd Bergmann cc: John Stultz cc: Rudolf Polzer cc: One Thousand Gnomes --- kernel/time/time.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/time.c b/kernel/time/time.c index 86751c68e08d..be115b020d27 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 - )*24 + hour /* now have hours */ + )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } -- cgit From d027d45d8a17a4145eab2d841140e9acbb7feb59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 7 Jun 2015 15:54:30 +0200 Subject: nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 8 +++ include/linux/tick.h | 92 +++++++++++++++++++++++++++++ kernel/time/tick-sched.c | 150 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/time/tick-sched.h | 1 + 4 files changed, 244 insertions(+), 7 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..307e48375f21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -719,6 +719,10 @@ struct signal_struct { /* Earliest-expiration cache. */ struct task_cputime cputime_expires; +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif + struct list_head cpu_timers[3]; struct pid *tty_old_pgrp; @@ -1542,6 +1546,10 @@ struct task_struct { VTIME_SYS, } vtime_snap_whence; #endif + +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ u64 real_start_time; /* boot based time in nsec */ diff --git a/include/linux/tick.h b/include/linux/tick.h index 97fd4e543846..d60e5df8ec28 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -97,6 +97,18 @@ static inline void tick_broadcast_exit(void) tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT); } +enum tick_dep_bits { + TICK_DEP_BIT_POSIX_TIMER = 0, + TICK_DEP_BIT_PERF_EVENTS = 1, + TICK_DEP_BIT_SCHED = 2, + TICK_DEP_BIT_CLOCK_UNSTABLE = 3 +}; + +#define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) +#define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) +#define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) +#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) + #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); @@ -154,6 +166,72 @@ static inline int housekeeping_any_cpu(void) return cpumask_any_and(housekeeping_mask, cpu_online_mask); } +extern void tick_nohz_dep_set(enum tick_dep_bits bit); +extern void tick_nohz_dep_clear(enum tick_dep_bits bit); +extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit); + +/* + * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases + * on top of static keys. + */ +static inline void tick_dep_set(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set(bit); +} + +static inline void tick_dep_clear(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear(bit); +} + +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_set_cpu(cpu, bit); +} + +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_clear_cpu(cpu, bit); +} + +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_task(tsk, bit); +} +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_task(tsk, bit); +} +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_signal(signal, bit); +} +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_signal(signal, bit); +} + extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); @@ -166,6 +244,20 @@ static inline int housekeeping_any_cpu(void) static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } + +static inline void tick_dep_set(enum tick_dep_bits bit) { } +static inline void tick_dep_clear(enum tick_dep_bits bit) { } +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } + static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 548a4e2551a9..74ab7dbdc023 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,11 +158,53 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; +static unsigned long tick_dep_mask; -static bool can_stop_full_tick(void) +static void trace_tick_dependency(unsigned long dep) +{ + if (dep & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, "posix timers running\n"); + return; + } + + if (dep & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, "perf events running\n"); + return; + } + + if (dep & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return; + } + + if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) + trace_tick_stop(0, "unstable sched clock\n"); +} + +static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (tick_dep_mask) { + trace_tick_dependency(tick_dep_mask); + return false; + } + + if (ts->tick_dep_mask) { + trace_tick_dependency(ts->tick_dep_mask); + return false; + } + + if (current->tick_dep_mask) { + trace_tick_dependency(current->tick_dep_mask); + return false; + } + + if (current->signal->tick_dep_mask) { + trace_tick_dependency(current->signal->tick_dep_mask); + return false; + } + if (!sched_can_stop_tick()) { trace_tick_stop(0, "more than 1 task in runqueue\n"); return false; @@ -178,9 +220,10 @@ static bool can_stop_full_tick(void) return false; } - /* sched_clock_tick() needs us? */ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* + * sched_clock_tick() needs us? + * * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ @@ -199,13 +242,13 @@ static bool can_stop_full_tick(void) return true; } -static void nohz_full_kick_work_func(struct irq_work *work) +static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_work_func, + .func = nohz_full_kick_func, }; /* @@ -251,6 +294,95 @@ void tick_nohz_full_kick_all(void) preempt_enable(); } +static void tick_nohz_dep_set_all(unsigned long *dep, + enum tick_dep_bits bit) +{ + unsigned long prev; + + prev = fetch_or(dep, BIT_MASK(bit)); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + clear_bit(bit, &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + unsigned long prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit)); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + clear_bit(bit, &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + clear_bit(bit, &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + clear_bit(bit, &sig->tick_dep_mask); +} + /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: @@ -259,15 +391,19 @@ void tick_nohz_full_kick_all(void) void __tick_nohz_task_switch(void) { unsigned long flags; + struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; - if (tick_nohz_tick_stopped() && !can_stop_full_tick()) - tick_nohz_full_kick(); + ts = this_cpu_ptr(&tick_cpu_sched); + if (ts->tick_stopped) { + if (current->tick_dep_mask || current->signal->tick_dep_mask) + tick_nohz_full_kick(); + } out: local_irq_restore(flags); } @@ -736,7 +872,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick()) + if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get(), 1); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index a4a8d4e9baa1..eb4e32566a83 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,6 +60,7 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; + unsigned long tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); -- cgit From e6e6cc22e067a6f44449aa8fd0328404079c3ba5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Dec 2015 03:27:25 +0100 Subject: nohz: Use enum code for tick stop failure tracing message It makes nohz tracing more lightweight, standard and easier to parse. Examples: user_loop-2904 [007] d..1 517.701126: tick_stop: success=1 dependency=NONE user_loop-2904 [007] dn.1 518.021181: tick_stop: success=0 dependency=SCHED posix_timers-6142 [007] d..1 1739.027400: tick_stop: success=0 dependency=POSIX_TIMER user_loop-5463 [007] dN.1 1185.931939: tick_stop: success=0 dependency=PERF_EVENTS Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 1 + include/trace/events/timer.h | 36 +++++++++++++++++++++++++++++++----- kernel/time/tick-sched.c | 18 +++++++++--------- 3 files changed, 41 insertions(+), 14 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/tick.h b/include/linux/tick.h index d60e5df8ec28..96d276a923bf 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -104,6 +104,7 @@ enum tick_dep_bits { TICK_DEP_BIT_CLOCK_UNSTABLE = 3 }; +#define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) #define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 073b9ac245ba..51440131d337 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire, ); #ifdef CONFIG_NO_HZ_COMMON + +#define TICK_DEP_NAMES \ + tick_dep_name(NONE) \ + tick_dep_name(POSIX_TIMER) \ + tick_dep_name(PERF_EVENTS) \ + tick_dep_name(SCHED) \ + tick_dep_name_end(CLOCK_UNSTABLE) + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +#define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); + +TICK_DEP_NAMES + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, +#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } + +#define show_tick_dep_name(val) \ + __print_symbolic(val, TICK_DEP_NAMES) + TRACE_EVENT(tick_stop, - TP_PROTO(int success, char *error_msg), + TP_PROTO(int success, int dependency), - TP_ARGS(success, error_msg), + TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) - __string( msg, error_msg ) + __field( int , dependency ) ), TP_fast_assign( __entry->success = success; - __assign_str(msg, error_msg); + __entry->dependency = dependency; ), - TP_printk("success=%s msg=%s", __entry->success ? "yes" : "no", __get_str(msg)) + TP_printk("success=%d dependency=%s", __entry->success, \ + show_tick_dep_name(__entry->dependency)) ); #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 74ab7dbdc023..47e5ac45ce69 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -163,22 +163,22 @@ static unsigned long tick_dep_mask; static void trace_tick_dependency(unsigned long dep) { if (dep & TICK_DEP_MASK_POSIX_TIMER) { - trace_tick_stop(0, "posix timers running\n"); + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return; } if (dep & TICK_DEP_MASK_PERF_EVENTS) { - trace_tick_stop(0, "perf events running\n"); + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return; } if (dep & TICK_DEP_MASK_SCHED) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + trace_tick_stop(0, TICK_DEP_MASK_SCHED); return; } if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) - trace_tick_stop(0, "unstable sched clock\n"); + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); } static bool can_stop_full_tick(struct tick_sched *ts) @@ -206,17 +206,17 @@ static bool can_stop_full_tick(struct tick_sched *ts) } if (!sched_can_stop_tick()) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + trace_tick_stop(0, TICK_DEP_MASK_SCHED); return false; } if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, "posix timers running\n"); + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return false; } if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, "perf events running\n"); + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return false; } @@ -228,7 +228,7 @@ static bool can_stop_full_tick(struct tick_sched *ts) * sched_clock_stable is set. */ if (!sched_clock_stable()) { - trace_tick_stop(0, "unstable sched clock\n"); + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); /* * Don't allow the user to think they can get * full NO_HZ with this machine. @@ -821,7 +821,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - trace_tick_stop(1, " "); + trace_tick_stop(1, TICK_DEP_MASK_NONE); } /* -- cgit From 555e0c1ef7ff49ee5ac3a1eb12de4a2e4722f63d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2015 17:42:29 +0200 Subject: perf: Migrate perf to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify perf event tick dependency, migrate perf to the new mask. Perf needs the tick for two situations: 1) Freq events. We could set the tick dependency when those are installed on a CPU context. But setting a global dependency on top of the global freq events accounting is much easier. If people want that to be optimized, we can still refine that on the per-CPU tick dependency level. This patch dooesn't change the current behaviour anyway. 2) Throttled events: this is a per-cpu dependency. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/perf_event.h | 6 ----- include/linux/tick.h | 2 -- kernel/events/core.c | 65 ++++++++++++++++++++++++++++++++++------------ kernel/time/tick-sched.c | 8 +----- 4 files changed, 49 insertions(+), 32 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a481fa..d3ff88c13632 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1108,12 +1108,6 @@ static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) -extern bool perf_event_can_stop_tick(void); -#else -static inline bool perf_event_can_stop_tick(void) { return true; } -#endif - #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else diff --git a/include/linux/tick.h b/include/linux/tick.h index 96d276a923bf..0e63db4bc662 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -233,7 +233,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, tick_nohz_dep_clear_signal(signal, bit); } -extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); @@ -260,7 +259,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460b2425..f23d480052e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3060,17 +3060,6 @@ done: return rotate; } -#ifdef CONFIG_NO_HZ_FULL -bool perf_event_can_stop_tick(void) -{ - if (atomic_read(&nr_freq_events) || - __this_cpu_read(perf_throttled_count)) - return false; - else - return true; -} -#endif - void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); @@ -3081,6 +3070,7 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); + tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); @@ -3511,6 +3501,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } +#ifdef CONFIG_NO_HZ_FULL +static DEFINE_SPINLOCK(nr_freq_lock); +#endif + +static void unaccount_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + spin_lock(&nr_freq_lock); + if (atomic_dec_and_test(&nr_freq_events)) + tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void unaccount_freq_event(void) +{ + if (tick_nohz_full_enabled()) + unaccount_freq_event_nohz(); + else + atomic_dec(&nr_freq_events); +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -3527,7 +3539,7 @@ static void unaccount_event(struct perf_event *event) if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) - atomic_dec(&nr_freq_events); + unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); @@ -6349,9 +6361,9 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); - tick_nohz_full_kick(); ret = 1; } } @@ -7741,6 +7753,27 @@ static void account_event_cpu(struct perf_event *event, int cpu) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } +/* Freq events need the tick to stay alive (see perf_event_task_tick). */ +static void account_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + /* Lock so we don't race with concurrent unaccount */ + spin_lock(&nr_freq_lock); + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void account_freq_event(void) +{ + if (tick_nohz_full_enabled()) + account_freq_event_nohz(); + else + atomic_inc(&nr_freq_events); +} + + static void account_event(struct perf_event *event) { bool inc = false; @@ -7756,10 +7789,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (event->attr.freq) { - if (atomic_inc_return(&nr_freq_events) == 1) - tick_nohz_full_kick_all(); - } + if (event->attr.freq) + account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 47e5ac45ce69..3f79def37ca9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -215,11 +214,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); - return false; - } - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * sched_clock_tick() needs us? @@ -257,7 +251,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ -void tick_nohz_full_kick(void) +static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; -- cgit From 76d92ac305f23cada3a9b3c48a7ccea5f71019cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: sched: Migrate sched to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify sched tick dependency, migrate sched to the new mask. Everytime a task is enqueued or dequeued, we evaluate the state of the tick dependency on top of the policy of the tasks in the runqueue, by order of priority: SCHED_DEADLINE: Need the tick in order to periodically check for runtime SCHED_FIFO : Don't need the tick (no round-robin) SCHED_RR : Need the tick if more than 1 task of the same priority for round robin (simplified with checking if more than one SCHED_RR task no matter what priority). SCHED_NORMAL : Need the tick if more than 1 task for round-robin. We could optimize that further with one flag per sched policy on the tick dependency mask and perform only the checks relevant to the policy concerned by an enqueue/dequeue operation. Since the checks aren't based on the current task anymore, we could get rid of the task switch hook but it's still needed for posix cpu timers. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 3 --- kernel/sched/core.c | 35 ++++++++++++++++++++--------------- kernel/sched/sched.h | 47 +++++++++++++++++++++++++++++++++-------------- kernel/time/tick-sched.c | 5 ----- 4 files changed, 53 insertions(+), 37 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/sched.h b/include/linux/sched.h index 307e48375f21..2b10348806d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2364,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { } #endif #ifdef CONFIG_NO_HZ_FULL -extern bool sched_can_stop_tick(void); extern u64 scheduler_tick_max_deferment(void); -#else -static inline bool sched_can_stop_tick(void) { return false; } #endif #ifdef CONFIG_SCHED_AUTOGROUP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7142feb4b92d..1fad82364ffe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -701,31 +701,36 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq) { + int fifo_nr_running; + + /* Deadline tasks, even if single, need the tick */ + if (rq->dl.dl_nr_running) + return false; + /* - * FIFO realtime policy runs the highest priority task. Other runnable - * tasks are of a lower priority. The scheduler tick does nothing. + * FIFO realtime policy runs the highest priority task (after DEADLINE). + * Other runnable tasks are of a lower priority. The scheduler tick + * isn't needed. */ - if (current->policy == SCHED_FIFO) + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) return true; /* * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. Is this task the only one at this priority? + * realtime priority. */ - if (current->policy == SCHED_RR) { - struct sched_rt_entity *rt_se = ¤t->rt; - - return list_is_singular(&rt_se->run_list); + if (rq->rt.rr_nr_running) { + if (rq->rt.rr_nr_running == 1) + return true; + else + return false; } - /* - * More than one running task need preemption. - * nr_running update is assumed to be visible - * after IPI is sent from wakers. - */ - if (this_rq()->nr_running > 1) + /* Normal multitasking need periodic preemption checks */ + if (rq->cfs.nr_running > 1) return false; return true; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f0abfce14044..4f0bca770108 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1279,6 +1279,35 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1290,26 +1319,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (!rq->rd->overload) rq->rd->overload = true; #endif - -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(rq->cpu)) { - /* - * Tick is needed if more than one task runs on a CPU. - * Send the target an IPI to kick it out of nohz mode. - * - * We assume that IPI implies full memory barrier and the - * new value of rq->nr_running is visible on reception - * from the target. - */ - tick_nohz_full_kick_cpu(rq->cpu); - } -#endif } + + sched_update_tick_dependency(rq); } static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); } static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3f79def37ca9..f312c60c3ed2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,11 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!sched_can_stop_tick()) { - trace_tick_stop(0, TICK_DEP_MASK_SCHED); - return false; - } - if (!posix_cpu_timers_can_stop_tick(current)) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return false; -- cgit From b78783000d5cb7c5994e6742e1d1ce594bfea15b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: posix-cpu-timers: Migrate to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify posix cpu timers tick dependency, migrate the latter to the new mask. In order to keep track of the running timers and expose the tick dependency accordingly, we must probe the timers queuing and dequeuing on threads and process lists. Unfortunately it implies both task and signal level dependencies. We should be able to further optimize this and merge all that on the task level dependency, at the cost of a bit of complexity and may be overhead. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/posix-timers.h | 3 --- include/linux/tick.h | 2 -- kernel/time/posix-cpu-timers.c | 52 +++++++++--------------------------------- kernel/time/tick-sched.c | 7 +----- 4 files changed, 12 insertions(+), 52 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 907f3fd191ac..62d44c176071 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer); void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk); - void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0e63db4bc662..21f73649a4dc 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -234,7 +234,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, } extern void tick_nohz_full_kick_cpu(int cpu); -extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); #else static inline int housekeeping_any_cpu(void) @@ -259,7 +258,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..1cafba860b08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) return err; } - /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. * This is called from sys_timer_create() and do_cpu_nanosleep() with the @@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer) cputime_expires->sched_exp = exp; break; } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } } @@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. - */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(tsk->signal->cputimer.running)) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } - if (!ret) - posix_cpu_timer_kick_nohz(); + return ret; } @@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) @@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig) /* Turn off cputimer->running. This is done without locking. */ WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } static u32 onecputick; @@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) arm_timer(timer); unlock_task_sighand(p, &flags); - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - goto out; + return; *newval += now; } @@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } -out: - posix_cpu_timer_kick_nohz(); + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f312c60c3ed2..e4f2916e66a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,11 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); - return false; - } - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * sched_clock_tick() needs us? @@ -270,7 +265,7 @@ void tick_nohz_full_kick_cpu(int cpu) * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick_all(void) +static void tick_nohz_full_kick_all(void) { int cpu; -- cgit From 4f49b90abb4aca6fe677c95fc352fd0674d489bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Jul 2015 17:03:52 +0200 Subject: sched-clock: Migrate to use new tick dependency mask model Instead of checking sched_clock_stable from the nohz subsystem to verify its tick dependency, migrate it to the new mask in order to include it to the all-in-one check. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/sched/clock.c | 5 +++++ kernel/time/tick-sched.c | 19 ------------------- 2 files changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel/time') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index bc54e84675da..fedb967a9841 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,6 +61,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void) { if (!sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); + + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work) /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_dec(&__sched_clock_stable); + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e4f2916e66a9..969e6704c3c9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,25 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - /* - * sched_clock_tick() needs us? - * - * TODO: kick full dynticks CPUs when - * sched_clock_stable is set. - */ - if (!sched_clock_stable()) { - trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); - /* - * Don't allow the user to think they can get - * full NO_HZ with this machine. - */ - WARN_ONCE(tick_nohz_full_running, - "NO_HZ FULL will not work with unstable sched clock"); - return false; - } -#endif - return true; } -- cgit From 6bd58f09e1d8cc6c50a824c00bf0d617919986a1 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:19 -0800 Subject: time: Add cycles to nanoseconds translation The timekeeping code does not currently provide a way to translate externally provided clocksource cycles to system time. The cycle count is always provided by the result clocksource read() method internal to the timekeeping code. The added function timekeeping_cycles_to_ns() calculated a nanosecond value from a cycle count that can be added to tk_read_base.base value yielding the current system time. This allows clocksource cycle values external to the timekeeping code to provide a cycle count that can be transformed to system time. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4cedfa80d..4243d28177ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** -- cgit From 9da0f49c8767cc0ef6101cb21156cf4380ed50dd Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:20 -0800 Subject: time: Add timekeeping snapshot code capturing system time and counter In the current timekeeping code there isn't any interface to atomically capture the current relationship between the system counter and system time. ktime_get_snapshot() returns this triple (counter, monotonic raw, realtime) in the system_time_snapshot struct. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Moved structure definitions around to clean things up, fixed cycles_t/cycle_t confusion.] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 18 ++++++++++++++++++ kernel/time/timekeeping.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel/time') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..7817591af46f 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -266,6 +266,24 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real); +/* + * struct system_time_snapshot - simultaneous raw/real time capture with + * counter value + * @cycles: Clocksource counter value to produce the system times + * @real: Realtime system time + * @raw: Monotonic raw system time + */ +struct system_time_snapshot { + cycle_t cycles; + ktime_t real; + ktime_t raw; +}; + +/* + * Simultaneously snapshot realtime and monotonic raw clocks + */ +extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); + /* * Persistent clock related interfaces */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4243d28177ac..89b4695bd083 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -874,6 +874,36 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk->tkr_mono.read(tk->tkr_mono.clock); + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); #ifdef CONFIG_NTP_PPS -- cgit From ba26621e63ce6dc481d90ab9f6902e058d4ea39a Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:21 -0800 Subject: time: Remove duplicated code in ktime_get_raw_and_real() The code in ktime_get_snapshot() is a superset of the code in ktime_get_raw_and_real() code. Further, ktime_get_raw_and_real() is called only by the PPS code, pps_get_ts(). Consolidate the pps_get_ts() code into a single function calling ktime_get_snapshot() and eliminate ktime_get_raw_and_real(). A side effect of this is that the raw and real results of pps_get_ts() correspond to exactly the same clock cycle. Previously these values represented separate reads of the system clock. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall Signed-off-by: John Stultz --- include/linux/pps_kernel.h | 17 ++++++----------- kernel/time/timekeeping.c | 40 ++-------------------------------------- 2 files changed, 8 insertions(+), 49 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 54bf1484d41f..35ac903956c7 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } -#ifdef CONFIG_NTP_PPS - static inline void pps_get_ts(struct pps_event_time *ts) { - ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); -} + struct system_time_snapshot snap; -#else /* CONFIG_NTP_PPS */ - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_real_ts64(&ts->ts_real); + ktime_get_snapshot(&snap); + ts->ts_real = ktime_to_timespec64(snap.real); +#ifdef CONFIG_NTP_PPS + ts->ts_raw = ktime_to_timespec64(snap.raw); +#endif } -#endif /* CONFIG_NTP_PPS */ - /* Subtract known time delay from PPS event time(s) */ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 89b4695bd083..af19a49d5223 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -888,6 +888,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) s64 nsec_real; cycle_t now; + WARN_ON_ONCE(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -905,44 +907,6 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); -#ifdef CONFIG_NTP_PPS - -/** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. - */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - s64 nsecs_raw, nsecs_real; - - WARN_ON_ONCE(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; - - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); - - } while (read_seqcount_retry(&tk_core.seq, seq)); - - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); - -#endif /* CONFIG_NTP_PPS */ - /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit From 8006c24595cab106bcb9da12d35e32e14ff492df Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:22 -0800 Subject: time: Add driver cross timestamp interface for higher precision time synchronization ACKNOWLEDGMENT: cross timestamp code was developed by Thomas Gleixner . It has changed considerably and any mistakes are mine. The precision with which events on multiple networked systems can be synchronized using, as an example, PTP (IEEE 1588, 802.1AS) is limited by the precision of the cross timestamps between the system clock and the device (timestamp) clock. Precision here is the degree of simultaneity when capturing the cross timestamp. Currently the PTP cross timestamp is captured in software using the PTP device driver ioctl PTP_SYS_OFFSET. Reads of the device clock are interleaved with reads of the realtime clock. At best, the precision of this cross timestamp is on the order of several microseconds due to software latencies. Sub-microsecond precision is required for industrial control and some media applications. To achieve this level of precision hardware supported cross timestamping is needed. The function get_device_system_crosstimestamp() allows device drivers to return a cross timestamp with system time properly scaled to nanoseconds. The realtime value is needed to discipline that clock using PTP and the monotonic raw value is used for applications that don't require a "real" time, but need an unadjusted clock time. The get_device_system_crosstimestamp() code calls back into the driver to ensure that the system counter is within the current timekeeping update interval. Modern Intel hardware provides an Always Running Timer (ART) which is exactly related to TSC through a known frequency ratio. The ART is routed to devices on the system and is used to precisely and simultaneously capture the device clock with the ART. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Reworked to remove extra structures and simplify calling] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 35 ++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'kernel/time') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7817591af46f..4a2ca65fc778 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -279,6 +279,41 @@ struct system_time_snapshot { ktime_t raw; }; +/* + * struct system_device_crosststamp - system/device cross-timestamp + * (syncronized capture) + * @device: Device time + * @sys_realtime: Realtime simultaneous with device time + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + ktime_t device; + ktime_t sys_realtime; + ktime_t sys_monoraw; +}; + +/* + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. Used by + * timekeeping code to verify comparibility of two cycle values + */ +struct system_counterval_t { + cycle_t cycles; + struct clocksource *cs; +}; + +/* + * Get cross timestamp between system clock and device clock + */ +extern int get_device_system_crosststamp( + int (*get_time_fn)(ktime_t *device_time, + struct system_counterval_t *system_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp); + /* * Simultaneously snapshot realtime and monotonic raw clocks */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af19a49d5223..dba595cdb200 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -907,6 +907,62 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @sync_devicetime: Callback to get simultaneous device time and + * system counter from the device driver + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + unsigned long seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit From 2c756feb18d9ec258dbb3a3d11c47e28820690d7 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:23 -0800 Subject: time: Add history to cross timestamp interface supporting slower devices Another representative use case of time sync and the correlated clocksource (in addition to PTP noted above) is PTP synchronized audio. In a streaming application, as an example, samples will be sent and/or received by multiple devices with a presentation time that is in terms of the PTP master clock. Synchronizing the audio output on these devices requires correlating the audio clock with the PTP master clock. The more precise this correlation is, the better the audio quality (i.e. out of sync audio sounds bad). From an application standpoint, to correlate the PTP master clock with the audio device clock, the system clock is used as a intermediate timebase. The transforms such an application would perform are: System Clock <-> Audio clock System Clock <-> Network Device Clock [<-> PTP Master Clock] Modern Intel platforms can perform a more accurate cross timestamp in hardware (ART,audio device clock). The audio driver requires ART->system time transforms -- the same as required for the network driver. These platforms offload audio processing (including cross-timestamps) to a DSP which to ensure uninterrupted audio processing, communicates and response to the host only once every millsecond. As a result is takes up to a millisecond for the DSP to receive a request, the request is processed by the DSP, the audio output hardware is polled for completion, the result is copied into shared memory, and the host is notified. All of these operation occur on a millisecond cadence. This transaction requires about 2 ms, but under heavier workloads it may take up to 4 ms. Adding a history allows these slow devices the option of providing an ART value outside of the current interval. In this case, the callback provided is an accessor function for the previously obtained counter value. If get_system_device_crosststamp() receives a counter value previous to cycle_last, it consults the history provided as an argument in history_ref and interpolates the realtime and monotonic raw system time using the provided counter value. If there are any clock discontinuities, e.g. from calling settimeofday(), the monotonic raw time is interpolated in the usual way, but the realtime clock time is adjusted by scaling the monotonic raw adjustment. When an accessor function is used a history argument *must* be provided. The history is initialized using ktime_get_snapshot() and must be called before the counter values are read. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Fixed up cycles_t/cycle_t type confusion] Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 + include/linux/timekeeping.h | 5 ++ kernel/time/timekeeping.c | 171 +++++++++++++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 25247220b4b7..e88005459035 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval @@ -91,6 +92,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; ktime_t next_leap_ktime; struct timespec64 raw_time; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 4a2ca65fc778..96f37bee3bc1 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,11 +272,15 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { cycle_t cycles; ktime_t real; ktime_t raw; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /* @@ -312,6 +316,7 @@ extern int get_device_system_crosststamp( struct system_counterval_t *system_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dba595cdb200..931b0b1a71e9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -894,6 +895,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) seq = read_seqcount_begin(&tk_core.seq); now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; @@ -907,10 +910,123 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp - * @sync_devicetime: Callback to get simultaneous device time and + * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time @@ -920,13 +1036,18 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t *sys_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; unsigned long seq; + bool do_interp; int ret; do { @@ -946,6 +1067,22 @@ int get_device_system_crosststamp(int (*get_time_fn) */ if (tk->tkr_mono.clock != system_counterval.cs) return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); @@ -959,6 +1096,38 @@ int get_device_system_crosststamp(int (*get_time_fn) xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); -- cgit From 82e88ff1ea948d83125a8aaa7c9809f03ccc500f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 11:11:12 +0100 Subject: hrtimer: Revert CLOCK_MONOTONIC_RAW support Revert commits: a6e707ddbdf1: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW 9006a01829a5: hrtimer: Catch illegal clockids 9c808765e88e: hrtimer: Add support for CLOCK_MONOTONIC_RAW Marc found out, that there are fundamental issues with that patch series because __hrtimer_get_next_event() and hrtimer_forward() need support for CLOCK_MONOTONIC_RAW. Nothing which is easily fixed, so revert the whole lot. Reported-by: Marc Zyngier Link: http://lkml.kernel.org/r/56D6CEF0.8060607@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 18 ++---------------- virt/kvm/arm/arch_timer.c | 4 ++-- 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a6d64af5e73f..76dd4f0da5ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,7 +151,6 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, - HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cb0fe70f3c51..435b8850dd80 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,30 +90,19 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - { - .index = HRTIMER_BASE_MONOTONIC_RAW, - .clockid = CLOCK_MONOTONIC_RAW, - .get_time = &ktime_get_raw, - }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - int base = hrtimer_clock_to_base_table[clock_id]; - BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); - return base; + return hrtimer_clock_to_base_table[clock_id]; } /* @@ -1279,10 +1268,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) - basenow = ktime_get_raw(); - else - basenow = ktime_add(now, base->offset); + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 97c58153f923..69bca185c471 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer) static void timer_arm(struct arch_timer_cpu *timer, u64 ns) { timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns), + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), HRTIMER_MODE_ABS); } @@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; } -- cgit From 6436257b491cc0d456c39330dfc22126148d5ed7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 8 Mar 2016 11:09:53 +0100 Subject: time/timekeeping: Work around false positive GCC warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer GCC versions trigger the following warning: kernel/time/timekeeping.c: In function ‘get_device_system_crosststamp’: kernel/time/timekeeping.c:987:5: warning: ‘clock_was_set_seq’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (discontinuity) { ^ kernel/time/timekeeping.c:1045:15: note: ‘clock_was_set_seq’ was declared here unsigned int clock_was_set_seq; ^ GCC clearly is unable to recognize that the 'do_interp' boolean tracks the initialization status of 'clock_was_set_seq'. The GCC version used was: gcc version 5.3.1 20151207 (Red Hat 5.3.1-2) (GCC) Work it around by initializing clock_was_set_seq to 0. Compilers that are able to recognize the code flow will eliminate the unnecessary initialization. Acked-by: Thomas Gleixner Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 931b0b1a71e9..9c629bbed572 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1042,7 +1042,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; cycle_t cycles, now, interval_start; - unsigned int clock_was_set_seq; + unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; u8 cs_was_changed_seq; -- cgit From da8b44d5a9f8bf26da637b7336508ca534d6b319 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: timer: convert timer_slack_ns from unsigned long to u64 This patchset introduces a /proc//timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc//timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently a unsigned long. This means that on 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, its much much larger (~500 years). This disparity could make application development a little (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range. Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- fs/select.c | 8 ++++---- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 9 files changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel/time') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cde60741cad2..8a74a2a52e0f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; diff --git a/fs/select.c b/fs/select.c index 79d0d4953cad..869293988c2a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. */ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2ead22dd74a0..c98c6539e2c2 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -220,7 +220,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -378,7 +378,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -399,7 +399,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -477,10 +477,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index eb7f2f84009b..3284d07edec7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1792,8 +1792,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..cf8ba545c7d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..58a321c34cfb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -979,7 +979,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1548,7 +1548,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1724,7 +1724,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1792,7 +1792,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } -- cgit From 4cc7ecb7f2a60e8deb783b8fbf7c1ae467acb920 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:23:00 -0700 Subject: param: convert some "on"/"off" users to strtobool This changes several users of manual "on"/"off" parsing to use strtobool. Some side-effects: - these uses will now parse y/n/1/0 meaningfully too - the early_param uses will now bubble up parse errors Signed-off-by: Kees Cook Acked-by: Heiko Carstens Acked-by: Michael Ellerman Cc: Amitkumar Karwar Cc: Andy Shevchenko Cc: Daniel Borkmann Cc: Joe Perches Cc: Kalle Valo Cc: Martin Schwidefsky Cc: Nishant Sarmukadam Cc: Rasmus Villemoes Cc: Steve French Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/rtasd.c | 9 ++------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 10 ++-------- arch/s390/kernel/time.c | 8 ++------ arch/s390/kernel/topology.c | 7 ++----- arch/x86/kernel/aperture_64.c | 12 ++---------- include/linux/tick.h | 2 +- kernel/time/hrtimer.c | 10 ++-------- kernel/time/tick-sched.c | 10 ++-------- 8 files changed, 15 insertions(+), 53 deletions(-) (limited to 'kernel/time') diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 5a2c049c1c61..aa610ce8742f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -49,7 +49,7 @@ static unsigned int rtas_error_log_buffer_max; static unsigned int event_scan; static unsigned int rtas_event_scan_rate; -static int full_rtas_msgs = 0; +static bool full_rtas_msgs; /* Stop logging to nvram after first fatal error */ static int logging_enabled; /* Until we initialize everything, @@ -592,11 +592,6 @@ __setup("surveillance=", surveillance_setup); static int __init rtasmsgs_setup(char *str) { - if (strcmp(str, "on") == 0) - full_rtas_msgs = 1; - else if (strcmp(str, "off") == 0) - full_rtas_msgs = 0; - - return 1; + return (kstrtobool(str, &full_rtas_msgs) == 0); } __setup("rtasmsgs=", rtasmsgs_setup); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 32274f72fe3f..282837a1d74b 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -47,20 +47,14 @@ static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; -static int cede_offline_enabled __read_mostly = 1; +static bool cede_offline_enabled __read_mostly = true; /* * Enable/disable cede_offline when available. */ static int __init setup_cede_offline(char *str) { - if (!strcmp(str, "off")) - cede_offline_enabled = 0; - else if (!strcmp(str, "on")) - cede_offline_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &cede_offline_enabled) == 0); } __setup("cede_offline=", setup_cede_offline); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c4e5f183f225..9409d32f285e 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1432,7 +1432,7 @@ device_initcall(etr_init_sysfs); /* * Server Time Protocol (STP) code. */ -static int stp_online; +static bool stp_online; static struct stp_sstpi stp_info; static void *stp_page; @@ -1443,11 +1443,7 @@ static struct timer_list stp_timer; static int __init early_parse_stp(char *p) { - if (strncmp(p, "off", 3) == 0) - stp_online = 0; - else if (strncmp(p, "on", 2) == 0) - stp_online = 1; - return 0; + return kstrtobool(p, &stp_online); } early_param("stp", early_parse_stp); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 40b8102fdadb..64298a867589 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -37,7 +37,7 @@ static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; -static int topology_enabled = 1; +static bool topology_enabled = true; static DECLARE_WORK(topology_work, topology_work_fn); /* @@ -444,10 +444,7 @@ static const struct cpumask *cpu_book_mask(int cpu) static int __init early_parse_topology(char *p) { - if (strncmp(p, "off", 3)) - return 0; - topology_enabled = 0; - return 0; + return kstrtobool(p, &topology_enabled); } early_param("topology", early_parse_topology); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e85f713641d..0a2bb1f62e72 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) return 0; } -static int gart_fix_e820 __initdata = 1; +static bool gart_fix_e820 __initdata = true; static int __init parse_gart_mem(char *p) { - if (!p) - return -EINVAL; - - if (!strncmp(p, "off", 3)) - gart_fix_e820 = 0; - else if (!strncmp(p, "on", 2)) - gart_fix_e820 = 1; - - return 0; + return kstrtobool(p, &gart_fix_e820); } early_param("gart_fix_e820", parse_gart_mem); diff --git a/include/linux/tick.h b/include/linux/tick.h index 21f73649a4dc..62be0786d6d0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -111,7 +111,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #ifdef CONFIG_NO_HZ_COMMON -extern int tick_nohz_enabled; +extern bool tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 58a321c34cfb..fa0b983290cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? */ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 969e6704c3c9..195fe7d2caad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -486,20 +486,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); -- cgit From a395d6a7e3d6e3d1d316376db0c4c8b5d2995930 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 22 Mar 2016 14:28:09 -0700 Subject: kernel/...: convert pr_warning to pr_warn Use the more common logging method with the eventual goal of removing pr_warning altogether. Miscellanea: - Realign arguments - Coalesce formats - Add missing space between a few coalesced formats Signed-off-by: Joe Perches Acked-by: Rafael J. Wysocki [kernel/power/suspend.c] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 ++-- kernel/power/suspend.c | 3 +-- kernel/time/tick-sched.c | 8 ++++---- kernel/trace/blktrace.c | 4 ++-- kernel/trace/ftrace.c | 7 +++---- kernel/trace/trace.c | 40 ++++++++++++++++++------------------ kernel/trace/trace_functions_graph.c | 6 +++--- kernel/trace/trace_kprobe.c | 27 ++++++++++-------------- kernel/trace/trace_mmiotrace.c | 2 +- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_stat.c | 3 +-- kernel/trace/trace_uprobe.c | 2 +- kernel/tracepoint.c | 2 +- 13 files changed, 52 insertions(+), 60 deletions(-) (limited to 'kernel/time') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 64731e84c982..cc1cc641d653 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1322,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 230a77225e2e..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 195fe7d2caad..084b79f5917e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -378,7 +378,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ -446,8 +446,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -457,7 +456,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2aeb6ffc0a1e..f94e7a21f52d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1437,12 +1437,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..2ece9f1a3e5a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1058,8 +1058,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -2314,8 +2313,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..032b388bea66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2071,20 +2071,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! **\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -4101,7 +4101,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -6131,7 +6131,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6318,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6337,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -7248,8 +7248,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..91d6a63a2ea7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1350,7 +1350,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1468,12 +1468,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 21b81a41dae5..919e0ddd8fcc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -459,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -529,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -564,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -1336,16 +1333,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. */ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..7915142c89e4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } -- cgit From 69b27baf00fa9b7b14b3263c105390d1683425b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 25 Mar 2016 14:20:21 -0700 Subject: sched: add schedule_timeout_idle() This will be needed in the patch "mm, oom: introduce oom reaper". Acked-by: Michal Hocko Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/time/timer.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'kernel/time') diff --git a/include/linux/sched.h b/include/linux/sched.h index 589c4780b077..478b41de7f7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d1798fa0c743..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { -- cgit