From e8b175946c16d7001b22620f52d78ab497efc9d0 Mon Sep 17 00:00:00 2001
From: Shaibal Dutta
Date: Fri, 31 Jan 2014 11:18:24 -0800
Subject: timekeeping: Move clock sync work to power efficient workqueue

For better use of CPU idle time, allow the scheduler to select the CPU
on which the CMOS clock sync work would be scheduled. This improves
idle residency time and conserves power.

This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is
selected.

Signed-off-by: Shaibal Dutta
[zoran.markovic@linaro.org: Added commit message. Aligned code.]
Signed-off-by: Zoran Markovic
Cc: John Stultz
Link: http://lkml.kernel.org/r/1391195904-12497-1-git-send-email-zoran.markovic@linaro.org
Signed-off-by: Thomas Gleixner
---
 kernel/time/ntp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4f3d55..419a52cecd20 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work)
         next.tv_sec++;
         next.tv_nsec -= NSEC_PER_SEC;
     }
-    schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+    queue_delayed_work(system_power_efficient_wq,
+               &sync_cmos_work, timespec_to_jiffies(&next));
 }

 void ntp_notify_cmos_timer(void)
 {
-    schedule_delayed_work(&sync_cmos_work, 0);
+    queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
 }

 #else
--
cgit
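The same pattern applies to any non-urgent deferred work. A minimal
sketch of a driver-side user (hypothetical names, not part of this
series; without CONFIG_WQ_POWER_EFFICIENT the queue behaves like a
regular per-cpu workqueue):

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    static void my_sync_fn(struct work_struct *work)
    {
            /* non-urgent housekeeping; any CPU may run this */
    }
    static DECLARE_DELAYED_WORK(my_sync_work, my_sync_fn);

    static void my_arm_sync(void)
    {
            /*
             * With CONFIG_WQ_POWER_EFFICIENT=y the scheduler picks a
             * power-convenient CPU instead of pinning the work to the
             * submitting CPU.
             */
            queue_delayed_work(system_power_efficient_wq, &my_sync_work,
                               round_jiffies_relative(10 * HZ));
    }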
From 627ee7947e2e83ba565c31c5c9373d6e364b1ecd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 3 Feb 2014 14:34:31 -0800
Subject: clockevents: Serialize calls to clockevents_update_freq() in the core

We can identify the broadcast device in the core and serialize all
callers including interrupts on a different CPU against the update.
Also, disabling interrupts is moved into the core allowing callers to
leave interrupts enabled when calling clockevents_update_freq().

Signed-off-by: Soren Brinkmann
Cc: linux-arm-kernel@lists.infradead.org
Cc: Soeren Brinkmann
Cc: Daniel Lezcano
Cc: Michal Simek
Link: http://lkml.kernel.org/r/1391466877-28908-2-git-send-email-soren.brinkmann@xilinx.com
Signed-off-by: Thomas Gleixner
---
 kernel/time/clockevents.c    | 29 ++++++++++++++++++++++-------
 kernel/time/tick-broadcast.c | 25 +++++++++++++++++++------
 kernel/time/tick-internal.h  |  4 ++++
 3 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad6043bcb..641d91003a45 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -439,6 +439,16 @@ void clockevents_config_and_register(struct clock_event_device *dev,
 }
 EXPORT_SYMBOL_GPL(clockevents_config_and_register);

+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+    clockevents_config(dev, freq);
+
+    if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+        return 0;
+
+    return clockevents_program_event(dev, dev->next_event, false);
+}
+
 /**
  * clockevents_update_freq - Update frequency and reprogram a clock event device.
  * @dev:    device to modify
@@ -446,17 +456,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register);
  *
  * Reconfigure and reprogram a clock event device in oneshot
  * mode. Must be called on the cpu for which the device delivers per
- * cpu timer events with interrupts disabled! Returns 0 on success,
- * -ETIME when the event is in the past.
+ * cpu timer events. If called for the broadcast device the core takes
+ * care of serialization.
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
  */
 int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 {
-    clockevents_config(dev, freq);
-
-    if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
-        return 0;
+    unsigned long flags;
+    int ret;

-    return clockevents_program_event(dev, dev->next_event, false);
+    local_irq_save(flags);
+    ret = tick_broadcast_update_freq(dev, freq);
+    if (ret == -ENODEV)
+        ret = __clockevents_update_freq(dev, freq);
+    local_irq_restore(flags);
+    return ret;
 }

 /*
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 43780ab5e279..003e6c3663b1 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
     return (dev && tick_broadcast_device.evtdev == dev);
 }

+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
+{
+    int ret = -ENODEV;
+
+    if (tick_is_broadcast_device(dev)) {
+        raw_spin_lock(&tick_broadcast_lock);
+        ret = __clockevents_update_freq(dev, freq);
+        raw_spin_unlock(&tick_broadcast_lock);
+    }
+    return ret;
+}
+
+
 static void err_broadcast(const struct cpumask *mask)
 {
     pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
@@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask)
  */
 static void tick_do_periodic_broadcast(void)
 {
-    raw_spin_lock(&tick_broadcast_lock);
-
     cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
     tick_do_broadcast(tmpmask);
-
-    raw_spin_unlock(&tick_broadcast_lock);
 }

 /*
@@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
     ktime_t next;

+    raw_spin_lock(&tick_broadcast_lock);
+
     tick_do_periodic_broadcast();

     /*
      * The device is in periodic mode. No reprogramming necessary:
      */
     if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
-        return;
+        goto unlock;

     /*
      * Setup the next period for devices, which do not have
@@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
         next = ktime_add(next, tick_period);

         if (!clockevents_program_event(dev, next, false))
-            return;
+            goto unlock;
         tick_do_periodic_broadcast();
     }
+unlock:
+    raw_spin_unlock(&tick_broadcast_lock);
 }

 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 8329669b51ec..26f1c0ba9d81 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void);
 extern void tick_broadcast_init(void);
 extern void
 tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);

 #else /* !BROADCAST */

@@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
 static inline void tick_suspend_broadcast(void) { }
 static inline int tick_resume_broadcast(void) { return 0; }
 static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
+                         u32 freq) { return -ENODEV; }

 /*
  * Set the periodic handler in non broadcast mode
  */
@@ -154,5 +157,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)

 #endif

+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern void do_timer(unsigned long ticks);
 extern void update_wall_time(void);
--
cgit
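With serialization and interrupt disabling handled in the core, a
clock event driver whose input clock changes rate can now call
clockevents_update_freq() from thread context, e.g. from a common
clock framework notifier. A sketch of such a caller (hypothetical
driver, invented names; not part of this series):

    #include <linux/clk.h>
    #include <linux/clockchips.h>
    #include <linux/notifier.h>

    static struct clock_event_device my_ced;    /* hypothetical device */

    static int my_rate_change_cb(struct notifier_block *nb,
                                 unsigned long event, void *data)
    {
            struct clk_notifier_data *ndata = data;

            if (event != POST_RATE_CHANGE)
                    return NOTIFY_DONE;

            /*
             * May be called with interrupts enabled; the core disables
             * them and serializes against the broadcast device.
             */
            clockevents_update_freq(&my_ced, ndata->new_rate);
            return NOTIFY_OK;
    }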
From fe79a9ba11962a603fb6af68fcb476e64031e46c Mon Sep 17 00:00:00 2001
From: Soren Brinkmann
Date: Mon, 3 Feb 2014 14:34:32 -0800
Subject: clockevents: Adjust timer interval when frequency changes

clockevent devices in periodic mode are not updated when the frequency
of the device changes. Issue a dev->set_mode() callback which forces
the device to reevaluate the timer settings.

Signed-off-by: Soren Brinkmann
Cc: linux-arm-kernel@lists.infradead.org
Cc: Daniel Lezcano
Cc: Michal Simek
Link: http://lkml.kernel.org/r/1391466877-28908-3-git-send-email-soren.brinkmann@xilinx.com
Signed-off-by: Thomas Gleixner
---
 kernel/time/clockevents.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 641d91003a45..f85e5fda9c66 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -443,10 +443,13 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 {
     clockevents_config(dev, freq);

-    if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
-        return 0;
+    if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+        return clockevents_program_event(dev, dev->next_event, false);
+
+    if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+        dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);

-    return clockevents_program_event(dev, dev->next_event, false);
+    return 0;
 }

 /**
--
cgit

From da7e6f45c34d39186b72328bacc4dd86bff60e0a Mon Sep 17 00:00:00 2001
From: Preeti U Murthy
Date: Fri, 7 Feb 2014 13:36:06 +0530
Subject: time: Change the return type of clockevents_notify() to integer

The broadcast framework can potentially be made use of by archs which
do not have an external clock device as well. Then, it is required
that one of the CPUs needs to handle the broadcasting of wakeup IPIs
to the CPUs in deep idle. As a result its local timers should remain
functional all the time. For such a CPU, the BROADCAST_ENTER
notification has to fail indicating that its clock device cannot be
shut down. To make way for this support, change the return type of
tick_broadcast_oneshot_control() and hence clockevents_notify() to
indicate such scenarios.

Signed-off-by: Preeti U Murthy
Cc: deepthi@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: fweisbec@gmail.com
Cc: paulus@samba.org
Cc: srivatsa.bhat@linux.vnet.ibm.com
Cc: svaidy@linux.vnet.ibm.com
Cc: peterz@infradead.org
Cc: benh@kernel.crashing.org
Cc: rafael.j.wysocki@intel.com
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20140207080606.17187.78306.stgit@preeti.in.ibm.com
Signed-off-by: Thomas Gleixner
---
 include/linux/clockchips.h   |  6 +++---
 kernel/time/clockevents.c    |  8 +++++---
 kernel/time/tick-broadcast.c |  6 ++++--
 kernel/time/tick-internal.h  |  6 +++---
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 493aa021c7a9..e0c5a6c418de 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; }
 #endif

 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-extern void clockevents_notify(unsigned long reason, void *arg);
+extern int clockevents_notify(unsigned long reason, void *arg);
 #else
-static inline void clockevents_notify(unsigned long reason, void *arg) {}
+static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 #endif

 #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */
@@ -196,7 +196,7 @@ static inline void clockevents_notify(unsigned long reason, void *arg) {}

 static inline void clockevents_suspend(void) {}
 static inline void clockevents_resume(void) {}
-static inline void clockevents_notify(unsigned long reason, void *arg) {}
+static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 static inline int tick_check_broadcast_expired(void) { return 0; }

 #endif
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f85e5fda9c66..ad362c260ef4 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -542,12 +542,13 @@ void clockevents_resume(void)
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 /**
  * clockevents_notify - notification about relevant events
+ * Returns 0 on success, any other value on error
  */
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
 {
     struct clock_event_device *dev, *tmp;
     unsigned long flags;
-    int cpu;
+    int cpu, ret = 0;

     raw_spin_lock_irqsave(&clockevents_lock, flags);

@@ -560,7 +561,7 @@ void clockevents_notify(unsigned long reason, void *arg)

     case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
     case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
-        tick_broadcast_oneshot_control(reason);
+        ret = tick_broadcast_oneshot_control(reason);
         break;

     case CLOCK_EVT_NOTIFY_CPU_DYING:
@@ -603,6 +604,7 @@ void clockevents_notify(unsigned long reason, void *arg)
         break;
     }
     raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+    return ret;
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 003e6c3663b1..84c8fd91d744 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -646,14 +646,15 @@ again:
 /*
  * Powerstate information: The system enters/leaves a state, where
  * affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
  */
-void tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(unsigned long reason)
 {
     struct clock_event_device *bc, *dev;
     struct tick_device *td;
     unsigned long flags;
     ktime_t now;
-    int cpu;
+    int cpu, ret = 0;

     /*
      * Periodic mode does not care about the enter/exit of power
@@ -759,6 +760,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
     }
 out:
     raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+    return ret;
 }

 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 26f1c0ba9d81..0756c62c219a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -46,7 +46,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 extern void tick_resume_oneshot(void);
 # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern int tick_broadcast_oneshot_control(unsigned long reason);
 extern void tick_broadcast_switch_to_oneshot(void);
 extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
 extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -58,7 +58,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
     BUG();
 }
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
 static inline void tick_broadcast_switch_to_oneshot(void) { }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
 static inline int tick_broadcast_oneshot_active(void) { return 0; }
@@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
     BUG();
 }
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
 static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
--
cgit
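The next patch in the series wires this return value into cpuidle; in
general, any idle-entry path relying on broadcast wakeups can now back
out gracefully. An illustrative caller (sketch only, invented function
name):

    #include <linux/clockchips.h>

    static int my_enter_deep_idle(int cpu)
    {
            /*
             * A non-zero return means this CPU must stay awake to
             * broadcast wakeups to the other CPUs in deep idle.
             */
            if (clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu))
                    return -EBUSY;

            /* ... architecture specific deep idle entry ... */

            clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
            return 0;
    }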
From ba8f20c2eb4158a443e9d6a909aee5010efa0c69 Mon Sep 17 00:00:00 2001
From: Preeti U Murthy
Date: Fri, 7 Feb 2014 13:36:52 +0530
Subject: cpuidle: Handle clockevents_notify(BROADCAST_ENTER) failure

Some archs set the CPUIDLE_FLAG_TIMER_STOP flag for idle states in
which the local timers stop. The cpuidle_idle_call() currently handles
such idle states by calling into the broadcast framework so as to wake
up CPUs at their next wakeup event. With the hrtimer mode of
broadcast, the BROADCAST_ENTER call into the broadcast framework can
fail for archs that do not have an external clock device to handle
wakeups and the CPU in question thus has to be made the standby CPU.
This patch handles such cases by failing the call into cpuidle so that
the arch can take some default action. The arch will certainly not
enter a similar idle state because a failed cpuidle call will also
implicitly indicate that the broadcast framework has not registered
this CPU to be woken up. Hence we are safe if we fail the cpuidle
call.

In the process move the functions that trace idle statistics just
before and after the entry and exit into idle states respectively. In
other scenarios where the call to cpuidle fails, we end up not tracing
idle entry and exit since a decision on an idle state could not be
taken. Similarly, when the call into the broadcast framework fails, we
skip tracing idle statistics because we are in no further position to
take a decision on an alternative idle state to enter into.

Signed-off-by: Preeti U Murthy
Cc: deepthi@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: fweisbec@gmail.com
Cc: paulus@samba.org
Cc: srivatsa.bhat@linux.vnet.ibm.com
Cc: svaidy@linux.vnet.ibm.com
Cc: peterz@infradead.org
Cc: benh@kernel.crashing.org
Acked-by: Rafael J. Wysocki
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20140207080652.17187.66344.stgit@preeti.in.ibm.com
Signed-off-by: Thomas Gleixner
---
 drivers/cpuidle/cpuidle.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a55e68f2cfc8..09d05ab262be 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -140,12 +140,14 @@ int cpuidle_idle_call(void)
         return 0;
     }

-    trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
     broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);

-    if (broadcast)
-        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+    if (broadcast &&
+        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+        return -EBUSY;
+
+
+    trace_cpu_idle_rcuidle(next_state, dev->cpu);

     if (cpuidle_state_is_coupled(dev, drv, next_state))
         entered_state = cpuidle_enter_state_coupled(dev, drv,
@@ -153,11 +155,11 @@ int cpuidle_idle_call(void)
     else
         entered_state = cpuidle_enter_state(dev, drv, next_state);

+    trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
     if (broadcast)
         clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);

-    trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
-
     /* give the governor an opportunity to reflect on the outcome */
     if (cpuidle_curr_governor->reflect)
         cpuidle_curr_governor->reflect(dev, entered_state);
--
cgit

From 5d1638acb9f62fa7eb0c07cb85318bbe1f13b227 Mon Sep 17 00:00:00 2001
From: Preeti U Murthy
Date: Fri, 7 Feb 2014 13:36:32 +0530
Subject: tick: Introduce hrtimer based broadcast

On some architectures, in certain CPU deep idle states the local
timers stop. An external clock device is used to wake up these CPUs.
The kernel support for the wakeup of these CPUs is provided by the
tick broadcast framework by using the external clock device as the
wakeup source.

However not all implementations of architectures provide such an
external clock device. This patch includes support in the broadcast
framework to handle the wakeup of the CPUs in deep idle states on such
systems by queuing a hrtimer on one of the CPUs, which is meant to
handle the wakeup of CPUs in deep idle states.

This patchset introduces a pseudo clock device which can be registered
by the archs as tick_broadcast_device in the absence of a real
external clock device. Once registered, the broadcast framework will
work as is for these architectures as long as the archs take care of
the BROADCAST_ENTER notification failing for one of the CPUs. This CPU
is made the standby CPU to handle wakeup of the CPUs in deep idle and
it *must not enter deep idle states*.

The CPU with the earliest wakeup is chosen to be this CPU. This way
the standby CPU dynamically moves around and so does the hrtimer which
is queued to trigger at the next earliest wakeup time. This is
consistent with the case where an external clock device is present.
The smp affinity of this clock device is set to the CPU with the
earliest wakeup. This patchset handles the hotplug of the standby CPU
as well by moving the hrtimer on to the CPU handling the CPU_DEAD
notification.

Originally-from: Thomas Gleixner
Signed-off-by: Preeti U Murthy
Cc: deepthi@linux.vnet.ibm.com
Cc: paulmck@linux.vnet.ibm.com
Cc: fweisbec@gmail.com
Cc: paulus@samba.org
Cc: srivatsa.bhat@linux.vnet.ibm.com
Cc: svaidy@linux.vnet.ibm.com
Cc: peterz@infradead.org
Cc: benh@kernel.crashing.org
Cc: rafael.j.wysocki@intel.com
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20140207080632.17187.80532.stgit@preeti.in.ibm.com
Signed-off-by: Thomas Gleixner
---
 include/linux/clockchips.h           |   9 +++
 kernel/time/Makefile                 |   2 +-
 kernel/time/tick-broadcast-hrtimer.c | 106 +++++++++++++++++++++++++++++++++++
 kernel/time/tick-broadcast.c         |  54 +++++++++++++++++-
 4 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 kernel/time/tick-broadcast-hrtimer.c

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index e0c5a6c418de..dbe9e1457168 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -62,6 +62,11 @@ enum clock_event_mode {
 #define CLOCK_EVT_FEAT_DYNIRQ        0x000020
 #define CLOCK_EVT_FEAT_PERCPU        0x000040

+/*
+ * Clockevent device is based on a hrtimer for broadcast
+ */
+#define CLOCK_EVT_FEAT_HRTIMER        0x000080
+
 /**
  * struct clock_event_device - clock event device descriptor
  * @event_handler:    Assigned by the framework to be called by the low
@@ -83,6 +88,7 @@ enum clock_event_mode {
  * @name:        ptr to clock event name
  * @rating:        variable to rate clock event devices
  * @irq:        IRQ number (only for non CPU local devices)
+ * @bound_on:        Bound on CPU
  * @cpumask:        cpumask to indicate for which CPUs this device works
  * @list:        list head for the management code
  * @owner:        module reference
@@ -113,6 +119,7 @@ struct clock_event_device {
     const char        *name;
     int            rating;
     int            irq;
+    int            bound_on;
     const struct cpumask    *cpumask;
     struct list_head    list;
     struct module        *owner;
@@ -180,9 +187,11 @@ extern int tick_receive_broadcast(void);
 #endif

 #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
+extern void tick_setup_hrtimer_broadcast(void);
 extern int tick_check_broadcast_expired(void);
 #else
 static inline int tick_check_broadcast_expired(void) { return 0; }
+static void tick_setup_hrtimer_broadcast(void) {};
 #endif

 #ifdef CONFIG_GENERIC_CLOCKEVENTS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 9250130646f5..06151ef4a744 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -3,7 +3,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o

 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)        += clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)        += tick-common.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)    += tick-broadcast.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)    += tick-broadcast.o tick-broadcast-hrtimer.o
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)        += sched_clock.o
 obj-$(CONFIG_TICK_ONESHOT)            += tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT)            += tick-sched.o
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 000000000000..92425279312b
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,106 @@
+/*
+ * linux/kernel/time/tick-broadcast-hrtimer.c
+ * This file emulates a local clock event device
+ * via a pseudo clock device.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/clockchips.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+            struct clock_event_device *bc)
+{
+    switch (mode) {
+    case CLOCK_EVT_MODE_SHUTDOWN:
+        /*
+         * Note, we cannot cancel the timer here as we might
+         * run into the following live lock scenario:
+         *
+         * cpu 0        cpu1
+         * lock(broadcast_lock);
+         *            hrtimer_interrupt()
+         *            bc_handler()
+         *               tick_handle_oneshot_broadcast();
+         *                lock(broadcast_lock);
+         * hrtimer_cancel()
+         *  wait_for_callback()
+         */
+        hrtimer_try_to_cancel(&bctimer);
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+    /*
+     * We try to cancel the timer first. If the callback is on
+     * flight on some other cpu then we let it handle it. If we
+     * were able to cancel the timer nothing can rearm it as we
+     * own broadcast_lock.
+     *
+     * However we can also be called from the event handler of
+     * ce_broadcast_hrtimer itself when it expires. We cannot
+     * restart the timer because we are in the callback, but we
+     * can set the expiry time and let the callback return
+     * HRTIMER_RESTART.
+     */
+    if (hrtimer_try_to_cancel(&bctimer) >= 0) {
+        hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+        /* Bind the "device" to the cpu */
+        bc->bound_on = smp_processor_id();
+    } else if (bc->bound_on == smp_processor_id()) {
+        hrtimer_set_expires(&bctimer, expires);
+    }
+    return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+    .set_mode        = bc_set_mode,
+    .set_next_ktime        = bc_set_next,
+    .features        = CLOCK_EVT_FEAT_ONESHOT |
+                  CLOCK_EVT_FEAT_KTIME |
+                  CLOCK_EVT_FEAT_HRTIMER,
+    .rating            = 0,
+    .bound_on        = -1,
+    .min_delta_ns        = 1,
+    .max_delta_ns        = KTIME_MAX,
+    .min_delta_ticks    = 1,
+    .max_delta_ticks    = KTIME_MAX,
+    .mult            = 1,
+    .shift            = 0,
+    .cpumask        = cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+    ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+
+    if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+        return HRTIMER_NORESTART;
+
+    return HRTIMER_RESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+    hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+    bctimer.function = bc_handler;
+    clockevents_register_device(&ce_broadcast_hrtimer);
+}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 84c8fd91d744..63c7b2d9ed8e 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -643,6 +643,42 @@ again:
     raw_spin_unlock(&tick_broadcast_lock);
 }

+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+    if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+        return 0;
+    if (bc->next_event.tv64 == KTIME_MAX)
+        return 0;
+    return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+                     struct clock_event_device *dev)
+{
+    /*
+     * For hrtimer based broadcasting we cannot shutdown the cpu
+     * local device if our own event is the first one to expire or
+     * if we own the broadcast timer.
+     */
+    if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+        if (broadcast_needs_cpu(bc, smp_processor_id()))
+            return;
+        if (dev->next_event.tv64 < bc->next_event.tv64)
+            return;
+    }
+    clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+}
+
+static void broadcast_move_bc(int deadcpu)
+{
+    struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+    if (!bc || !broadcast_needs_cpu(bc, deadcpu))
+        return;
+    /* This moves the broadcast assignment to this cpu */
+    clockevents_program_event(bc, bc->next_event, 1);
+}
+
 /*
  * Powerstate information: The system enters/leaves a state, where
  * affected devices might stop
@@ -661,7 +697,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
      * states
      */
     if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
-        return;
+        return 0;

     /*
      * We are called with preemtion disabled from the depth of the
@@ -672,7 +708,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
     dev = td->evtdev;

     if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-        return;
+        return 0;

     bc = tick_broadcast_device.evtdev;

@@ -680,7 +716,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
     if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
         if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
             WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
-            clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+            broadcast_shutdown_local(bc, dev);
             /*
              * We only reprogram the broadcast timer if we
              * did not mark ourself in the force mask and
@@ -693,6 +729,16 @@ int tick_broadcast_oneshot_control(unsigned long reason)
                 dev->next_event.tv64 < bc->next_event.tv64)
                 tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
         }
+        /*
+         * If the current CPU owns the hrtimer broadcast
+         * mechanism, it cannot go deep idle and we remove the
+         * CPU from the broadcast mask. We don't have to go
+         * through the EXIT path as the local timer is not
+         * shutdown.
+         */
+        ret = broadcast_needs_cpu(bc, cpu);
+        if (ret)
+            cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
     } else {
         if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
             clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -866,6 +912,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
     cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
     cpumask_clear_cpu(cpu, tick_broadcast_force_mask);

+    broadcast_move_bc(cpu);
+
     raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }

--
cgit
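An architecture without an always-on external clock device opts into
this by registering the pseudo device during boot. A sketch of the
arch-side hook-up (the init function name is invented;
tick_setup_hrtimer_broadcast() is the entry point added above):

    #include <linux/clockchips.h>
    #include <linux/init.h>

    void __init my_arch_time_init(void)
    {
            /* ... register the per-cpu local clock event devices ... */

            /*
             * Register the hrtimer based pseudo broadcast device. The
             * arch must then expect BROADCAST_ENTER to fail (-EBUSY)
             * on the CPU that currently owns the broadcast hrtimer.
             */
            tick_setup_hrtimer_broadcast();
    }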
From f1689bb7abec8e2e670d8ad11eaa86d54bad8cfd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 7 Feb 2014 16:00:46 +0100
Subject: time: Fixup fallout from recent clockevent/tick changes

Make the stub function static inline instead of static and move the
clockevents related function into the proper ifdeffed section.

Reported-by: Fengguang Wu
Signed-off-by: Thomas Gleixner
Cc: Soren Brinkmann
Cc: Preeti U Murthy
---
 include/linux/clockchips.h  | 2 +-
 kernel/time/tick-internal.h | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index dbe9e1457168..20a7183f2831 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -191,7 +191,7 @@ extern void tick_setup_hrtimer_broadcast(void);
 extern int tick_check_broadcast_expired(void);
 #else
 static inline int tick_check_broadcast_expired(void) { return 0; }
-static void tick_setup_hrtimer_broadcast(void) {};
+static inline void tick_setup_hrtimer_broadcast(void) {};
 #endif

 #ifdef CONFIG_GENERIC_CLOCKEVENTS
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 0756c62c219a..7ab92b19965a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -155,8 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
     return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }

+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
+
 #endif

-int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern void do_timer(unsigned long ticks);
 extern void update_wall_time(void);
--
cgit

From 849401b66d305f3feb75b6db7459b95ad190552a Mon Sep 17 00:00:00 2001
From: Preeti U Murthy
Date: Sun, 9 Feb 2014 11:32:22 +0530
Subject: tick: Fixup more fallout from hrtimer broadcast mode

The hrtimer mode of broadcast is supported only when
GENERIC_CLOCKEVENTS_BROADCAST and TICK_ONESHOT config options are
enabled. Hence compile in the functions for hrtimer mode of broadcast
only when these options are selected. Also fix max_delta_ticks value
for the pseudo clock device.

Reported-by: Fengguang Wu
Reported-by: Ingo Molnar
Signed-off-by: Preeti U Murthy
Link: http://lkml.kernel.org/r/52F719EE.9010304@linux.vnet.ibm.com
Signed-off-by: Thomas Gleixner
---
 include/linux/clockchips.h           | 1 +
 kernel/time/Makefile                 | 5 ++++-
 kernel/time/tick-broadcast-hrtimer.c | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 20a7183f2831..2e4cb67f6e56 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -207,6 +207,7 @@ static inline void clockevents_resume(void) {}
 static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 static inline int tick_check_broadcast_expired(void) { return 0; }
+static inline void tick_setup_hrtimer_broadcast(void) {};

 #endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 06151ef4a744..57a413fd0ebf 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o

 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)        += clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)        += tick-common.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)    += tick-broadcast.o tick-broadcast-hrtimer.o
+ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
+ obj-y                        += tick-broadcast.o
+ obj-$(CONFIG_TICK_ONESHOT)            += tick-broadcast-hrtimer.o
+endif
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)        += sched_clock.o
 obj-$(CONFIG_TICK_ONESHOT)            += tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT)            += tick-sched.o
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 92425279312b..eb682d5c697c 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -82,7 +82,7 @@ static struct clock_event_device ce_broadcast_hrtimer = {
     .min_delta_ns        = 1,
     .max_delta_ns        = KTIME_MAX,
     .min_delta_ticks    = 1,
-    .max_delta_ticks    = KTIME_MAX,
+    .max_delta_ticks    = ULONG_MAX,
     .mult            = 1,
     .shift            = 0,
     .cpumask        = cpu_all_mask,
--
cgit
From 8ba14654282ed6bb386d0a2f1ab329bfb293403f Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Mon, 10 Feb 2014 17:09:54 +0100
Subject: timer: Spare IPI when deferrable timer is queued on idle remote targets

When a timer is enqueued or modified on a remote target, the latter is
expected to see and handle this timer on its next tick. However if the
target is idle and CONFIG_NO_HZ_IDLE=y, the CPU may be sleeping
tickless and the timer may be ignored.

wake_up_nohz_cpu() takes care of that by setting TIF_NEED_RESCHED and
sending an IPI to idle targets so that the tick is reevaluated on the
idle loop through the tick_nohz_idle_*() APIs.

Now this is all performed regardless of the power properties of the
timer. If the timer is deferrable, idle targets don't need to be woken
up. Only the next busy tick needs to care about it, and no IPI kick is
needed for that to happen.

So let's spare the IPI on idle targets when the timer is deferrable.

Meanwhile we keep the current behaviour on full dynticks targets. We
can spare IPIs on idle full dynticks targets as well but some tricky
races against idle_cpu() must be dealt with all along to make sure
that the timer is well handled after idle exit. We can deal with that
later since NO_HZ_FULL already has more important powersaving issues.

Reported-by: Thomas Gleixner
Signed-off-by: Viresh Kumar
Cc: Ingo Molnar
Cc: Paul Gortmaker
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/CAKohpomMZ0TAN2e6N76_g4ZRzxd5vZ1XfuZfxrP7GMxfTNiLVw@mail.gmail.com
Signed-off-by: Frederic Weisbecker
---
 kernel/timer.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/timer.c b/kernel/timer.c
index accfd241b9e5..b75e7893be14 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -939,8 +939,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
      * with the timer by holding the timer base lock. This also
      * makes sure that a CPU on the way to stop its tick can not
      * evaluate the timer wheel.
+     *
+     * Spare the IPI for deferrable timers on idle targets though.
+     * The next busy ticks will take care of it. Except full dynticks
+     * require special care against races with idle_cpu(), lets deal
+     * with that later.
      */
-    wake_up_nohz_cpu(cpu);
+    if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
+        wake_up_nohz_cpu(cpu);
+
     spin_unlock_irqrestore(&base->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_timer_on);
--
cgit
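For reference, a deferrable timer armed on a possibly idle remote CPU
now rides the target's next busy tick instead of forcing a wakeup. A
sketch of such a user (hypothetical names, 3.14-era timer API):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static void my_stats_flush(unsigned long data)
    {
            /* housekeeping that can wait until the target is busy anyway */
    }

    static struct timer_list my_timer;

    static void my_arm_remote_timer(int cpu)
    {
            init_timer_deferrable(&my_timer);
            my_timer.function = my_stats_flush;
            my_timer.expires = jiffies + 10 * HZ;
            /* With the patch above, no IPI is sent if @cpu sleeps tickless */
            add_timer_on(&my_timer, cpu);
    }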
From f96a34e27df19335155394a235ea3a096bc52a71 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker
Date: Thu, 6 Feb 2014 13:36:21 -0500
Subject: nohz: ensure users are aware boot CPU is not NO_HZ_FULL

This bit of information is in the Kconfig help text:

  "Note the boot CPU will still be kept outside the range to
   handle the timekeeping duty."

However neither the variable NO_HZ_FULL_ALL nor the prompt conveys
this important detail, so let's add it to the prompt to make it more
explicitly obvious to the average user.

Acked-by: Paul E. McKenney
Signed-off-by: Paul Gortmaker
Cc: Ingo Molnar
Cc: Paul Gortmaker
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1391711781-7466-1-git-send-email-paul.gortmaker@windriver.com
Signed-off-by: Frederic Weisbecker
---
 kernel/time/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 3ce6e8c5f3fc..f448513a45ed 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -124,7 +124,7 @@ config NO_HZ_FULL
 endchoice

 config NO_HZ_FULL_ALL
-    bool "Full dynticks system on all CPUs by default"
+    bool "Full dynticks system on all CPUs by default (except CPU 0)"
     depends on NO_HZ_FULL
     help
       If the user doesn't pass the nohz_full boot option to
--
cgit

From fff421580f512fc044cc7421fdff31a7a6997350 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 14 Jan 2014 20:20:43 -0800
Subject: timers: Track total number of timers in list

Currently, the tvec_base structure's ->active_timers field tracks only
the non-deferrable timers, which means that even if ->active_timers is
zero, there might well be deferrable timers in the list. This commit
therefore adds an ->all_timers field to track all the timers, whether
deferrable or not.

Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
Acked-by: Peter Zijlstra
Reviewed-by: Oleg Nesterov
Reviewed-by: Steven Rostedt
Tested-by: Mike Galbraith
---
 kernel/timer.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/timer.c b/kernel/timer.c
index accfd241b9e5..fdc43834f3af 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -81,6 +81,7 @@ struct tvec_base {
     unsigned long timer_jiffies;
     unsigned long next_timer;
     unsigned long active_timers;
+    unsigned long all_timers;
     struct tvec_root tv1;
     struct tvec tv2;
     struct tvec tv3;
@@ -392,6 +393,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
         base->next_timer = timer->expires;
         base->active_timers++;
     }
+    base->all_timers++;
 }

 #ifdef CONFIG_TIMER_STATS
@@ -671,6 +673,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
     detach_timer(timer, true);
     if (!tbase_get_deferrable(timer->base))
         base->active_timers--;
+    base->all_timers--;
 }

 static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -685,6 +688,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
         if (timer->expires == base->next_timer)
             base->next_timer = base->timer_jiffies;
     }
+    base->all_timers--;
     return 1;
 }

@@ -1559,6 +1563,7 @@ static int init_timers_cpu(int cpu)
     base->timer_jiffies = jiffies;
     base->next_timer = base->timer_jiffies;
     base->active_timers = 0;
+    base->all_timers = 0;
     return 0;
 }
--
cgit

From d550e81dc0ddc04f1b417c179c214103a28e0ee8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 16 Dec 2013 05:57:10 -0800
Subject: timers: Reduce __run_timers() latency for empty list

The __run_timers() function currently steps through the list one jiffy
at a time in order to update the timer wheel. However, if the timer
wheel is empty, no adjustment is needed other than updating
->timer_jiffies. In this case, which is likely to be common for
NO_HZ_FULL kernels, the kernel currently incurs a large latency for no
good reason. This commit therefore short-circuits this case.

Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
Acked-by: Peter Zijlstra
Reviewed-by: Oleg Nesterov
Reviewed-by: Steven Rostedt
Tested-by: Mike Galbraith
---
 kernel/timer.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/timer.c b/kernel/timer.c
index fdc43834f3af..c8bc7091d8f3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -338,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);

+/*
+ * If the list is empty, catch up ->timer_jiffies to the current time.
+ * The caller must hold the tvec_base lock. Returns true if the list
+ * was empty and therefore ->timer_jiffies was updated.
+ */
+static bool catchup_timer_jiffies(struct tvec_base *base)
+{
+    if (!base->all_timers) {
+        base->timer_jiffies = jiffies;
+        return true;
+    }
+    return false;
+}
+
 static void
 __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
@@ -1150,6 +1164,10 @@ static inline void __run_timers(struct tvec_base *base)
     struct timer_list *timer;

     spin_lock_irq(&base->lock);
+    if (catchup_timer_jiffies(base)) {
+        spin_unlock_irq(&base->lock);
+        return;
+    }
     while (time_after_eq(jiffies, base->timer_jiffies)) {
         struct list_head work_list;
         struct list_head *head = &work_list;
--
cgit
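The effect of the short-circuit is easiest to see on a base that has
been idle for a long time. A model of the control flow (plain C
sketch, not kernel code):

    /* Model: "now" and "timer_jiffies" stand in for jiffies bookkeeping. */
    static void run_timers_model(unsigned long *timer_jiffies,
                                 unsigned long all_timers, unsigned long now)
    {
            if (!all_timers) {
                    *timer_jiffies = now;   /* catchup_timer_jiffies(): O(1) */
                    return;
            }
            /* Otherwise: one iteration per elapsed jiffy, as before */
            while (now >= *timer_jiffies) {
                    /* ... cascade and expire one jiffy worth of timers ... */
                    (*timer_jiffies)++;
            }
    }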
McKenney" Date: Mon, 16 Dec 2013 10:32:01 -0800 Subject: timers: Reduce future __run_timers() latency for newly emptied list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. Therefore, if we just emptied the timer wheel, for example, by deleting the last timer, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. This commit therefore catches ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/timer.c b/kernel/timer.c index c8bc7091d8f3..dfac34f7186f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -688,6 +688,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) if (!tbase_get_deferrable(timer->base)) base->active_timers--; base->all_timers--; + (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -703,6 +704,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, base->next_timer = base->timer_jiffies; } base->all_timers--; + (void)catchup_timer_jiffies(base); return 1; } -- cgit From 18d8cb64c9c074cbe2bd677ab10fff8283abdb62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:41:50 -0800 Subject: timers: Reduce future __run_timers() latency for first add to empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. Therefore, just before we add a timer to an empty timer wheel, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. This commit therefore updates ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/timer.c b/kernel/timer.c index dfac34f7186f..0c638cf3d9d2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -398,6 +398,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { + (void)catchup_timer_jiffies(base); __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer -- cgit From aea369b959bef10d235cd0714789cd8b0fe170b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Jan 2014 16:19:27 -0800 Subject: timers: Make internal_add_timer() update ->next_timer if ->active_timers == 0 The internal_add_timer() function updates base->next_timer only if timer->expires < base->next_timer. This is correct, but it also makes sense to do the same if we add the first non-deferrable timer. Signed-off-by: Oleg Nesterov Reviewed-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Tested-by: Mike Galbraith --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index 0c638cf3d9d2..c0d8898fed98 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,9 +404,9 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Update base->active_timers and base->next_timer */ if (!tbase_get_deferrable(timer->base)) { - if (time_before(timer->expires, base->next_timer)) + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; - base->active_timers++; } base->all_timers++; } -- cgit
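The point of the new condition: when ->active_timers is zero,
->next_timer holds a stale value, so the first non-deferrable timer
must set it unconditionally; only later timers need the comparison. A
standalone model of the rule (sketch, not kernel code; the kernel uses
time_before() instead of a plain comparison):

    /* Model of the update rule now used in internal_add_timer() */
    struct base_model {
            unsigned long next_timer;
            unsigned long active_timers;
    };

    static void add_nondeferrable(struct base_model *b, unsigned long expires)
    {
            /*
             * First timer: take @expires unconditionally, the old value
             * is stale. Later timers: keep the earlier expiry of the two.
             */
            if (!b->active_timers++ || expires < b->next_timer)
                    b->next_timer = expires;
    }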