From c1e16aa2792a129d8920e22210ef21eb62a0f80a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 28 Feb 2007 20:12:19 -0800 Subject: [PATCH] Fix posix-cpu-timer breakage caused by stale p->last_ran value Problem description at: http://bugzilla.kernel.org/show_bug.cgi?id=8048 Commit b18ec80396834497933d77b81ec0918519f4e2a7 [PATCH] sched: improve migration accuracy optimized the scheduler time calculations, but broke posix-cpu-timers. The problem is that the p->last_ran value is not updated after a context switch. So a subsequent call to current_sched_time() calculates with a stale p->last_ran value, i.e. accounts the full time, which the task was scheduled away. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0dc757246d89..0c5ebf59b8bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3566,7 +3566,7 @@ switch_tasks: sched_info_switch(prev, next); if (likely(prev != next)) { - next->timestamp = now; + next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; -- cgit From 7355690ead6d61f6344072ae61060f985060da29 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Feb 2007 20:13:42 -0800 Subject: [PATCH] sched: fix SMT scheduler bug The SMT scheduler incorrectly skips kernel threads even if they are runnable (but they are preempted by a higher-prio user-space task which got SMT-delayed by an even higher-priority task running on a sibling CPU). Fix this for now by only doing the SMT-nice optimization if the to-be-delayed task is the only runnable task. (This should cover most of the real-life cases anyway.) This bug has been in the SMT scheduler since 2.6.17 or so, but has only been noticed now by the active check in the dynticks code. Signed-off-by: Ingo Molnar Cc: Michal Piotrowski Cc: Nick Piggin Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0c5ebf59b8bc..5f102e6c7a4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3547,7 +3547,7 @@ need_resched_nonpreemptible: } } next->sleep_type = SLEEP_NORMAL; - if (dependent_sleeper(cpu, rq, next)) + if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next)) next = rq->idle; switch_tasks: if (next == rq->idle) -- cgit From 69f7c0a1be84b10a81b6edcce2dbee0cdec26eba Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 5 Mar 2007 00:30:29 -0800 Subject: [PATCH] sched: remove SMT nice Remove the SMT-nice feature which idles sibling cpus on SMT cpus to facilitiate nice working properly where cpu power is shared. The idling of cpus in the presence of runnable tasks is considered too fragile, easy to break with outside code, and the complexity of managing this system if an architecture comes along with many logical cores sharing cpu power will be unworkable. Remove the associated per_cpu_gain variable in sched_domains used only by this code. Also: The reason is that with dynticks enabled, this code breaks without yet further tweaks so dynticks brought on the rapid demise of this code. So either we tweak this code or kill it off entirely. It was Ingo's preference to kill it off. Either way this needs to happen for 2.6.21 since dynticks has gone in. Signed-off-by: Con Kolivas Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/topology.h | 1 - include/asm-ia64/topology.h | 2 - include/asm-mips/mach-ip27/topology.h | 1 - include/asm-powerpc/topology.h | 1 - include/asm-x86_64/topology.h | 1 - include/linux/sched.h | 1 - include/linux/topology.h | 4 - kernel/sched.c | 155 +--------------------------------- 8 files changed, 1 insertion(+), 165 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index ac58580ad664..7fc512d90ea8 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -85,7 +85,6 @@ static inline int node_to_first_cpu(int node) .idle_idx = 1, \ .newidle_idx = 2, \ .wake_idx = 1, \ - .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 22ed6749557e..233f1caae048 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -65,7 +65,6 @@ void build_cpu_to_node_map(void); .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .per_cpu_gain = 100, \ .cache_nice_tries = 2, \ .busy_idx = 2, \ .idle_idx = 1, \ @@ -97,7 +96,6 @@ void build_cpu_to_node_map(void); .newidle_idx = 0, /* unused */ \ .wake_idx = 1, \ .forkexec_idx = 1, \ - .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ diff --git a/include/asm-mips/mach-ip27/topology.h b/include/asm-mips/mach-ip27/topology.h index 44790fdc5d00..61d9be3f3175 100644 --- a/include/asm-mips/mach-ip27/topology.h +++ b/include/asm-mips/mach-ip27/topology.h @@ -28,7 +28,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .busy_factor = 32, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h index 6610495f5f16..0ad21a849b5f 100644 --- a/include/asm-powerpc/topology.h +++ b/include/asm-powerpc/topology.h @@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) .busy_factor = 32, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .busy_idx = 3, \ .idle_idx = 1, \ .newidle_idx = 2, \ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 2facec5914d2..4fd6fb23953e 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -43,7 +43,6 @@ extern int __node_distance(int, int); .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ - .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f7c9a4d80e5..49fe2997a016 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -684,7 +684,6 @@ struct sched_domain { unsigned int imbalance_pct; /* No balance until over watermark */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ unsigned int busy_idx; unsigned int idle_idx; unsigned int newidle_idx; diff --git a/include/linux/topology.h b/include/linux/topology.h index 6c5a6e6e813b..a9d1f049cc15 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -96,7 +96,6 @@ .busy_factor = 64, \ .imbalance_pct = 110, \ .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ .busy_idx = 0, \ .idle_idx = 0, \ .newidle_idx = 1, \ @@ -128,7 +127,6 @@ .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ @@ -159,7 +157,6 @@ .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ @@ -193,7 +190,6 @@ .newidle_idx = 0, /* unused */ \ .wake_idx = 0, /* unused */ \ .forkexec_idx = 0, /* unused */ \ - .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ diff --git a/kernel/sched.c b/kernel/sched.c index 5f102e6c7a4c..a4ca632c477c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3006,23 +3006,6 @@ static inline void idle_balance(int cpu, struct rq *rq) } #endif -static inline void wake_priority_sleeper(struct rq *rq) -{ -#ifdef CONFIG_SCHED_SMT - if (!rq->nr_running) - return; - - spin_lock(&rq->lock); - /* - * If an SMT sibling task has been put to sleep for priority - * reasons reschedule the idle task to see if it can now run. - */ - if (rq->nr_running) - resched_task(rq->idle); - spin_unlock(&rq->lock); -#endif -} - DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); @@ -3239,10 +3222,7 @@ void scheduler_tick(void) update_cpu_clock(p, rq, now); - if (p == rq->idle) - /* Task on the idle queue */ - wake_priority_sleeper(rq); - else + if (p != rq->idle) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); @@ -3251,136 +3231,6 @@ void scheduler_tick(void) #endif } -#ifdef CONFIG_SCHED_SMT -static inline void wakeup_busy_runqueue(struct rq *rq) -{ - /* If an SMT runqueue is sleeping due to priority reasons wake it up */ - if (rq->curr == rq->idle && rq->nr_running) - resched_task(rq->idle); -} - -/* - * Called with interrupt disabled and this_rq's runqueue locked. - */ -static void wake_sleeping_dependent(int this_cpu) -{ - struct sched_domain *tmp, *sd = NULL; - int i; - - for_each_domain(this_cpu, tmp) { - if (tmp->flags & SD_SHARE_CPUPOWER) { - sd = tmp; - break; - } - } - - if (!sd) - return; - - for_each_cpu_mask(i, sd->span) { - struct rq *smt_rq = cpu_rq(i); - - if (i == this_cpu) - continue; - if (unlikely(!spin_trylock(&smt_rq->lock))) - continue; - - wakeup_busy_runqueue(smt_rq); - spin_unlock(&smt_rq->lock); - } -} - -/* - * number of 'lost' timeslices this task wont be able to fully - * utilize, if another task runs on a sibling. This models the - * slowdown effect of other tasks running on siblings: - */ -static inline unsigned long -smt_slice(struct task_struct *p, struct sched_domain *sd) -{ - return p->time_slice * (100 - sd->per_cpu_gain) / 100; -} - -/* - * To minimise lock contention and not have to drop this_rq's runlock we only - * trylock the sibling runqueues and bypass those runqueues if we fail to - * acquire their lock. As we only trylock the normal locking order does not - * need to be obeyed. - */ -static int -dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) -{ - struct sched_domain *tmp, *sd = NULL; - int ret = 0, i; - - /* kernel/rt threads do not participate in dependent sleeping */ - if (!p->mm || rt_task(p)) - return 0; - - for_each_domain(this_cpu, tmp) { - if (tmp->flags & SD_SHARE_CPUPOWER) { - sd = tmp; - break; - } - } - - if (!sd) - return 0; - - for_each_cpu_mask(i, sd->span) { - struct task_struct *smt_curr; - struct rq *smt_rq; - - if (i == this_cpu) - continue; - - smt_rq = cpu_rq(i); - if (unlikely(!spin_trylock(&smt_rq->lock))) - continue; - - smt_curr = smt_rq->curr; - - if (!smt_curr->mm) - goto unlock; - - /* - * If a user task with lower static priority than the - * running task on the SMT sibling is trying to schedule, - * delay it till there is proportionately less timeslice - * left of the sibling task to prevent a lower priority - * task from using an unfair proportion of the - * physical cpu's resources. -ck - */ - if (rt_task(smt_curr)) { - /* - * With real time tasks we run non-rt tasks only - * per_cpu_gain% of the time. - */ - if ((jiffies % DEF_TIMESLICE) > - (sd->per_cpu_gain * DEF_TIMESLICE / 100)) - ret = 1; - } else { - if (smt_curr->static_prio < p->static_prio && - !TASK_PREEMPTS_CURR(p, smt_rq) && - smt_slice(smt_curr, sd) > task_timeslice(p)) - ret = 1; - } -unlock: - spin_unlock(&smt_rq->lock); - } - return ret; -} -#else -static inline void wake_sleeping_dependent(int this_cpu) -{ -} -static inline int -dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) -{ - return 0; -} -#endif - #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) void fastcall add_preempt_count(int val) @@ -3507,7 +3357,6 @@ need_resched_nonpreemptible: if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu); goto switch_tasks; } } @@ -3547,8 +3396,6 @@ need_resched_nonpreemptible: } } next->sleep_type = SLEEP_NORMAL; - if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next)) - next = rq->idle; switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); -- cgit From 35f6f753b79705bc4b62da5dcc218d75ffa88370 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Apr 2007 21:18:06 +0200 Subject: [PATCH] sched: get rid of p->children use in show_task() the p->parent PID printout gives us all the information about the task tree that we need - the eldest_child()/older_sibling()/ younger_sibling() printouts are mostly historic and i do not remember ever having used those fields. (IMO in fact they confuse the SysRq-T output.) So remove them. This code has sentimental value though, those fields and printouts are one of the oldest ones still surviving from Linux v0.95's kernel/sched.c: if (p->p_ysptr || p->p_osptr) printk(" Younger sib=%d, older sib=%d\n\r", p->p_ysptr ? p->p_ysptr->pid : -1, p->p_osptr ? p->p_osptr->pid : -1); else printk("\n\r"); written 15 years ago, in early 1992. Signed-off-by: Ingo Molnar Signed-off-by: Linus 'snif' Torvalds --- kernel/sched.c | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a4ca632c477c..cdad3b04242a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4687,27 +4687,6 @@ out_unlock: return retval; } -static inline struct task_struct *eldest_child(struct task_struct *p) -{ - if (list_empty(&p->children)) - return NULL; - return list_entry(p->children.next,struct task_struct,sibling); -} - -static inline struct task_struct *older_sibling(struct task_struct *p) -{ - if (p->sibling.prev==&p->parent->children) - return NULL; - return list_entry(p->sibling.prev,struct task_struct,sibling); -} - -static inline struct task_struct *younger_sibling(struct task_struct *p) -{ - if (p->sibling.next==&p->parent->children) - return NULL; - return list_entry(p->sibling.next,struct task_struct,sibling); -} - static const char stat_nam[] = "RSDTtZX"; static void show_task(struct task_struct *p) @@ -4738,19 +4717,7 @@ static void show_task(struct task_struct *p) free = (unsigned long)n - (unsigned long)end_of_stack(p); } #endif - printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); - if ((relative = eldest_child(p))) - printk("%5d ", relative->pid); - else - printk(" "); - if ((relative = younger_sibling(p))) - printk("%7d", relative->pid); - else - printk(" "); - if ((relative = older_sibling(p))) - printk(" %5d", relative->pid); - else - printk(" "); + printk("%5lu %5d %6d", free, p->pid, p->parent->pid); if (!p->mm) printk(" (L-TLB)\n"); else -- cgit From d354d2f4a6fc1b722c2e464a8b3cfd2f6afb304b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 7 Apr 2007 10:18:33 -0700 Subject: sched.c: Remove unused variable 'relative' Getting rid of the p->children printout in show_task() left behind an unused variable. Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cdad3b04242a..b9a683730148 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4691,7 +4691,6 @@ static const char stat_nam[] = "RSDTtZX"; static void show_task(struct task_struct *p) { - struct task_struct *relative; unsigned long free = 0; unsigned state; -- cgit From 39bc89fd4019b164002adaacef92c4140e37955a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Apr 2007 20:50:03 -0700 Subject: make SysRq-T show all tasks again show_state() (SysRq-T) developed the buggy habbit of not showing TASK_RUNNING tasks. This was due to the mistaken belief that state_filter == -1 would be a pass-through filter - while in reality it did not let TASK_RUNNING == 0 p->state values through. Fix this by restoring the original '!state_filter means all tasks' special-case i had in the original version. Test-built and test-booted on i686, SysRq-T now works as intended. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++-- kernel/sched.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 49fe2997a016..a1707583de49 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -196,13 +196,13 @@ extern void init_idle(struct task_struct *idle, int cpu); extern cpumask_t nohz_cpu_mask; /* - * Only dump TASK_* tasks. (-1 for all tasks) + * Only dump TASK_* tasks. (0 for all tasks) */ extern void show_state_filter(unsigned long state_filter); static inline void show_state(void) { - show_state_filter(-1); + show_state_filter(0); } extern void show_regs(struct pt_regs *); diff --git a/kernel/sched.c b/kernel/sched.c index b9a683730148..960d7c5fca39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4746,7 +4746,7 @@ void show_state_filter(unsigned long state_filter) * console might take alot of time: */ touch_nmi_watchdog(); - if (p->state & state_filter) + if (!state_filter || (p->state & state_filter)) show_task(p); } while_each_thread(g, p); -- cgit