aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h52
-rw-r--r--include/linux/sched/sysctl.h1
-rw-r--r--kernel/sched/core.c17
-rw-r--r--kernel/sched/fair.c7
-rw-r--r--kernel/sysctl.c7
5 files changed, 45 insertions, 39 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..3762fe4e3a80 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*!!(
#define set_task_state(tsk, state_value) \
do { \
(tsk)->task_state_change = _THIS_IP_; \
- smp_store_mb((tsk)->state, (state_value)); \
+ smp_store_mb((tsk)->state, (state_value)); \
} while (0)
-/*
- * set_current_state() includes a barrier so that the write of current->state
- * is correctly serialised wrt the caller's subsequent test of whether to
- * actually sleep:
- *
- * set_current_state(TASK_UNINTERRUPTIBLE);
- * if (do_i_need_to_sleep())
- * schedule();
- *
- * If the caller does not need such serialisation then use __set_current_state()
- */
#define __set_current_state(state_value) \
do { \
current->task_state_change = _THIS_IP_; \
@@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*!!(
#define set_current_state(state_value) \
do { \
current->task_state_change = _THIS_IP_; \
- smp_store_mb(current->state, (state_value)); \
+ smp_store_mb(current->state, (state_value)); \
} while (0)
#else
+/*
+ * @tsk had better be current, or you get to keep the pieces.
+ *
+ * The only reason is that computing current can be more expensive than
+ * using a pointer that's already available.
+ *
+ * Therefore, see set_current_state().
+ */
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
#define set_task_state(tsk, state_value) \
@@ -299,11 +296,34 @@ extern char ___assert_task_state[1 - 2*!!(
* is correctly serialised wrt the caller's subsequent test of whether to
* actually sleep:
*
+ * for (;;) {
* set_current_state(TASK_UNINTERRUPTIBLE);
- * if (do_i_need_to_sleep())
- * schedule();
+ * if (!need_sleep)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * If the caller does not need such serialisation (because, for instance, the
+ * condition test and condition change and wakeup are under the same lock) then
+ * use __set_current_state().
+ *
+ * The above is typically ordered against the wakeup, which does:
+ *
+ * need_sleep = false;
+ * wake_up_state(p, TASK_UNINTERRUPTIBLE);
+ *
+ * Where wake_up_state() (and all other wakeup primitives) imply enough
+ * barriers to order the store of the variable against wakeup.
+ *
+ * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
+ * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
+ * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
+ *
+ * This is obviously fine, since they both store the exact same value.
*
- * If the caller does not need such serialisation then use __set_current_state()
+ * Also see the comments of try_to_wake_up().
*/
#define __set_current_state(state_value) \
do { current->state = (state_value); } while (0)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 22db1e63707e..441145351301 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -36,7 +36,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_sched_shares_window;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd689fe02..f3cfa0dd5b34 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
- *
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * Atomic against schedule() which would dequeue a task, also see
+ * set_current_state().
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ * %false otherwise.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c242944f5cbd..3cf446c53043 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -93,13 +93,6 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
- */
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
-
#ifdef CONFIG_CFS_BANDWIDTH
/*
* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 706309f9ed84..739fb17371af 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "sched_shares_window_ns",
- .data = &sysctl_sched_shares_window,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",