diff options
Diffstat (limited to 'kernel/sched/core.c')
| -rw-r--r-- | kernel/sched/core.c | 362 | 
1 files changed, 248 insertions, 114 deletions
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44817c640e99..154fd689fe02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu)  	 * If needed we can still optimize that later with an  	 * empty IRQ.  	 */ +	if (cpu_is_offline(cpu)) +		return true;  /* Don't try to wake offline CPUs. */  	if (tick_nohz_full_cpu(cpu)) {  		if (cpu != smp_processor_id() ||  		    tick_nohz_tick_stopped()) @@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu)  	return false;  } +/* + * Wake up the specified CPU.  If the CPU is going offline, it is the + * caller's responsibility to deal with the lost wakeup, for example, + * by hooking into the CPU_DEAD notifier like timers and hrtimers do. + */  void wake_up_nohz_cpu(int cpu)  {  	if (!wake_up_full_nohz_cpu(cpu)) @@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data)  	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because  	 * we're holding p->pi_lock.  	 */ -	if (task_rq(p) == rq && task_on_rq_queued(p)) -		rq = __migrate_task(rq, p, arg->dest_cpu); +	if (task_rq(p) == rq) { +		if (task_on_rq_queued(p)) +			rq = __migrate_task(rq, p, arg->dest_cpu); +		else +			p->wake_cpu = arg->dest_cpu; +	}  	raw_spin_unlock(&rq->lock);  	raw_spin_unlock(&p->pi_lock); @@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  	p->sched_class->set_cpus_allowed(p, new_mask); -	if (running) -		p->sched_class->set_curr_task(rq);  	if (queued)  		enqueue_task(rq, p, ENQUEUE_RESTORE); +	if (running) +		set_curr_task(rq, p);  }  /* @@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)  		/*  		 * Task isn't running anymore; make it appear like we migrated  		 * it before it went to sleep. This means on wakeup we make the -		 * previous cpu our targer instead of where it really is. +		 * previous cpu our target instead of where it really is.  		 
*/  		p->wake_cpu = cpu;  	} @@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,  static void  ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  { -#ifdef CONFIG_SCHEDSTATS -	struct rq *rq = this_rq(); +	struct rq *rq; -#ifdef CONFIG_SMP -	int this_cpu = smp_processor_id(); +	if (!schedstat_enabled()) +		return; -	if (cpu == this_cpu) { -		schedstat_inc(rq, ttwu_local); -		schedstat_inc(p, se.statistics.nr_wakeups_local); +	rq = this_rq(); + +#ifdef CONFIG_SMP +	if (cpu == rq->cpu) { +		schedstat_inc(rq->ttwu_local); +		schedstat_inc(p->se.statistics.nr_wakeups_local);  	} else {  		struct sched_domain *sd; -		schedstat_inc(p, se.statistics.nr_wakeups_remote); +		schedstat_inc(p->se.statistics.nr_wakeups_remote);  		rcu_read_lock(); -		for_each_domain(this_cpu, sd) { +		for_each_domain(rq->cpu, sd) {  			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -				schedstat_inc(sd, ttwu_wake_remote); +				schedstat_inc(sd->ttwu_wake_remote);  				break;  			}  		} @@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  	}  	if (wake_flags & WF_MIGRATED) -		schedstat_inc(p, se.statistics.nr_wakeups_migrate); - +		schedstat_inc(p->se.statistics.nr_wakeups_migrate);  #endif /* CONFIG_SMP */ -	schedstat_inc(rq, ttwu_count); -	schedstat_inc(p, se.statistics.nr_wakeups); +	schedstat_inc(rq->ttwu_count); +	schedstat_inc(p->se.statistics.nr_wakeups);  	if (wake_flags & WF_SYNC) -		schedstat_inc(p, se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ +		schedstat_inc(p->se.statistics.nr_wakeups_sync);  }  static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2084,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	ttwu_queue(p, cpu, wake_flags);  stat: -	if (schedstat_enabled()) -		ttwu_stat(p, cpu, wake_flags); +	ttwu_stat(p, cpu, wake_flags);  out:  	raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2095,6 +2104,7 @@ 
out:  /**   * try_to_wake_up_local - try to wake up a local task with rq lock held   * @p: the thread to be awakened + * @cookie: context's cookie for pinning   *   * Put @p on the run-queue if it's not already there. The caller must   * ensure that this_rq() is locked, @p is bound to this_rq() and not @@ -2133,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie  		ttwu_activate(rq, p, ENQUEUE_WAKEUP);  	ttwu_do_wakeup(rq, p, 0, cookie); -	if (schedstat_enabled()) -		ttwu_stat(p, smp_processor_id(), 0); +	ttwu_stat(p, smp_processor_id(), 0);  out:  	raw_spin_unlock(&p->pi_lock);  } @@ -2772,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)  		 * task and put them back on the free list.  		 */  		kprobe_flush_task(prev); + +		/* Task is done with its stack. */ +		put_task_stack(prev); +  		put_task_struct(prev);  	} @@ -3192,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { }   */  static noinline void __schedule_bug(struct task_struct *prev)  { +	/* Save this before calling printk(), since that will clobber it */ +	unsigned long preempt_disable_ip = get_preempt_disable_ip(current); +  	if (oops_in_progress)  		return; @@ -3202,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev)  	print_modules();  	if (irqs_disabled())  		print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT -	if (in_atomic_preempt_off()) { +	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) +	    && in_atomic_preempt_off()) {  		pr_err("Preemption disabled at:"); -		print_ip_sym(current->preempt_disable_ip); +		print_ip_sym(preempt_disable_ip);  		pr_cont("\n");  	} -#endif  	if (panic_on_warn)  		panic("scheduling while atomic\n"); @@ -3234,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev)  	profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -	schedstat_inc(this_rq(), sched_count); +	schedstat_inc(this_rq()->sched_count);  }  /* @@ -3327,17 +3342,6 @@ static void 
__sched notrace __schedule(bool preempt)  	rq = cpu_rq(cpu);  	prev = rq->curr; -	/* -	 * do_exit() calls schedule() with preemption disabled as an exception; -	 * however we must fix that up, otherwise the next task will see an -	 * inconsistent (higher) preempt count. -	 * -	 * It also avoids the below schedule_debug() test from complaining -	 * about this. -	 */ -	if (unlikely(prev->state == TASK_DEAD)) -		preempt_enable_no_resched_notrace(); -  	schedule_debug(prev);  	if (sched_feat(HRTICK)) @@ -3403,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt)  	balance_callback(rq);  } -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ + +void __noreturn do_task_dead(void) +{ +	/* +	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed +	 * when the following two conditions become true. +	 *   - There is race condition of mmap_sem (It is acquired by +	 *     exit_mm()), and +	 *   - SMI occurs before setting TASK_RUNINNG. +	 *     (or hypervisor of virtual machine switches to other guest) +	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD +	 * +	 * To avoid it, we have to wait for releasing tsk->pi_lock which +	 * is held by try_to_wake_up() +	 */ +	smp_mb(); +	raw_spin_unlock_wait(&current->pi_lock); + +	/* causes final put_task_struct in finish_task_switch(). */ +	__set_current_state(TASK_DEAD); +	current->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */ +	__schedule(false); +	BUG(); +	/* Avoid "noreturn function does return".  
*/ +	for (;;) +		cpu_relax();	/* For when BUG is null */ +}  static inline void sched_submit_work(struct task_struct *tsk)  { @@ -3687,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	p->prio = prio; -	if (running) -		p->sched_class->set_curr_task(rq);  	if (queued)  		enqueue_task(rq, p, queue_flag); +	if (running) +		set_curr_task(rq, p);  	check_class_changed(rq, p, prev_class, oldprio);  out_unlock: @@ -3704,7 +3734,8 @@ out_unlock:  void set_user_nice(struct task_struct *p, long nice)  { -	int old_prio, delta, queued; +	bool queued, running; +	int old_prio, delta;  	struct rq_flags rf;  	struct rq *rq; @@ -3726,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice)  		goto out_unlock;  	}  	queued = task_on_rq_queued(p); +	running = task_current(rq, p);  	if (queued)  		dequeue_task(rq, p, DEQUEUE_SAVE); +	if (running) +		put_prev_task(rq, p);  	p->static_prio = NICE_TO_PRIO(nice);  	set_load_weight(p); @@ -3744,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice)  		if (delta < 0 || (delta > 0 && task_running(rq, p)))  			resched_curr(rq);  	} +	if (running) +		set_curr_task(rq, p);  out_unlock:  	task_rq_unlock(rq, p, &rf);  } @@ -4243,8 +4279,6 @@ change:  	prev_class = p->sched_class;  	__setscheduler(rq, p, attr, pi); -	if (running) -		p->sched_class->set_curr_task(rq);  	if (queued) {  		/*  		 * We enqueue to tail when the priority of a task is @@ -4255,6 +4289,8 @@ change:  		enqueue_task(rq, p, queue_flags);  	} +	if (running) +		set_curr_task(rq, p);  	check_class_changed(rq, p, prev_class, oldprio);  	preempt_disable(); /* avoid rq from going away on us */ @@ -4846,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield)  {  	struct rq *rq = this_rq_lock(); -	schedstat_inc(rq, yld_count); +	schedstat_inc(rq->yld_count);  	current->sched_class->yield_task(rq);  	/* @@ -4863,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } +#ifndef CONFIG_PREEMPT  int __sched _cond_resched(void)  {  	if (should_resched(0)) { @@ 
-4872,6 +4909,7 @@ int __sched _cond_resched(void)  	return 0;  }  EXPORT_SYMBOL(_cond_resched); +#endif  /*   * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -4997,7 +5035,7 @@ again:  	yielded = curr->sched_class->yield_to_task(rq, p, preempt);  	if (yielded) { -		schedstat_inc(rq, yld_count); +		schedstat_inc(rq->yld_count);  		/*  		 * Make p's CPU reschedule; pick_next_entity takes care of  		 * fairness. @@ -5154,21 +5192,14 @@ void sched_show_task(struct task_struct *p)  	int ppid;  	unsigned long state = p->state; +	if (!try_get_task_stack(p)) +		return;  	if (state)  		state = __ffs(state) + 1;  	printk(KERN_INFO "%-15.15s %c", p->comm,  		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 -	if (state == TASK_RUNNING) -		printk(KERN_CONT " running  "); -	else -		printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else  	if (state == TASK_RUNNING)  		printk(KERN_CONT "  running task    "); -	else -		printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif  #ifdef CONFIG_DEBUG_STACK_USAGE  	free = stack_not_used(p);  #endif @@ -5183,6 +5214,7 @@ void sched_show_task(struct task_struct *p)  	print_worker_info(KERN_INFO, p);  	show_stack(p, NULL); +	put_task_stack(p);  }  void show_state_filter(unsigned long state_filter) @@ -5417,10 +5449,10 @@ void sched_setnuma(struct task_struct *p, int nid)  	p->numa_preferred_nid = nid; -	if (running) -		p->sched_class->set_curr_task(rq);  	if (queued)  		enqueue_task(rq, p, ENQUEUE_RESTORE); +	if (running) +		set_curr_task(rq, p);  	task_rq_unlock(rq, p, &rf);  }  #endif /* CONFIG_NUMA_BALANCING */ @@ -5717,6 +5749,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)  	}  }  #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0  # define sched_domain_debug(sd, cpu) do { } while (0)  static inline bool sched_debug(void)  { @@ -5735,6 +5769,7 @@ static int sd_degenerate(struct sched_domain *sd)  			 SD_BALANCE_FORK |  			 
SD_BALANCE_EXEC |  			 SD_SHARE_CPUCAPACITY | +			 SD_ASYM_CPUCAPACITY |  			 SD_SHARE_PKG_RESOURCES |  			 SD_SHARE_POWERDOMAIN)) {  		if (sd->groups != sd->groups->next) @@ -5765,6 +5800,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  				SD_BALANCE_NEWIDLE |  				SD_BALANCE_FORK |  				SD_BALANCE_EXEC | +				SD_ASYM_CPUCAPACITY |  				SD_SHARE_CPUCAPACITY |  				SD_SHARE_PKG_RESOURCES |  				SD_PREFER_SIBLING | @@ -5909,10 +5945,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)  	} while (sg != first);  } -static void free_sched_domain(struct rcu_head *rcu) +static void destroy_sched_domain(struct sched_domain *sd)  { -	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); -  	/*  	 * If its an overlapping domain it has private groups, iterate and  	 * nuke them all. @@ -5923,18 +5957,26 @@ static void free_sched_domain(struct rcu_head *rcu)  		kfree(sd->groups->sgc);  		kfree(sd->groups);  	} +	if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) +		kfree(sd->shared);  	kfree(sd);  } -static void destroy_sched_domain(struct sched_domain *sd, int cpu) +static void destroy_sched_domains_rcu(struct rcu_head *rcu)  { -	call_rcu(&sd->rcu, free_sched_domain); +	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + +	while (sd) { +		struct sched_domain *parent = sd->parent; +		destroy_sched_domain(sd); +		sd = parent; +	}  } -static void destroy_sched_domains(struct sched_domain *sd, int cpu) +static void destroy_sched_domains(struct sched_domain *sd)  { -	for (; sd; sd = sd->parent) -		destroy_sched_domain(sd, cpu); +	if (sd) +		call_rcu(&sd->rcu, destroy_sched_domains_rcu);  }  /* @@ -5949,14 +5991,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)  DEFINE_PER_CPU(struct sched_domain *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_size);  DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);  DEFINE_PER_CPU(struct 
sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy);  DEFINE_PER_CPU(struct sched_domain *, sd_asym);  static void update_top_cache_domain(int cpu)  { +	struct sched_domain_shared *sds = NULL;  	struct sched_domain *sd; -	struct sched_domain *busy_sd = NULL;  	int id = cpu;  	int size = 1; @@ -5964,13 +6006,13 @@ static void update_top_cache_domain(int cpu)  	if (sd) {  		id = cpumask_first(sched_domain_span(sd));  		size = cpumask_weight(sched_domain_span(sd)); -		busy_sd = sd->parent; /* sd_busy */ +		sds = sd->shared;  	} -	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);  	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);  	per_cpu(sd_llc_size, cpu) = size;  	per_cpu(sd_llc_id, cpu) = id; +	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);  	sd = lowest_flag_domain(cpu, SD_NUMA);  	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -6006,7 +6048,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  			 */  			if (parent->flags & SD_PREFER_SIBLING)  				tmp->flags |= SD_PREFER_SIBLING; -			destroy_sched_domain(parent, cpu); +			destroy_sched_domain(parent);  		} else  			tmp = tmp->parent;  	} @@ -6014,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  	if (sd && sd_degenerate(sd)) {  		tmp = sd;  		sd = sd->parent; -		destroy_sched_domain(tmp, cpu); +		destroy_sched_domain(tmp);  		if (sd)  			sd->child = NULL;  	} @@ -6024,7 +6066,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)  	rq_attach_root(rq, rd);  	tmp = rq->sd;  	rcu_assign_pointer(rq->sd, sd); -	destroy_sched_domains(tmp, cpu); +	destroy_sched_domains(tmp);  	update_top_cache_domain(cpu);  } @@ -6267,7 +6309,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)  		return;  	update_group_capacity(sd, cpu); -	atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);  }  /* @@ -6355,6 +6396,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)  	
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);  	*per_cpu_ptr(sdd->sd, cpu) = NULL; +	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) +		*per_cpu_ptr(sdd->sds, cpu) = NULL; +  	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))  		*per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -6374,26 +6418,37 @@ static int sched_domains_curr_level;  /*   * SD_flags allowed in topology descriptions.   * - * SD_SHARE_CPUCAPACITY      - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA                - describes NUMA topologies - * SD_SHARE_POWERDOMAIN   - describes shared power domain + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: + * + *   SD_SHARE_CPUCAPACITY   - describes SMT topologies + *   SD_SHARE_PKG_RESOURCES - describes shared caches + *   SD_NUMA                - describes NUMA topologies + *   SD_SHARE_POWERDOMAIN   - describes shared power domain + *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies   * - * Odd one out: - * SD_ASYM_PACKING        - describes SMT quirks + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + *   SD_ASYM_PACKING        - describes SMT quirks   */  #define TOPOLOGY_SD_FLAGS		\  	(SD_SHARE_CPUCAPACITY |		\  	 SD_SHARE_PKG_RESOURCES |	\  	 SD_NUMA |			\  	 SD_ASYM_PACKING |		\ +	 SD_ASYM_CPUCAPACITY |		\  	 SD_SHARE_POWERDOMAIN)  static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, +	const struct cpumask *cpu_map, +	struct sched_domain *child, int cpu)  { -	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); -	int sd_weight, sd_flags = 0; +	struct sd_data *sdd = &tl->data; +	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); +	int sd_id, sd_weight, sd_flags = 0;  #ifdef CONFIG_NUMA  	/* @@ -6442,15 
+6497,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)  		.smt_gain		= 0,  		.max_newidle_lb_cost	= 0,  		.next_decay_max_lb_cost	= jiffies, +		.child			= child,  #ifdef CONFIG_SCHED_DEBUG  		.name			= tl->name,  #endif  	}; +	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); +	sd_id = cpumask_first(sched_domain_span(sd)); +  	/*  	 * Convert topological properties into behaviour.  	 */ +	if (sd->flags & SD_ASYM_CPUCAPACITY) { +		struct sched_domain *t = sd; + +		for_each_lower_domain(t) +			t->flags |= SD_BALANCE_WAKE; +	} +  	if (sd->flags & SD_SHARE_CPUCAPACITY) {  		sd->flags |= SD_PREFER_SIBLING;  		sd->imbalance_pct = 110; @@ -6482,7 +6548,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)  		sd->idle_idx = 1;  	} -	sd->private = &tl->data; +	/* +	 * For all levels sharing cache; connect a sched_domain_shared +	 * instance. +	 */ +	if (sd->flags & SD_SHARE_PKG_RESOURCES) { +		sd->shared = *per_cpu_ptr(sdd->sds, sd_id); +		atomic_inc(&sd->shared->ref); +		atomic_set(&sd->shared->nr_busy_cpus, sd_weight); +	} + +	sd->private = sdd;  	return sd;  } @@ -6509,6 +6585,9 @@ static struct sched_domain_topology_level *sched_domain_topology =  void set_sched_topology(struct sched_domain_topology_level *tl)  { +	if (WARN_ON_ONCE(sched_smp_initialized)) +		return; +  	sched_domain_topology = tl;  } @@ -6789,6 +6868,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  		if (!sdd->sd)  			return -ENOMEM; +		sdd->sds = alloc_percpu(struct sched_domain_shared *); +		if (!sdd->sds) +			return -ENOMEM; +  		sdd->sg = alloc_percpu(struct sched_group *);  		if (!sdd->sg)  			return -ENOMEM; @@ -6799,6 +6882,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  		for_each_cpu(j, cpu_map) {  			struct sched_domain *sd; +			struct sched_domain_shared *sds;  			struct sched_group *sg;  			struct sched_group_capacity *sgc; @@ -6809,6 +6893,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)  			*per_cpu_ptr(sdd->sd, j) = sd; +	
		sds = kzalloc_node(sizeof(struct sched_domain_shared), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sds) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sds, j) = sds; +  			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),  					GFP_KERNEL, cpu_to_node(j));  			if (!sg) @@ -6848,6 +6939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)  				kfree(*per_cpu_ptr(sdd->sd, j));  			} +			if (sdd->sds) +				kfree(*per_cpu_ptr(sdd->sds, j));  			if (sdd->sg)  				kfree(*per_cpu_ptr(sdd->sg, j));  			if (sdd->sgc) @@ -6855,6 +6948,8 @@ static void __sdt_free(const struct cpumask *cpu_map)  		}  		free_percpu(sdd->sd);  		sdd->sd = NULL; +		free_percpu(sdd->sds); +		sdd->sds = NULL;  		free_percpu(sdd->sg);  		sdd->sg = NULL;  		free_percpu(sdd->sgc); @@ -6866,16 +6961,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		const struct cpumask *cpu_map, struct sched_domain_attr *attr,  		struct sched_domain *child, int cpu)  { -	struct sched_domain *sd = sd_init(tl, cpu); -	if (!sd) -		return child; +	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); -	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));  	if (child) {  		sd->level = child->level + 1;  		sched_domain_level_max = max(sched_domain_level_max, sd->level);  		child->parent = sd; -		sd->child = child;  		if (!cpumask_subset(sched_domain_span(child),  				    sched_domain_span(sd))) { @@ -6906,6 +6997,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,  	enum s_alloc alloc_state;  	struct sched_domain *sd;  	struct s_data d; +	struct rq *rq = NULL;  	int i, ret = -ENOMEM;  	alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6956,11 +7048,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,  	/* Attach the domains */  	rcu_read_lock();  	for_each_cpu(i, cpu_map) { +		rq = cpu_rq(i);  		sd = *per_cpu_ptr(d.sd, i); + +		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ +		if 
(rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) +			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); +  		cpu_attach_domain(sd, d.rd, i);  	}  	rcu_read_unlock(); +	if (rq && sched_debug_enabled) { +		pr_info("span: %*pbl (max cpu_capacity = %lu)\n", +			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); +	} +  	ret = 0;  error:  	__free_domain_allocs(&d, alloc_state, cpu_map); @@ -7319,6 +7422,22 @@ int sched_cpu_dying(unsigned int cpu)  }  #endif +#ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); + +static void sched_init_smt(void) +{ +	/* +	 * We've enumerated all CPUs and will assume that if any CPU +	 * has SMT siblings, CPU0 will too. +	 */ +	if (cpumask_weight(cpu_smt_mask(0)) > 1) +		static_branch_enable(&sched_smt_present); +} +#else +static inline void sched_init_smt(void) { } +#endif +  void __init sched_init_smp(void)  {  	cpumask_var_t non_isolated_cpus; @@ -7348,6 +7467,9 @@ void __init sched_init_smp(void)  	init_sched_rt_class();  	init_sched_dl_class(); + +	sched_init_smt(); +  	sched_smp_initialized = true;  } @@ -7385,12 +7507,29 @@ static struct kmem_cache *task_group_cache __read_mostly;  #endif  DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ +	const int shift = BITS_PER_LONG == 32 ? 
5 : 6; +	unsigned long val = (unsigned long)word << shift | bit; + +	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue);  void __init sched_init(void)  {  	int i, j;  	unsigned long alloc_size = 0, ptr; +	for (i = 0; i < WAIT_TABLE_SIZE; i++) +		init_waitqueue_head(bit_wait_table + i); +  #ifdef CONFIG_FAIR_GROUP_SCHED  	alloc_size += 2 * nr_cpu_ids * sizeof(void **);  #endif @@ -7421,6 +7560,8 @@ void __init sched_init(void)  	for_each_possible_cpu(i) {  		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(  			cpumask_size(), GFP_KERNEL, cpu_to_node(i)); +		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( +			cpumask_size(), GFP_KERNEL, cpu_to_node(i));  	}  #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -7523,10 +7664,6 @@ void __init sched_init(void)  	set_load_weight(&init_task); -#ifdef CONFIG_PREEMPT_NOTIFIERS -	INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif -  	/*  	 * The boot idle thread does lazy MMU switching as well:  	 */ @@ -7534,11 +7671,6 @@ void __init sched_init(void)  	enter_lazy_tlb(&init_mm, current);  	/* -	 * During early bootup we pretend to be a normal task: -	 */ -	current->sched_class = &fair_sched_class; - -	/*  	 * Make us the idle thread. Technically, schedule() should not be  	 * called from this thread, however somewhere below it might be,  	 * but because we are the idle thread, we just pick up running again @@ -7592,6 +7724,7 @@ EXPORT_SYMBOL(__might_sleep);  void ___might_sleep(const char *file, int line, int preempt_offset)  {  	static unsigned long prev_jiffy;	/* ratelimiting */ +	unsigned long preempt_disable_ip;  	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/  	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7602,6 +7735,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)  		return;  	prev_jiffy = jiffies; +	/* Save this before calling printk(), since that will clobber it */ +	preempt_disable_ip = get_preempt_disable_ip(current); +  	printk(KERN_ERR  		"BUG: sleeping function called from invalid context at %s:%d\n",  			file, line); @@ -7616,14 +7752,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)  	debug_show_held_locks(current);  	if (irqs_disabled())  		print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT -	if (!preempt_count_equals(preempt_offset)) { +	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) +	    && !preempt_count_equals(preempt_offset)) {  		pr_err("Preemption disabled at:"); -		print_ip_sym(current->preempt_disable_ip); +		print_ip_sym(preempt_disable_ip);  		pr_cont("\n");  	} -#endif  	dump_stack(); +	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);  }  EXPORT_SYMBOL(___might_sleep);  #endif @@ -7644,12 +7780,10 @@ void normalize_rt_tasks(void)  		if (p->flags & PF_KTHREAD)  			continue; -		p->se.exec_start		= 0; -#ifdef CONFIG_SCHEDSTATS -		p->se.statistics.wait_start	= 0; -		p->se.statistics.sleep_start	= 0; -		p->se.statistics.block_start	= 0; -#endif +		p->se.exec_start = 0; +		schedstat_set(p->se.statistics.wait_start,  0); +		schedstat_set(p->se.statistics.sleep_start, 0); +		schedstat_set(p->se.statistics.block_start, 0);  		if (!dl_task(p) && !rt_task(p)) {  			/* @@ -7710,7 +7844,7 @@ struct task_struct *curr_task(int cpu)   *   * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!   
*/ -void set_curr_task(int cpu, struct task_struct *p) +void ia64_set_curr_task(int cpu, struct task_struct *p)  {  	cpu_curr(cpu) = p;  } @@ -7841,10 +7975,10 @@ void sched_move_task(struct task_struct *tsk)  	sched_change_group(tsk, TASK_MOVE_GROUP); -	if (unlikely(running)) -		tsk->sched_class->set_curr_task(rq);  	if (queued)  		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); +	if (unlikely(running)) +		set_curr_task(rq, tsk);  	task_rq_unlock(rq, tsk, &rf);  } |