Diffstat (limited to 'kernel')
-rw-r--r--  kernel/context_tracking.c      |  41
-rw-r--r--  kernel/cpu/idle.c              |  17
-rw-r--r--  kernel/events/core.c           | 233
-rw-r--r--  kernel/events/hw_breakpoint.c  |   6
-rw-r--r--  kernel/events/internal.h       |   4
-rw-r--r--  kernel/kprobes.c               |  30
-rw-r--r--  kernel/power/Kconfig           |   1
-rw-r--r--  kernel/ptrace.c                |  20
-rw-r--r--  kernel/range.c                 |  21
-rw-r--r--  kernel/sched/core.c            |  23
-rw-r--r--  kernel/sched/cputime.c         |   6
-rw-r--r--  kernel/time/tick-broadcast.c   |  11
-rw-r--r--  kernel/time/tick-sched.c       |   2
-rw-r--r--  kernel/wait.c                  |  88
14 files changed, 382 insertions, 121 deletions
| diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..383f8231e436 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@   */  #include <linux/context_tracking.h> -#include <linux/kvm_host.h>  #include <linux/rcupdate.h>  #include <linux/sched.h>  #include <linux/hardirq.h> @@ -71,6 +70,46 @@ void user_enter(void)  	local_irq_restore(flags);  } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ +	struct thread_info *ti = current_thread_info(); +	enum ctx_state prev_ctx; + +	if (likely(ti->preempt_count || irqs_disabled())) +		return; + +	/* +	 * Need to disable preemption in case user_exit() is traced +	 * and the tracer calls preempt_enable_notrace() causing +	 * an infinite recursion. +	 */ +	preempt_disable_notrace(); +	prev_ctx = exception_enter(); +	preempt_enable_no_resched_notrace(); + +	preempt_schedule(); + +	preempt_disable_notrace(); +	exception_exit(prev_ctx); +	preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */  /**   * user_exit - Inform the context tracking that the CPU is diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038e..e695c0a0bcb5 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@  #include <linux/cpu.h>  #include <linux/tick.h>  #include <linux/mm.h> +#include <linux/stackprotector.h>  #include <asm/tlb.h> @@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }  void __weak arch_cpu_idle(void)  {  	cpu_idle_force_poll = 1; +	local_irq_enable();  }  /* @@ -112,6 +114,21 @@ static void cpu_idle_loop(void)  void cpu_startup_entry(enum cpuhp_state state)  { +	/* +	 * This #ifdef needs to die, but it's too late in the cycle to +	 * make this generic (arm and sh have never invoked the canary +	 * init for the non boot cpus!). Will be fixed in 3.11 +	 */ +#ifdef CONFIG_X86 +	/* +	 * If we're the non-boot CPU, nothing set the stack canary up +	 * for us. The boot CPU already has it initialized but no harm +	 * in doing it again. This is a good place for updating it, as +	 * we wont ever return from this function (so the invalid +	 * canaries already on the stack wont ever trigger). 
+	 */ +	boot_init_stack_canary(); +#endif  	current_set_polling();  	arch_cpu_idle_prepare();  	cpu_idle_loop(); diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..b391907d5352 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,  static void update_context_time(struct perf_event_context *ctx);  static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, -			       struct ring_buffer *rb); -  void __weak perf_event_print_debug(void)	{ }  extern __weak const char *perf_pmu_name(void) @@ -2918,6 +2915,7 @@ static void free_event_rcu(struct rcu_head *head)  }  static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);  static void free_event(struct perf_event *event)  { @@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)  		if (has_branch_stack(event)) {  			static_key_slow_dec_deferred(&perf_sched_events);  			/* is system-wide event */ -			if (!(event->attach_state & PERF_ATTACH_TASK)) +			if (!(event->attach_state & PERF_ATTACH_TASK)) {  				atomic_dec(&per_cpu(perf_branch_stack_events,  						    event->cpu)); +			}  		}  	}  	if (event->rb) { -		ring_buffer_put(event->rb); -		event->rb = NULL; +		struct ring_buffer *rb; + +		/* +		 * Can happen when we close an event with re-directed output. +		 * +		 * Since we have a 0 refcount, perf_mmap_close() will skip +		 * over us; possibly making our ring_buffer_put() the last. +		 */ +		mutex_lock(&event->mmap_mutex); +		rb = event->rb; +		if (rb) { +			rcu_assign_pointer(event->rb, NULL); +			ring_buffer_detach(event, rb); +			ring_buffer_put(rb); /* could be last */ +		} +		mutex_unlock(&event->mmap_mutex);  	}  	if (is_cgroup_event(event)) @@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)  	unsigned int events = POLL_HUP;  	/* -	 * Race between perf_event_set_output() and perf_poll(): perf_poll() -	 * grabs the rb reference but perf_event_set_output() overrides it. -	 * Here is the timeline for two threads T1, T2: -	 * t0: T1, rb = rcu_dereference(event->rb) -	 * t1: T2, old_rb = event->rb -	 * t2: T2, event->rb = new rb -	 * t3: T2, ring_buffer_detach(old_rb) -	 * t4: T1, ring_buffer_attach(rb1) -	 * t5: T1, poll_wait(event->waitq) -	 * -	 * To avoid this problem, we grab mmap_mutex in perf_poll() -	 * thereby ensuring that the assignment of the new ring buffer -	 * and the detachment of the old buffer appear atomic to perf_poll() +	 * Pin the event->rb by taking event->mmap_mutex; otherwise +	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.  	 
*/  	mutex_lock(&event->mmap_mutex); - -	rcu_read_lock(); -	rb = rcu_dereference(event->rb); -	if (rb) { -		ring_buffer_attach(event, rb); +	rb = event->rb; +	if (rb)  		events = atomic_xchg(&rb->poll, 0); -	} -	rcu_read_unlock(); -  	mutex_unlock(&event->mmap_mutex);  	poll_wait(file, &event->waitq, wait); @@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,  		return;  	spin_lock_irqsave(&rb->event_lock, flags); -	if (!list_empty(&event->rb_entry)) -		goto unlock; - -	list_add(&event->rb_entry, &rb->event_list); -unlock: +	if (list_empty(&event->rb_entry)) +		list_add(&event->rb_entry, &rb->event_list);  	spin_unlock_irqrestore(&rb->event_lock, flags);  } -static void ring_buffer_detach(struct perf_event *event, -			       struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)  {  	unsigned long flags; @@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)  	rcu_read_lock();  	rb = rcu_dereference(event->rb); -	if (!rb) -		goto unlock; - -	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) -		wake_up_all(&event->waitq); - -unlock: +	if (rb) { +		list_for_each_entry_rcu(event, &rb->event_list, rb_entry) +			wake_up_all(&event->waitq); +	}  	rcu_read_unlock();  } @@ -3584,18 +3573,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)  static void ring_buffer_put(struct ring_buffer *rb)  { -	struct perf_event *event, *n; -	unsigned long flags; -  	if (!atomic_dec_and_test(&rb->refcount))  		return; -	spin_lock_irqsave(&rb->event_lock, flags); -	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { -		list_del_init(&event->rb_entry); -		wake_up_all(&event->waitq); -	} -	spin_unlock_irqrestore(&rb->event_lock, flags); +	WARN_ON_ONCE(!list_empty(&rb->event_list));  	call_rcu(&rb->rcu_head, rb_free_rcu);  } @@ -3605,26 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)  	struct perf_event *event = vma->vm_file->private_data;  	atomic_inc(&event->mmap_count); +	atomic_inc(&event->rb->mmap_count);  } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */  static void perf_mmap_close(struct vm_area_struct *vma)  {  	struct perf_event *event = vma->vm_file->private_data; -	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { -		unsigned long size = perf_data_size(event->rb); -		struct user_struct *user = event->mmap_user; -		struct ring_buffer *rb = event->rb; +	struct ring_buffer *rb = event->rb; +	struct user_struct *mmap_user = rb->mmap_user; +	int mmap_locked = rb->mmap_locked; +	unsigned long size = perf_data_size(rb); + +	atomic_dec(&rb->mmap_count); + +	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) +		return; -		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); -		vma->vm_mm->pinned_vm -= event->mmap_locked; -		rcu_assign_pointer(event->rb, NULL); -		ring_buffer_detach(event, rb); +	/* Detach current event from the buffer. */ +	rcu_assign_pointer(event->rb, NULL); +	ring_buffer_detach(event, rb); +	mutex_unlock(&event->mmap_mutex); + +	/* If there's still other mmap()s of this buffer, we're done. 
*/ +	if (atomic_read(&rb->mmap_count)) { +		ring_buffer_put(rb); /* can't be last */ +		return; +	} + +	/* +	 * No other mmap()s, detach from all other events that might redirect +	 * into the now unreachable buffer. Somewhat complicated by the +	 * fact that rb::event_lock otherwise nests inside mmap_mutex. +	 */ +again: +	rcu_read_lock(); +	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { +		if (!atomic_long_inc_not_zero(&event->refcount)) { +			/* +			 * This event is en-route to free_event() which will +			 * detach it and remove it from the list. +			 */ +			continue; +		} +		rcu_read_unlock(); + +		mutex_lock(&event->mmap_mutex); +		/* +		 * Check we didn't race with perf_event_set_output() which can +		 * swizzle the rb from under us while we were waiting to +		 * acquire mmap_mutex. +		 * +		 * If we find a different rb; ignore this event, a next +		 * iteration will no longer find it on the list. We have to +		 * still restart the iteration to make sure we're not now +		 * iterating the wrong list. +		 */ +		if (event->rb == rb) { +			rcu_assign_pointer(event->rb, NULL); +			ring_buffer_detach(event, rb); +			ring_buffer_put(rb); /* can't be last, we still have one */ +		}  		mutex_unlock(&event->mmap_mutex); +		put_event(event); -		ring_buffer_put(rb); -		free_uid(user); +		/* +		 * Restart the iteration; either we're on the wrong list or +		 * destroyed its integrity by doing a deletion. +		 */ +		goto again;  	} +	rcu_read_unlock(); + +	/* +	 * It could be there's still a few 0-ref events on the list; they'll +	 * get cleaned up by free_event() -- they'll also still have their +	 * ref on the rb and will free it whenever they are done with it. +	 * +	 * Aside from that, this buffer is 'fully' detached and unmapped, +	 * undo the VM accounting. +	 */ + +	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); +	vma->vm_mm->pinned_vm -= mmap_locked; +	free_uid(mmap_user); + +	ring_buffer_put(rb); /* could be last */  }  static const struct vm_operations_struct perf_mmap_vmops = { @@ -3674,12 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)  		return -EINVAL;  	WARN_ON_ONCE(event->ctx->parent_ctx); +again:  	mutex_lock(&event->mmap_mutex);  	if (event->rb) { -		if (event->rb->nr_pages == nr_pages) -			atomic_inc(&event->rb->refcount); -		else +		if (event->rb->nr_pages != nr_pages) {  			ret = -EINVAL; +			goto unlock; +		} + +		if (!atomic_inc_not_zero(&event->rb->mmap_count)) { +			/* +			 * Raced against perf_mmap_close() through +			 * perf_event_set_output(). Try again, hope for better +			 * luck. +			 */ +			mutex_unlock(&event->mmap_mutex); +			goto again; +		} +  		goto unlock;  	} @@ -3720,12 +3787,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)  		ret = -ENOMEM;  		goto unlock;  	} -	rcu_assign_pointer(event->rb, rb); + +	atomic_set(&rb->mmap_count, 1); +	rb->mmap_locked = extra; +	rb->mmap_user = get_current_user();  	atomic_long_add(user_extra, &user->locked_vm); -	event->mmap_locked = extra; -	event->mmap_user = get_current_user(); -	vma->vm_mm->pinned_vm += event->mmap_locked; +	vma->vm_mm->pinned_vm += extra; + +	ring_buffer_attach(event, rb); +	rcu_assign_pointer(event->rb, rb);  	perf_event_update_userpage(event); @@ -3734,7 +3805,11 @@ unlock:  		atomic_inc(&event->mmap_count);  	mutex_unlock(&event->mmap_mutex); -	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; +	/* +	 * Since pinned accounting is per vm we cannot allow fork() to copy our +	 * vma. 
+	 */ +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;  	vma->vm_ops = &perf_mmap_vmops;  	return ret; @@ -6412,6 +6487,8 @@ set:  	if (atomic_read(&event->mmap_count))  		goto unlock; +	old_rb = event->rb; +  	if (output_event) {  		/* get the rb we want to redirect to */  		rb = ring_buffer_get(output_event); @@ -6419,16 +6496,28 @@ set:  			goto unlock;  	} -	old_rb = event->rb; -	rcu_assign_pointer(event->rb, rb);  	if (old_rb)  		ring_buffer_detach(event, old_rb); + +	if (rb) +		ring_buffer_attach(event, rb); + +	rcu_assign_pointer(event->rb, rb); + +	if (old_rb) { +		ring_buffer_put(old_rb); +		/* +		 * Since we detached before setting the new rb, so that we +		 * could attach the new rb, we could have missed a wakeup. +		 * Provide it now. +		 */ +		wake_up_all(&event->waitq); +	} +  	ret = 0;  unlock:  	mutex_unlock(&event->mmap_mutex); -	if (old_rb) -		ring_buffer_put(old_rb);  out:  	return ret;  } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1f..20185ea64aa6 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)  	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {  		if (iter->hw.bp_target == tsk &&  		    find_slot_idx(iter) == type && -		    cpu == iter->cpu) +		    (iter->cpu < 0 || cpu == iter->cpu))  			count += hw_breakpoint_weight(iter);  	} @@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,  		return;  	} -	for_each_online_cpu(cpu) { +	for_each_possible_cpu(cpu) {  		unsigned int nr;  		nr = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,  	if (cpu >= 0) {  		toggle_bp_task_slot(bp, cpu, enable, type, weight);  	} else { -		for_each_online_cpu(cpu) +		for_each_possible_cpu(cpu)  			toggle_bp_task_slot(bp, cpu, enable, type, weight);  	} diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,10 @@ struct ring_buffer {  	spinlock_t			event_lock;  	struct list_head		event_list; +	atomic_t			mmap_count; +	unsigned long			mmap_locked; +	struct user_struct		*mmap_user; +  	struct perf_event_mmap_page	*user_page;  	void				*data_pages[0];  }; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcdf..bddf3b201a48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)  /* Optimization staging list, protected by kprobe_mutex */  static LIST_HEAD(optimizing_list);  static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list);  static void kprobe_optimizer(struct work_struct *work);  static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)   * Unoptimize (replace a jump with a breakpoint and remove the breakpoint   * if need) kprobes listed on unoptimizing_list.   
*/ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void)  {  	struct optimized_kprobe *op, *tmp; @@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)  	/* Ditto to do_optimize_kprobes */  	get_online_cpus();  	mutex_lock(&text_mutex); -	arch_unoptimize_kprobes(&unoptimizing_list, free_list); +	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);  	/* Loop free_list for disarming */ -	list_for_each_entry_safe(op, tmp, free_list, list) { +	list_for_each_entry_safe(op, tmp, &freeing_list, list) {  		/* Disarm probes if marked disabled */  		if (kprobe_disabled(&op->kp))  			arch_disarm_kprobe(&op->kp); @@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)  }  /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void)  {  	struct optimized_kprobe *op, *tmp; -	list_for_each_entry_safe(op, tmp, free_list, list) { +	list_for_each_entry_safe(op, tmp, &freeing_list, list) {  		BUG_ON(!kprobe_unused(&op->kp));  		list_del_init(&op->list);  		free_aggr_kprobe(&op->kp); @@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)  /* Kprobe jump optimizer */  static __kprobes void kprobe_optimizer(struct work_struct *work)  { -	LIST_HEAD(free_list); -  	mutex_lock(&kprobe_mutex);  	/* Lock modules while optimizing kprobes */  	mutex_lock(&module_mutex); @@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)  	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)  	 * kprobes before waiting for quiesence period.  	 */ -	do_unoptimize_kprobes(&free_list); +	do_unoptimize_kprobes();  	/*  	 * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)  	do_optimize_kprobes();  	/* Step 4: Free cleaned kprobes after quiesence period */ -	do_free_cleaned_kprobes(&free_list); +	do_free_cleaned_kprobes();  	mutex_unlock(&module_mutex);  	mutex_unlock(&kprobe_mutex); @@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)  	if (!list_empty(&op->list))  		/* Dequeue from the (un)optimization queue */  		list_del_init(&op->list); -  	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + +	if (kprobe_unused(p)) { +		/* Enqueue if it is unused */ +		list_add(&op->list, &freeing_list); +		/* +		 * Remove unused probes from the hash list. After waiting +		 * for synchronization, this probe is reclaimed. +		 * (reclaiming is done by do_free_cleaned_kprobes().) +		 */ +		hlist_del_rcu(&op->kp.hlist); +	} +  	/* Don't touch the code, because it is already freed. 
*/  	arch_remove_optimized_kprobe(op);  } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180b..9c39de095ba9 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP  	depends on SMP  	depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE  	depends on PM_SLEEP -	select HOTPLUG  	select HOTPLUG_CPU  config PM_AUTOSLEEP diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..335a7ae697f5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,  		if (unlikely(is_compat_task())) {  			compat_siginfo_t __user *uinfo = compat_ptr(data); -			ret = copy_siginfo_to_user32(uinfo, &info); -			ret |= __put_user(info.si_code, &uinfo->si_code); +			if (copy_siginfo_to_user32(uinfo, &info) || +			    __put_user(info.si_code, &uinfo->si_code)) { +				ret = -EFAULT; +				break; +			} +  		} else  #endif  		{  			siginfo_t __user *uinfo = (siginfo_t __user *) data; -			ret = copy_siginfo_to_user(uinfo, &info); -			ret |= __put_user(info.si_code, &uinfo->si_code); -		} - -		if (ret) { -			ret = -EFAULT; -			break; +			if (copy_siginfo_to_user(uinfo, &info) || +			    __put_user(info.si_code, &uinfo->si_code)) { +				ret = -EFAULT; +				break; +			}  		}  		data += sizeof(siginfo_t); diff --git a/kernel/range.c b/kernel/range.c index eb911dbce267..322ea8e93e4b 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/sort.h> - +#include <linux/string.h>  #include <linux/range.h>  int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,  	if (start >= end)  		return nr_range; -	/* Try to merge it with old one: */ +	/* get new start/end: */  	for (i = 0; i < nr_range; i++) { -		u64 final_start, final_end;  		u64 common_start, common_end;  		if (!range[i].end) @@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,  		if (common_start > common_end)  			continue; -		final_start = min(range[i].start, start); -		final_end = max(range[i].end, end); +		/* new start/end, will add it back at last */ +		start = min(range[i].start, start); +		end = max(range[i].end, end); -		/* clear it and add it back for further merge */ -		range[i].start = 0; -		range[i].end =  0; -		return add_range_with_merge(range, az, nr_range, -			final_start, final_end); +		memmove(&range[i], &range[i + 1], +			(nr_range - (i + 1)) * sizeof(range[i])); +		range[nr_range - 1].start = 0; +		range[nr_range - 1].end   = 0; +		nr_range--; +		i--;  	}  	/* Need to add it: */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..e8b335016c52 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)  static inline bool got_nohz_idle_kick(void)  {  	int cpu = smp_processor_id(); -	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + +	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) +		return false; + +	if (idle_cpu(cpu) && !need_resched()) +		return true; + +	/* +	 * We can't run Idle Load Balance on this CPU for this time so we +	 * cancel it and clear NOHZ_BALANCE_KICK +	 */ +	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); +	return false;  }  #else /* CONFIG_NO_HZ_COMMON */ @@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void)  void scheduler_ipi(void)  { -	if 
(llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() -	    && !tick_nohz_full_cpu(smp_processor_id())) +	if (llist_empty(&this_rq()->wake_list) +			&& !tick_nohz_full_cpu(smp_processor_id()) +			&& !got_nohz_idle_kick())  		return;  	/* @@ -1417,7 +1430,7 @@ void scheduler_ipi(void)  	/*  	 * Check if someone kicked us for doing the nohz idle load balance.  	 */ -	if (unlikely(got_nohz_idle_kick() && !need_resched())) { +	if (unlikely(got_nohz_idle_kick())) {  		this_rq()->idle_balance = 1;  		raise_softirq_irqoff(SCHED_SOFTIRQ);  	} @@ -4745,7 +4758,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  	 */  	idle->sched_class = &idle_sched_class;  	ftrace_graph_init_idle_task(idle, cpu); -	vtime_init_idle(idle); +	vtime_init_idle(idle, cpu);  #if defined(CONFIG_SMP)  	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);  #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..b5ccba22603b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev)  	write_seqlock(¤t->vtime_seqlock);  	current->vtime_snap_whence = VTIME_SYS; -	current->vtime_snap = sched_clock(); +	current->vtime_snap = sched_clock_cpu(smp_processor_id());  	write_sequnlock(¤t->vtime_seqlock);  } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu)  {  	unsigned long flags;  	write_seqlock_irqsave(&t->vtime_seqlock, flags);  	t->vtime_snap_whence = VTIME_SYS; -	t->vtime_snap = sched_clock(); +	t->vtime_snap = sched_clock_cpu(cpu);  	write_sequnlock_irqrestore(&t->vtime_seqlock, flags);  } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0c739423b0f9..20d6fba70652 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -599,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)  	} else {  		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {  			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); -			if (dev->next_event.tv64 == KTIME_MAX) -				goto out;  			/*  			 * The cpu which was handling the broadcast  			 * timer marked this cpu in the broadcast @@ -615,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)  				goto out;  			/* +			 * Bail out if there is no next event. +			 */ +			if (dev->next_event.tv64 == KTIME_MAX) +				goto out; +			/*  			 * If the pending bit is not set, then we are  			 * either the CPU handling the broadcast  			 * interrupt or we got woken by something else. @@ -698,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  		bc->event_handler = tick_handle_oneshot_broadcast; -		/* Take the do_timer update */ -		if (!tick_nohz_full_cpu(cpu)) -			tick_do_timer_cpu = cpu; -  		/*  		 * We must be careful here. There might be other CPUs  		 * waiting for periodic broadcast. We need to set the diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..0cf1c1453181 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,  		 * we can't safely shutdown that CPU.  		 
*/  		if (have_nohz_full_mask && tick_do_timer_cpu == cpu) -			return -EINVAL; +			return NOTIFY_BAD;  		break;  	}  	return NOTIFY_OK; diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ead..ce0daa320a26 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)  	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];  }  EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ +	if (BITS_PER_LONG == 64) { +		unsigned long q = (unsigned long)p; +		return bit_waitqueue((void *)(q & ~1), q & 1); +	} +	return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, +				  void *arg) +{ +	struct wait_bit_key *key = arg; +	struct wait_bit_queue *wait_bit +		= container_of(wait, struct wait_bit_queue, wait); +	atomic_t *val = key->flags; + +	if (wait_bit->key.flags != key->flags || +	    wait_bit->key.bit_nr != key->bit_nr || +	    atomic_read(val) != 0) +		return 0; +	return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes.  Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, +		       int (*action)(atomic_t *), unsigned mode) +{ +	atomic_t *val; +	int ret = 0; + +	do { +		prepare_to_wait(wq, &q->wait, mode); +		val = q->key.flags; +		if (atomic_read(val) == 0) +			ret = (*action)(val); +	} while (!ret && atomic_read(val) != 0); +	finish_wait(wq, &q->wait); +	return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p)					\ +	struct wait_bit_queue name = {					\ +		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\ +		.wait	= {						\ +			.private	= current,			\ +			.func		= wake_atomic_t_function,	\ +			.task_list	=				\ +				LIST_HEAD_INIT((name).wait.task_list),	\ +		},							\ +	} + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), +					 unsigned mode) +{ +	wait_queue_head_t *wq = atomic_t_waitqueue(p); +	DEFINE_WAIT_ATOMIC_T(wait, p); + +	return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @word: The word being waited on, a kernel virtual address + * @bit: The bit of the word being waited on + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ +	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); |
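
The kernel/wait.c hunk above adds the out-of-line waiter and the waker for the new atomic_t wait API, but no caller shows up in a diffstat limited to 'kernel'. Below is a minimal usage sketch, assuming the wait_on_atomic_t() inline wrapper plus WAIT_ATOMIC_T_BIT_NR and __WAIT_ATOMIC_T_KEY_INITIALIZER come from include/linux/wait.h in the same series; the counter, callback and function names are hypothetical.

/*
 * Hypothetical caller: tear-down waits for an outstanding-request counter
 * to reach zero. Only wait_on_atomic_t()/wake_up_atomic_t() come from the
 * series this diff belongs to; everything else is made up for illustration.
 */
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

static atomic_t pending_reqs = ATOMIC_INIT(0);

/* Action callback: decides how to sleep. A non-zero return aborts the wait. */
static int pending_wait_action(atomic_t *p)
{
	schedule();
	return 0;
}

static void request_done(void)
{
	/*
	 * Waker side: drop the count first, then kick the hashed bit
	 * waitqueue. wake_atomic_t_function() re-checks atomic_read() == 0,
	 * so a wakeup while the count is still non-zero is a no-op for the
	 * waiter.
	 */
	if (atomic_dec_and_test(&pending_reqs))
		wake_up_atomic_t(&pending_reqs);
}

static void teardown(void)
{
	/* Sleeps uninterruptibly until pending_reqs reaches zero. */
	wait_on_atomic_t(&pending_reqs, pending_wait_action, TASK_UNINTERRUPTIBLE);
}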
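
The kernel/range.c change turns add_range_with_merge() from recursive re-insertion into an in-place merge that compacts the table with memmove(), so merged entries no longer leave zeroed holes behind. A short worked sketch of the resulting behaviour, using only the signature visible in the diff (end values follow the existing x86 callers' start + size convention); the demo function itself is hypothetical.

#include <linux/range.h>

static void range_merge_demo(void)
{
	struct range r[4] = { };
	int nr = 0;

	nr = add_range_with_merge(r, 4, nr, 0x1000, 0x2000); /* nr == 1 */
	nr = add_range_with_merge(r, 4, nr, 0x3000, 0x4000); /* nr == 2, disjoint */

	/*
	 * Overlaps both existing entries: the loop folds each into the new
	 * start/end, memmove() compacts the array as it goes, and the merged
	 * range is added back once at the end.
	 */
	nr = add_range_with_merge(r, 4, nr, 0x1800, 0x3800); /* nr == 1 */
	/* r[0] now spans 0x1000..0x4000 */
}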
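
The perf_mmap()/perf_mmap_close() rework above leans on the atomic_inc_not_zero() pin-or-retry idiom: a new mapping may only reuse an existing ring buffer if it can raise rb->mmap_count from a non-zero value, otherwise it drops mmap_mutex and retries while the racing perf_mmap_close() finishes tear-down. The stripped-down sketch below shows just that idiom; struct buf, buf_try_reuse() and buf_unmap() are made-up names, not perf API.

#include <linux/atomic.h>
#include <linux/printk.h>
#include <linux/types.h>

/* Hypothetical buffer type; only the mapping count matters for the idiom. */
struct buf {
	atomic_t mmap_count;	/* live mmap()s of this buffer */
};

/*
 * Reuse path: succeed only if at least one mapping is still live, i.e. the
 * count can be raised from a non-zero value. A count that already hit zero
 * means tear-down has begun and must not be resurrected; the caller retries
 * or sets up a fresh buffer instead.
 */
static bool buf_try_reuse(struct buf *b)
{
	return atomic_inc_not_zero(&b->mmap_count);
}

/*
 * Unmap path: the thread that drops the count to zero gets to undo the
 * locked-memory accounting, as perf_mmap_close() does once rb->mmap_count
 * reaches zero. Freeing the object itself still goes through a separate
 * refcount (ring_buffer_put() in the real code), which is what keeps a
 * concurrent buf_try_reuse() from touching freed memory.
 */
static void buf_unmap(struct buf *b)
{
	if (atomic_dec_and_test(&b->mmap_count))
		pr_debug("last mapping gone, undo accounting here\n");
}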