Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r--  mm/oom_kill.c | 172
1 file changed, 123 insertions(+), 49 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 86349586eacb..ddf74487f848 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
+	/*
+	 * Do not even consider tasks which are explicitly marked oom
+	 * unkillable or have already been oom reaped.
+	 */
 	adj = (long)p->signal->oom_score_adj;
-	if (adj == OOM_SCORE_ADJ_MIN) {
+	if (adj == OOM_SCORE_ADJ_MIN ||
+			test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
 		task_unlock(p);
 		return 0;
 	}
@@ -278,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	 * This task already has access to memory reserves and is being killed.
 	 * Don't allow any other task to have access to the reserves.
 	 */
-	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (!is_sysrq_oom(oc))
-			return OOM_SCAN_ABORT;
-	}
-	if (!task->mm)
-		return OOM_SCAN_CONTINUE;
+	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
+		return OOM_SCAN_ABORT;
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
@@ -302,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 static struct task_struct *select_bad_process(struct oom_control *oc,
 		unsigned int *ppoints, unsigned long totalpages)
 {
-	struct task_struct *g, *p;
+	struct task_struct *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
 
 	rcu_read_lock();
-	for_each_process_thread(g, p) {
+	for_each_process(p) {
 		unsigned int points;
 
 		switch (oom_scan_process_thread(oc, p, totalpages)) {
@@ -326,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
 		points = oom_badness(p, NULL, oc->nodemask, totalpages);
 		if (!points || points < chosen_points)
 			continue;
-		/* Prefer thread group leaders for display purposes */
-		if (points == chosen_points && thread_group_leader(chosen))
-			continue;
 
 		chosen = p;
 		chosen_points = points;
@@ -412,6 +410,25 @@ bool oom_killer_disabled __read_mostly;
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+/*
+ * task->mm can be NULL if the task is the exited group leader.  So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+	struct task_struct *t;
+
+	for_each_thread(p, t) {
+		struct mm_struct *t_mm = READ_ONCE(t->mm);
+		if (t_mm)
+			return t_mm == mm;
+	}
+	return false;
+}
+
+
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -422,18 +439,33 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-
 static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
+	struct mm_struct *mm = NULL;
 	struct task_struct *p;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
 	bool ret = true;
 
 	/*
+	 * We have to make sure to not race with the victim exit path
+	 * and cause premature new oom victim selection:
+	 * __oom_reap_task		exit_mm
+	 *   atomic_inc_not_zero
+	 *				  mmput
+	 *				    atomic_dec_and_test
+	 *				  exit_oom_victim
+	 *				[...]
+	 *				out_of_memory
+	 *				  select_bad_process
+	 *				    # no TIF_MEMDIE task selects new victim
+	 *  unmap_page_range # frees some memory
+	 */
+	mutex_lock(&oom_lock);
+
+	/*
 	 * Make sure we find the associated mm_struct even when the particular
 	 * thread has already terminated and cleared its mm.
 	 * We might have race with exit path so consider our work done if there
@@ -441,19 +473,14 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	p = find_lock_task_mm(tsk);
 	if (!p)
-		return true;
-
+		goto unlock_oom;
 	mm = p->mm;
-	if (!atomic_inc_not_zero(&mm->mm_users)) {
-		task_unlock(p);
-		return true;
-	}
-
+	atomic_inc(&mm->mm_users);
 	task_unlock(p);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
-		goto out;
+		goto unlock_oom;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -491,16 +518,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	up_read(&mm->mmap_sem);
 
 	/*
-	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
-	 * reasonably reclaimable memory anymore. OOM killer can continue
-	 * by selecting other victim if unmapping hasn't led to any
-	 * improvements. This also means that selecting this task doesn't
-	 * make any sense.
+	 * This task can be safely ignored because we cannot do much more
+	 * to release its memory.
 	 */
-	tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
-	exit_oom_victim(tsk);
-out:
-	mmput(mm);
+	set_bit(MMF_OOM_REAPED, &mm->flags);
+unlock_oom:
+	mutex_unlock(&oom_lock);
+	/*
+	 * Drop our reference but make sure the mmput slow path is called from a
+	 * different context because we shouldn't risk getting stuck there and
+	 * put the oom_reaper out of the way.
+	 */
+	if (mm)
+		mmput_async(mm);
 	return ret;
 }
 
@@ -519,6 +549,15 @@ static void oom_reap_task(struct task_struct *tsk)
 		debug_show_all_locks();
 	}
 
+	/*
+	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+	 * reasonably reclaimable memory anymore or it is not a good candidate
+	 * for the oom victim right now because it cannot release its memory
+	 * itself nor can the oom reaper do so.
+	 */
+	tsk->oom_reaper_list = NULL;
+	exit_oom_victim(tsk);
+
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
 }
@@ -563,6 +602,46 @@ static void wake_oom_reaper(struct task_struct *tsk)
 	wake_up(&oom_reaper_wait);
 }
 
+/* Check if we can reap the given task. This has to be called with stable
+ * tsk->mm.
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+	struct task_struct *p;
+
+	if (!mm)
+		return;
+
+	/*
+	 * There might be other threads/processes which are either not
+	 * dying or even not killable.
+	 */
+	if (atomic_read(&mm->mm_users) > 1) {
+		rcu_read_lock();
+		for_each_process(p) {
+			if (!process_shares_mm(p, mm))
+				continue;
+			if (fatal_signal_pending(p))
+				continue;
+
+			/*
+			 * If the task is exiting make sure the whole thread group
+			 * is exiting and cannot access mm anymore.
+			 */
+			if (signal_group_exit(p->signal))
+				continue;
+
+			/* Give up */
+			rcu_read_unlock();
+			return;
+		}
+		rcu_read_unlock();
+	}
+
+	wake_oom_reaper(tsk);
+}
+
 static int __init oom_init(void)
 {
 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -593,6 +672,7 @@ void mark_oom_victim(struct task_struct *tsk)
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+	atomic_inc(&tsk->signal->oom_victims);
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -610,6 +690,7 @@ void exit_oom_victim(struct task_struct *tsk)
 {
 	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+	atomic_dec(&tsk->signal->oom_victims);
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
@@ -653,24 +734,6 @@ void oom_killer_enable(void)
 }
 
 /*
- * task->mm can be NULL if the task is the exited group leader.  So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
-	struct task_struct *t;
-
-	for_each_thread(p, t) {
-		struct mm_struct *t_mm = READ_ONCE(t->mm);
-		if (t_mm)
-			return t_mm == mm;
-	}
-	return false;
-}
-
-/*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
@@ -694,6 +757,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	task_lock(p);
 	if (p->mm && task_will_free_mem(p)) {
 		mark_oom_victim(p);
+		try_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -873,10 +937,20 @@ bool out_of_memory(struct oom_control *oc)
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
 		mark_oom_victim(current);
+		try_oom_reaper(current);
 		return true;
 	}
 
 	/*
+	 * The OOM killer does not compensate for IO-less reclaim.
+	 * pagefault_out_of_memory lost its gfp context so we have to
+	 * make sure to exclude the 0 mask - all other users should have at
+	 * least ___GFP_DIRECT_RECLAIM to get here.
+	 */
+	if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+		return true;
+
+	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA) that may require different handling.
	 */
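The patch replaces the per-thread TIF_MEMDIE test in oom_scan_process_thread() with a per-thread-group counter, signal->oom_victims, incremented in mark_oom_victim() and decremented in exit_oom_victim(). A minimal userspace sketch of that accounting, built on C11 atomics; struct signal_struct and main() here are illustrative stand-ins, not kernel code:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct signal_struct {
	atomic_int oom_victims;	/* live OOM victims in this thread group */
};

static void mark_oom_victim(struct signal_struct *sig)
{
	atomic_fetch_add(&sig->oom_victims, 1);	/* as in the patched mark_oom_victim() */
}

static void exit_oom_victim(struct signal_struct *sig)
{
	atomic_fetch_sub(&sig->oom_victims, 1);	/* as in the patched exit_oom_victim() */
}

/* Mirrors the new test in oom_scan_process_thread(): while any thread of
 * the group is still an OOM victim, victim selection aborts (unless the
 * OOM was triggered via sysrq). */
static bool scan_should_abort(struct signal_struct *sig)
{
	return atomic_load(&sig->oom_victims) > 0;
}

int main(void)
{
	struct signal_struct sig = { .oom_victims = 0 };

	printf("before kill:   abort=%d\n", scan_should_abort(&sig));
	mark_oom_victim(&sig);	/* victim picked, TIF_MEMDIE set on one thread */
	printf("victim marked: abort=%d\n", scan_should_abort(&sig));
	exit_oom_victim(&sig);	/* victim exited, or was unmarked after reaping */
	printf("victim exited: abort=%d\n", scan_should_abort(&sig));
	return 0;
}

The counter lives in signal_struct because select_bad_process() now walks for_each_process(), i.e. group leaders only; a TIF_MEMDIE flag sitting on a non-leader thread would otherwise go unnoticed.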
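process_shares_mm(), moved above the CONFIG_MMU block so try_oom_reaper() can use it, handles the case where the group leader has already exited and cleared its mm while other threads live on. A standalone model of that walk follows; all types are hypothetical stand-ins, and the kernel's circular for_each_thread() list is flattened into a NULL-terminated one:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mm_struct { int id; };		/* stand-in */

struct task_struct {			/* stand-in: one thread */
	struct mm_struct *mm;		/* NULL once the thread has exited */
	struct task_struct *next_thread;
};

static bool process_shares_mm(struct task_struct *threads, struct mm_struct *mm)
{
	for (struct task_struct *t = threads; t; t = t->next_thread) {
		struct mm_struct *t_mm = t->mm;
		/* The first live mm decides: all threads of a group share one mm. */
		if (t_mm)
			return t_mm == mm;
	}
	return false;
}

int main(void)
{
	struct mm_struct mm = { .id = 1 };
	struct task_struct worker = { .mm = &mm, .next_thread = NULL };
	/* Exited group leader: mm already cleared, but a worker thread lives on. */
	struct task_struct leader = { .mm = NULL, .next_thread = &worker };

	printf("group still uses mm: %d\n", process_shares_mm(&leader, &mm));
	return 0;
}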
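Finally, the new early return in out_of_memory() encodes a small piece of mask logic worth spelling out: a zero mask (pagefault_out_of_memory() lost its gfp context) still proceeds to kill, while any non-zero mask lacking both __GFP_FS and __GFP_NOFAIL bails out because IO-less reclaim was never given a fair chance. A toy demonstration, using made-up placeholder bit values rather than the kernel's real gfp_t layout:

#include <stdbool.h>
#include <stdio.h>

#define __GFP_FS	(1u << 0)	/* placeholder value, not the kernel's */
#define __GFP_NOFAIL	(1u << 1)	/* placeholder value, not the kernel's */

/* True when out_of_memory() would return early without killing anything:
 * the allocation is IO-less (no __GFP_FS) and is allowed to fail (no
 * __GFP_NOFAIL). A zero mask is exempt since it carries no gfp context. */
static bool oom_kill_skipped(unsigned int gfp_mask)
{
	return gfp_mask && !(gfp_mask & (__GFP_FS | __GFP_NOFAIL));
}

int main(void)
{
	printf("zero mask:    skipped=%d\n", oom_kill_skipped(0));
	printf("NOFS-style:   skipped=%d\n", oom_kill_skipped(1u << 5));
	printf("FS allowed:   skipped=%d\n", oom_kill_skipped(__GFP_FS));
	printf("NOFAIL:       skipped=%d\n", oom_kill_skipped(__GFP_NOFAIL));
	return 0;
}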