Diffstat (limited to 'mm/mlock.c')
-rw-r--r--	mm/mlock.c	142
1 file changed, 91 insertions, 51 deletions
diff --git a/mm/mlock.c b/mm/mlock.c
index d480cd6fc475..4e1a68162285 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page)
 }
 
 /*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+	if (PageLRU(page)) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+		if (getpage)
+			get_page(page);
+		ClearPageLRU(page);
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * Finish munlock after successful page isolation
  *
  * Page must be locked. This is a wrapper for try_to_munlock()
@@ -126,14 +146,17 @@ static void __munlock_isolated_page(struct page *page)
 static void __munlock_isolation_failed(struct page *page)
 {
 	if (PageUnevictable(page))
-		count_vm_event(UNEVICTABLE_PGSTRANDED);
+		__count_vm_event(UNEVICTABLE_PGSTRANDED);
 	else
-		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		__count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 }
 
 /**
  * munlock_vma_page - munlock a vma page
- * @page - page to be unlocked
+ * @page - page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ *         HPAGE_PMD_NR - 1 for THP head page)
  *
  * called from munlock()/munmap() path with page supposedly on the LRU.
  * When we munlock a page, because the vma where we found the page is being
@@ -148,21 +171,36 @@ static void __munlock_isolation_failed(struct page *page)
  */
 unsigned int munlock_vma_page(struct page *page)
 {
-	unsigned int page_mask = 0;
+	unsigned int nr_pages;
+	struct zone *zone = page_zone(page);
 
 	BUG_ON(!PageLocked(page));
 
-	if (TestClearPageMlocked(page)) {
-		unsigned int nr_pages = hpage_nr_pages(page);
-		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-		page_mask = nr_pages - 1;
-		if (!isolate_lru_page(page))
-			__munlock_isolated_page(page);
-		else
-			__munlock_isolation_failed(page);
+	/*
+	 * Serialize with any parallel __split_huge_page_refcount() which
+	 * might otherwise copy PageMlocked to part of the tail pages before
+	 * we clear it in the head page. It also stabilizes hpage_nr_pages().
+	 */
+	spin_lock_irq(&zone->lru_lock);
+
+	nr_pages = hpage_nr_pages(page);
+	if (!TestClearPageMlocked(page))
+		goto unlock_out;
+
+	__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+
+	if (__munlock_isolate_lru_page(page, true)) {
+		spin_unlock_irq(&zone->lru_lock);
+		__munlock_isolated_page(page);
+		goto out;
 	}
+	__munlock_isolation_failed(page);
+
+unlock_out:
+	spin_unlock_irq(&zone->lru_lock);
 
-	return page_mask;
+out:
+	return nr_pages - 1;
 }
 
 /**
@@ -241,8 +279,8 @@ static int __mlock_posix_error_return(long retval)
 static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
 		int *pgrescued)
 {
-	VM_BUG_ON(PageLRU(page));
-	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
 	if (page_mapcount(page) <= 1 && page_evictable(page)) {
 		pagevec_add(pvec, page);
@@ -286,50 +324,45 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 {
 	int i;
 	int nr = pagevec_count(pvec);
-	int delta_munlocked = -nr;
+	int delta_munlocked;
 	struct pagevec pvec_putback;
 	int pgrescued = 0;
 
+	pagevec_init(&pvec_putback, 0);
+
 	/* Phase 1: page isolation */
 	spin_lock_irq(&zone->lru_lock);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
 
 		if (TestClearPageMlocked(page)) {
-			struct lruvec *lruvec;
-			int lru;
-
-			if (PageLRU(page)) {
-				lruvec = mem_cgroup_page_lruvec(page, zone);
-				lru = page_lru(page);
-				/*
-				 * We already have pin from follow_page_mask()
-				 * so we can spare the get_page() here.
-				 */
-				ClearPageLRU(page);
-				del_page_from_lru_list(page, lruvec, lru);
-			} else {
-				__munlock_isolation_failed(page);
-				goto skip_munlock;
-			}
-
-		} else {
-skip_munlock:
 			/*
-			 * We won't be munlocking this page in the next phase
-			 * but we still need to release the follow_page_mask()
-			 * pin.
+			 * We already have pin from follow_page_mask()
+			 * so we can spare the get_page() here.
 			 */
-			pvec->pages[i] = NULL;
-			put_page(page);
-			delta_munlocked++;
+			if (__munlock_isolate_lru_page(page, false))
+				continue;
+			else
+				__munlock_isolation_failed(page);
 		}
+
+		/*
+		 * We won't be munlocking this page in the next phase
+		 * but we still need to release the follow_page_mask()
+		 * pin. We cannot do it under lru_lock however. If it's
+		 * the last pin, __page_cache_release() would deadlock.
+		 */
+		pagevec_add(&pvec_putback, pvec->pages[i]);
+		pvec->pages[i] = NULL;
 	}
+	delta_munlocked = -nr + pagevec_count(&pvec_putback);
 	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
 	spin_unlock_irq(&zone->lru_lock);
 
+	/* Now we can release pins of pages that we are not munlocking */
+	pagevec_release(&pvec_putback);
+
 	/* Phase 2: page munlock */
-	pagevec_init(&pvec_putback, 0);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
 
@@ -440,7 +473,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 	while (start < end) {
 		struct page *page = NULL;
-		unsigned int page_mask, page_increm;
+		unsigned int page_mask;
+		unsigned long page_increm;
 		struct pagevec pvec;
 		struct zone *zone;
 		int zoneid;
 
@@ -490,7 +524,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 				goto next;
 			}
 		}
-		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+		/* It's a bug to munlock in the middle of a THP page */
+		VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
+		page_increm = 1 + page_mask;
 		start += page_increm * PAGE_SIZE;
 next:
 		cond_resched();
@@ -689,19 +725,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 
 	lru_add_drain_all();	/* flush pagevec */
 
-	down_write(&current->mm->mmap_sem);
 	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 	start &= PAGE_MASK;
 
-	locked = len >> PAGE_SHIFT;
-	locked += current->mm->locked_vm;
-
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
+	locked = len >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked += current->mm->locked_vm;
 
 	/* check against resource limits */
 	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
 		error = do_mlock(start, len, 1);
+
 	up_write(&current->mm->mmap_sem);
 	if (!error)
 		error = __mm_populate(start, len, 0);
@@ -712,11 +750,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
 {
 	int ret;
 
-	down_write(&current->mm->mmap_sem);
 	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 	start &= PAGE_MASK;
+
+	down_write(&current->mm->mmap_sem);
 	ret = do_mlock(start, len, 0);
 	up_write(&current->mm->mmap_sem);
+
 	return ret;
 }
 
@@ -761,12 +801,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 	if (flags & MCL_CURRENT)
 		lru_add_drain_all();	/* flush pagevec */
 
-	down_write(&current->mm->mmap_sem);
-
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
 
 	ret = -ENOMEM;
+	down_write(&current->mm->mmap_sem);
+
 	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
 	    capable(CAP_IPC_LOCK))
 		ret = do_mlockall(flags);
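For readers who want to exercise the paths this diff touches, here is a minimal userspace sketch (not part of the patch): it maps an anonymous region aligned to an assumed 2 MiB THP size, requests huge pages via MADV_HUGEPAGE, faults the region in, then mlock()s and munlock()s it, which drives munlock_vma_page() on a THP head page when THP is actually used.

/*
 * Minimal userspace sketch (not part of the diff above). The 2 MiB huge
 * page size is an assumption; mlock() may also fail if RLIMIT_MEMLOCK
 * ("ulimit -l") is below 2 MiB.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define HUGE_SZ (2UL * 1024 * 1024)	/* assumed THP size */

int main(void)
{
	/* Over-allocate so a 2 MiB-aligned start address can be carved out. */
	size_t len = 2 * HUGE_SZ;
	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	char *aligned = (char *)(((uintptr_t)map + HUGE_SZ - 1) & ~(HUGE_SZ - 1));

	/* Ask for transparent huge pages; non-fatal if THP is unavailable. */
	if (madvise(aligned, HUGE_SZ, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	memset(aligned, 0xaa, HUGE_SZ);	/* fault the region in */

	/* munlock() here ends up in munlock_vma_page()/__munlock_pagevec(). */
	if (mlock(aligned, HUGE_SZ) || munlock(aligned, HUGE_SZ)) {
		perror("mlock/munlock");
		munmap(map, len);
		return EXIT_FAILURE;
	}

	puts("mlock/munlock of the (possibly huge) region succeeded");
	munmap(map, len);
	return EXIT_SUCCESS;
}

Compile with something like "gcc -O2 -o munlock-test munlock-test.c" (the file name is only illustrative); raising RLIMIT_MEMLOCK may be needed for the mlock() call to succeed as an unprivileged user.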