Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	436
1 files changed, 269 insertions, 167 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 8068893697bb..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	/*
 	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
 	 */
-	if (vma->vm_ops)
-		printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
-		       vma->vm_ops->fault);
-	if (vma->vm_file)
-		printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
-		       vma->vm_file->f_op->mmap);
+	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+		 vma->vm_file,
+		 vma->vm_ops ? vma->vm_ops->fault : NULL,
+		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+		 mapping ? mapping->a_ops->readpage : NULL);
 	dump_stack();
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 }
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 }
 
 /*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
+ * Handle write page faults for pages that can be reused in the current vma
  *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
- *
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
 */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+			struct page *page, int page_mkwrite,
+			int dirty_shared)
 	__releases(ptl)
 {
-	struct page *old_page, *new_page = NULL;
 	pte_t entry;
-	int ret = 0;
-	int page_mkwrite = 0;
-	bool dirty_shared = false;
-	unsigned long mmun_start = 0;	/* For mmu_notifiers */
-	unsigned long mmun_end = 0;	/* For mmu_notifiers */
-	struct mem_cgroup *memcg;
-
-	old_page = vm_normal_page(vma, address, orig_pte);
-	if (!old_page) {
-		/*
-		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
-		 * VM_PFNMAP VMA.
-		 *
-		 * We should not cow pages in a shared writeable mapping.
-		 * Just mark the pages writable as we can't do any dirty
-		 * accounting on raw pfn maps.
-		 */
-		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-				     (VM_WRITE|VM_SHARED))
-			goto reuse;
-		goto gotten;
-	}
-
 	/*
-	 * Take out anonymous pages first, anonymous shared vmas are
-	 * not dirty accountable.
+	 * Clear the pages cpupid information as the existing
+	 * information potentially belongs to a now completely
+	 * unrelated process.
 	 */
-	if (PageAnon(old_page) && !PageKsm(old_page)) {
-		if (!trylock_page(old_page)) {
-			page_cache_get(old_page);
-			pte_unmap_unlock(page_table, ptl);
-			lock_page(old_page);
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
-			if (!pte_same(*page_table, orig_pte)) {
-				unlock_page(old_page);
-				goto unlock;
-			}
-			page_cache_release(old_page);
-		}
-		if (reuse_swap_page(old_page)) {
-			/*
-			 * The page is all ours.  Move it to our anon_vma so
-			 * the rmap code will not search our parent or siblings.
-			 * Protected against the rmap code by the page lock.
-			 */
-			page_move_anon_rmap(old_page, vma, address);
-			unlock_page(old_page);
-			goto reuse;
-		}
-		unlock_page(old_page);
-	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-					(VM_WRITE|VM_SHARED))) {
-		page_cache_get(old_page);
-		/*
-		 * Only catch write-faults on shared writable pages,
-		 * read-only shared pages can get COWed by
-		 * get_user_pages(.write=1, .force=1).
-		 */
-		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-			int tmp;
-
-			pte_unmap_unlock(page_table, ptl);
-			tmp = do_page_mkwrite(vma, old_page, address);
-			if (unlikely(!tmp || (tmp &
-					(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-				page_cache_release(old_page);
-				return tmp;
-			}
-			/*
-			 * Since we dropped the lock we need to revalidate
-			 * the PTE as someone else may have changed it.  If
-			 * they did, we just return, as we can count on the
-			 * MMU to tell us if they didn't also make it writable.
-			 */
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
-			if (!pte_same(*page_table, orig_pte)) {
-				unlock_page(old_page);
-				goto unlock;
-			}
-			page_mkwrite = 1;
-		}
-
-		dirty_shared = true;
-
-reuse:
-		/*
-		 * Clear the pages cpupid information as the existing
-		 * information potentially belongs to a now completely
-		 * unrelated process.
-		 */
-		if (old_page)
-			page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
-		flush_cache_page(vma, address, pte_pfn(orig_pte));
-		entry = pte_mkyoung(orig_pte);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		if (ptep_set_access_flags(vma, address, page_table, entry,1))
-			update_mmu_cache(vma, address, page_table);
-		pte_unmap_unlock(page_table, ptl);
-		ret |= VM_FAULT_WRITE;
+	if (page)
+		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
 
-		if (dirty_shared) {
-			struct address_space *mapping;
-			int dirtied;
+	flush_cache_page(vma, address, pte_pfn(orig_pte));
+	entry = pte_mkyoung(orig_pte);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+		update_mmu_cache(vma, address, page_table);
+	pte_unmap_unlock(page_table, ptl);
 
-			if (!page_mkwrite)
-				lock_page(old_page);
+	if (dirty_shared) {
+		struct address_space *mapping;
+		int dirtied;
 
-			dirtied = set_page_dirty(old_page);
-			VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
-			mapping = old_page->mapping;
-			unlock_page(old_page);
-			page_cache_release(old_page);
+		if (!page_mkwrite)
+			lock_page(page);
 
-			if ((dirtied || page_mkwrite) && mapping) {
-				/*
-				 * Some device drivers do not set page.mapping
-				 * but still dirty their pages
-				 */
-				balance_dirty_pages_ratelimited(mapping);
-			}
+		dirtied = set_page_dirty(page);
+		VM_BUG_ON_PAGE(PageAnon(page), page);
+		mapping = page->mapping;
+		unlock_page(page);
+		page_cache_release(page);
 
-			if (!page_mkwrite)
-				file_update_time(vma->vm_file);
+		if ((dirtied || page_mkwrite) && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping
+			 * but still dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
 		}
-		return ret;
+		if (!page_mkwrite)
+			file_update_time(vma->vm_file);
 	}
 
-	/*
-	 * Ok, we need to copy. Oh, well..
-	 */
-	page_cache_get(old_page);
-gotten:
-	pte_unmap_unlock(page_table, ptl);
+	return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ *   relevant references. This includes dropping the reference the page-table
+ *   held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *page_table, pmd_t *pmd,
+			pte_t orig_pte, struct page *old_page)
+{
+	struct page *new_page = NULL;
+	spinlock_t *ptl = NULL;
+	pte_t entry;
+	int page_copied = 0;
+	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
+	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
+	struct mem_cgroup *memcg;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -2163,8 +2086,6 @@ gotten:
 	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
 		goto oom_free_new;
 
-	mmun_start  = address & PAGE_MASK;
-	mmun_end    = mmun_start + PAGE_SIZE;
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	/*
@@ -2177,8 +2098,9 @@ gotten:
 				dec_mm_counter_fast(mm, MM_FILEPAGES);
 				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
-		} else
+		} else {
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
+		}
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2149,29 @@ gotten:
 
 		/* Free the old page.. */
 		new_page = old_page;
-		ret |= VM_FAULT_WRITE;
-	} else
+		page_copied = 1;
+	} else {
 		mem_cgroup_cancel_charge(new_page, memcg);
+	}
 
 	if (new_page)
 		page_cache_release(new_page);
-unlock:
+
 	pte_unmap_unlock(page_table, ptl);
-	if (mmun_end > mmun_start)
-		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (old_page) {
 		/*
 		 * Don't let another task, with possibly unlocked vma,
 		 * keep the mlocked page.
 		 */
-		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
 			lock_page(old_page);	/* LRU manipulation */
 			munlock_vma_page(old_page);
 			unlock_page(old_page);
 		}
 		page_cache_release(old_page);
 	}
-	return ret;
+	return page_copied ? VM_FAULT_WRITE : 0;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
@@ -2258,6 +2180,179 @@ oom:
 	return VM_FAULT_OOM;
 }
 
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static int wp_pfn_shared(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+			pmd_t *pmd)
+{
+	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+		struct vm_fault vmf = {
+			.page = NULL,
+			.pgoff = linear_page_index(vma, address),
+			.virtual_address = (void __user *)(address & PAGE_MASK),
+			.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
+		};
+		int ret;
+
+		pte_unmap_unlock(page_table, ptl);
+		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
+		if (ret & VM_FAULT_ERROR)
+			return ret;
+		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		/*
+		 * We might have raced with another page fault while we
+		 * released the pte_offset_map_lock.
+		 */
+		if (!pte_same(*page_table, orig_pte)) {
+			pte_unmap_unlock(page_table, ptl);
+			return 0;
+		}
+	}
+	return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
+			     NULL, 0, 0);
+}
+
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+			  unsigned long address, pte_t *page_table,
+			  pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+			  struct page *old_page)
+	__releases(ptl)
+{
+	int page_mkwrite = 0;
+
+	page_cache_get(old_page);
+
+	/*
+	 * Only catch write-faults on shared writable pages,
+	 * read-only shared pages can get COWed by
+	 * get_user_pages(.write=1, .force=1).
+	 */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+		int tmp;
+
+		pte_unmap_unlock(page_table, ptl);
+		tmp = do_page_mkwrite(vma, old_page, address);
+		if (unlikely(!tmp || (tmp &
+				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+			page_cache_release(old_page);
+			return tmp;
+		}
+		/*
+		 * Since we dropped the lock we need to revalidate
+		 * the PTE as someone else may have changed it.  If
+		 * they did, we just return, as we can count on the
+		 * MMU to tell us if they didn't also make it writable.
+		 */
+		page_table = pte_offset_map_lock(mm, pmd, address,
+						 &ptl);
+		if (!pte_same(*page_table, orig_pte)) {
+			unlock_page(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			page_cache_release(old_page);
+			return 0;
+		}
+		page_mkwrite = 1;
+	}
+
+	return wp_page_reuse(mm, vma, address, page_table, ptl,
+			     orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		spinlock_t *ptl, pte_t orig_pte)
+	__releases(ptl)
+{
+	struct page *old_page;
+
+	old_page = vm_normal_page(vma, address, orig_pte);
+	if (!old_page) {
+		/*
+		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+		 * VM_PFNMAP VMA.
+		 *
+		 * We should not cow pages in a shared writeable mapping.
+		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
+		 */
+		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+				     (VM_WRITE|VM_SHARED))
+			return wp_pfn_shared(mm, vma, address, page_table, ptl,
+					     orig_pte, pmd);
+
+		pte_unmap_unlock(page_table, ptl);
+		return wp_page_copy(mm, vma, address, page_table, pmd,
+				    orig_pte, old_page);
+	}
+
+	/*
+	 * Take out anonymous pages first, anonymous shared vmas are
+	 * not dirty accountable.
+	 */
+	if (PageAnon(old_page) && !PageKsm(old_page)) {
+		if (!trylock_page(old_page)) {
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				pte_unmap_unlock(page_table, ptl);
+				page_cache_release(old_page);
+				return 0;
+			}
+			page_cache_release(old_page);
+		}
+		if (reuse_swap_page(old_page)) {
+			/*
+			 * The page is all ours.  Move it to our anon_vma so
+			 * the rmap code will not search our parent or siblings.
+			 * Protected against the rmap code by the page lock.
+			 */
+			page_move_anon_rmap(old_page, vma, address);
+			unlock_page(old_page);
+			return wp_page_reuse(mm, vma, address, page_table, ptl,
+					     orig_pte, old_page, 0, 0);
+		}
+		unlock_page(old_page);
+	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+					(VM_WRITE|VM_SHARED))) {
+		return wp_page_shared(mm, vma, address, page_table, pmd,
+				      ptl, orig_pte, old_page);
+	}
+
+	/*
+	 * Ok, we need to copy. Oh, well..
+	 */
+	page_cache_get(old_page);
+
+	pte_unmap_unlock(page_table, ptl);
+	return wp_page_copy(mm, vma, address, page_table, pmd,
+			    orig_pte, old_page);
+}
+
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
 		unsigned long start_addr, unsigned long end_addr,
 		struct zap_details *details)
@@ -2784,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
 	struct vm_fault vmf;
 	int off;
 
-	nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
 	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
 	start_addr = max(address & mask, vma->vm_start);
@@ -3035,6 +3130,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int last_cpupid;
 	int target_nid;
 	bool migrated = false;
+	bool was_writable = pte_write(pte);
 	int flags = 0;
 
 	/* A PROT_NONE fault should not end up here */
@@ -3059,6 +3155,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* Make it present again */
 	pte = pte_modify(pte, vma->vm_page_prot);
 	pte = pte_mkyoung(pte);
+	if (was_writable)
+		pte = pte_mkwrite(pte);
 	set_pte_at(mm, addr, ptep, pte);
 	update_mmu_cache(vma, addr, ptep);
 
@@ -3069,11 +3167,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/*
-	 * Avoid grouping on DSO/COW pages in specific and RO pages
-	 * in general, RO pages shouldn't hurt as much anyway since
-	 * they can be in shared cache state.
+	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+	 * much anyway since they can be in shared cache state. This misses
+	 * the case where a mapping is writable but the process never writes
+	 * to it but pte_write gets cleared during protection updates and
+	 * pte_dirty has unpredictable behaviour between PTE scan updates,
+	 * background writeback, dirty balancing and application behaviour.
 	 */
-	if (!pte_write(pte))
+	if (!(vma->vm_flags & VM_WRITE))
 		flags |= TNF_NO_GROUP;
 
 	/*
@@ -3097,7 +3198,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (migrated) {
 		page_nid = target_nid;
 		flags |= TNF_MIGRATED;
-	}
+	} else
+		flags |= TNF_MIGRATE_FAIL;
 
 out:
 	if (page_nid != -1)
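Note: the new wp_pfn_shared() path above calls a driver-supplied vma->vm_ops->pfn_mkwrite hook before reusing a pure-PFN pte. As a rough illustration of the contract visible at that call site (ptl dropped before the call, FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE set, vmf->page == NULL, any VM_FAULT_ERROR bit propagated, pte revalidated afterwards), here is a minimal, hypothetical driver-side handler. The "mydrv" names and the nr_pages bound are illustrative assumptions, not part of this patch.

#include <linux/mm.h>

/* Hypothetical per-mapping state a driver might stash in vm_private_data. */
struct mydrv_mapping {
	unsigned long nr_pages;		/* size of the pfn range we mapped */
};

static int mydrv_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct mydrv_mapping *m = vma->vm_private_data;

	/*
	 * Called without the ptl held (wp_pfn_shared() drops it first).
	 * vmf->pgoff identifies the faulting page within the mapping.
	 */
	if (vmf->pgoff >= m->nr_pages)
		return VM_FAULT_SIGBUS;	/* error bit: wp_pfn_shared() returns it */

	/* Returning 0 lets wp_pfn_shared() revalidate the pte and reuse it. */
	return 0;
}

Because the handler may sleep, wp_pfn_shared() re-takes the pte lock and re-checks the entry with pte_same() afterwards, as the comment in the patch ("We might have raced with another page fault...") explains.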