Diffstat (limited to 'mm/filemap.c')
| -rw-r--r-- | mm/filemap.c | 121 | 
1 file changed, 83 insertions, 38 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db..7437b2bd75c1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,15 @@
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
  */
 
+static void mapping_set_update(struct xa_state *xas,
+		struct address_space *mapping)
+{
+	if (dax_mapping(mapping) || shmem_mapping(mapping))
+		return;
+	xas_set_update(xas, workingset_update_node);
+	xas_set_lru(xas, &shadow_nodes);
+}
+
 static void page_cache_delete(struct address_space *mapping,
 				   struct folio *folio, void *shadow)
 {
@@ -843,7 +852,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
-	int huge = folio_test_hugetlb(folio);
+	bool huge = folio_test_hugetlb(folio);
 	bool charged = false;
 	long nr = 1;
 
@@ -1354,7 +1363,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
 	unsigned long pflags;
 	bool in_thrashing;
 	wait_queue_head_t *q;
-	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+	struct folio *folio = pfn_swap_entry_folio(entry);
 
 	q = folio_waitqueue(folio);
 	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
@@ -1912,8 +1921,6 @@ no_page:
 			gfp_t alloc_gfp = gfp;
 
 			err = -ENOMEM;
-			if (order == 1)
-				order = 0;
 			if (order > 0)
 				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
 			folio = filemap_alloc_folio(alloc_gfp, order);
@@ -2609,15 +2616,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
 		/*
-		 * Pairs with a barrier in
-		 * block_write_end()->mark_buffer_dirty() or other page
-		 * dirtying routines like iomap_write_end() to ensure
-		 * changes to page contents are visible before we see
-		 * increased inode size.
-		 */
-		smp_rmb();
-
-		/*
 		 * Once we start copying data, we don't want to be touching any
 		 * cachelines that might be contended:
 		 */
@@ -3183,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
 	return fpin;
 }
 
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
+	pte_t *ptep;
+
+	/*
+	 * We might have COW'ed a pagecache folio and might now have an mlocked
+	 * anon folio mapped. The original pagecache folio is not mlocked and
+	 * might have been evicted. During a read+clear/modify/write update of
+	 * the PTE, such as done in do_numa_page()/change_pte_range(), we
+	 * temporarily clear the PTE under PT lock and might detect it here as
+	 * "none" when not holding the PT lock.
+	 *
+	 * Not rechecking the PTE under PT lock could result in an unexpected
+	 * major fault in an mlock'ed region. Recheck only for this special
+	 * scenario while holding the PT lock, to not degrade non-mlocked
+	 * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
+	 * the number of times we hold PT lock.
+	 */
+	if (!(vma->vm_flags & VM_LOCKED))
+		return 0;
+
+	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+		return 0;
+
+	ptep = pte_offset_map(vmf->pmd, vmf->address);
+	if (unlikely(!ptep))
+		return VM_FAULT_NOPAGE;
+
+	if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+		ret = VM_FAULT_NOPAGE;
+	} else {
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_none(ptep_get(ptep))))
+			ret = VM_FAULT_NOPAGE;
+		spin_unlock(vmf->ptl);
+	}
+	pte_unmap(ptep);
+	return ret;
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vmf:	struct vm_fault containing details of the fault
@@ -3238,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 			mapping_locked = true;
 		}
 	} else {
+		ret = filemap_fault_recheck_pte_none(vmf);
+		if (unlikely(ret))
+			return ret;
+
 		/* No page in the page cache at all */
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
@@ -4111,28 +4155,40 @@ static void filemap_cachestat(struct address_space *mapping,
 
 	rcu_read_lock();
 	xas_for_each(&xas, folio, last_index) {
+		int order;
 		unsigned long nr_pages;
 		pgoff_t folio_first_index, folio_last_index;
 
+		/*
+		 * Don't deref the folio. It is not pinned, and might
+		 * get freed (and reused) underneath us.
+		 *
+		 * We *could* pin it, but that would be expensive for
+		 * what should be a fast and lightweight syscall.
+		 *
+		 * Instead, derive all information of interest from
+		 * the rcu-protected xarray.
+		 */
+
 		if (xas_retry(&xas, folio))
 			continue;
 
+		order = xa_get_order(xas.xa, xas.xa_index);
+		nr_pages = 1 << order;
+		folio_first_index = round_down(xas.xa_index, 1 << order);
+		folio_last_index = folio_first_index + nr_pages - 1;
+
+		/* Folios might straddle the range boundaries, only count covered pages */
+		if (folio_first_index < first_index)
+			nr_pages -= first_index - folio_first_index;
+
+		if (folio_last_index > last_index)
+			nr_pages -= folio_last_index - last_index;
+
 		if (xa_is_value(folio)) {
 			/* page is evicted */
 			void *shadow = (void *)folio;
 			bool workingset; /* not used */
-			int order = xa_get_order(xas.xa, xas.xa_index);
-
-			nr_pages = 1 << order;
-			folio_first_index = round_down(xas.xa_index, 1 << order);
-			folio_last_index = folio_first_index + nr_pages - 1;
-
-			/* Folios might straddle the range boundaries, only count covered pages */
-			if (folio_first_index < first_index)
-				nr_pages -= first_index - folio_first_index;
-
-			if (folio_last_index > last_index)
-				nr_pages -= folio_last_index - last_index;
 
 			cs->nr_evicted += nr_pages;
 
@@ -4150,24 +4206,13 @@ static void filemap_cachestat(struct address_space *mapping,
 			goto resched;
 		}
 
-		nr_pages = folio_nr_pages(folio);
-		folio_first_index = folio_pgoff(folio);
-		folio_last_index = folio_first_index + nr_pages - 1;
-
-		/* Folios might straddle the range boundaries, only count covered pages */
-		if (folio_first_index < first_index)
-			nr_pages -= first_index - folio_first_index;
-
-		if (folio_last_index > last_index)
-			nr_pages -= folio_last_index - last_index;
-
 		/* page is in cache */
 		cs->nr_cache += nr_pages;
 
-		if (folio_test_dirty(folio))
+		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
 			cs->nr_dirty += nr_pages;
 
-		if (folio_test_writeback(folio))
+		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
 			cs->nr_writeback += nr_pages;
 
 resched:
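The reworked filemap_cachestat() walk above changes only how the counters are derived (from the RCU-protected xarray, without dereferencing folios); the numbers still reach userspace through the cachestat() syscall that this function backs. The sketch below is an illustrative userspace consumer, not part of this diff: it mirrors the uapi struct layouts from include/uapi/linux/mman.h and, as an assumption, falls back to the x86_64 syscall number 451 when the toolchain headers do not define __NR_cachestat.

/*
 * Minimal sketch: query page-cache residency of a file via cachestat().
 * Assumes a kernel with cachestat support (6.5+); struct definitions are
 * copied here so the example builds even with older libc headers.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_cachestat
#define __NR_cachestat 451	/* assumption: x86_64 syscall number */
#endif

struct cachestat_range {
	uint64_t off;
	uint64_t len;	/* len == 0 queries from off to the end of the file */
};

struct cachestat {
	uint64_t nr_cache;
	uint64_t nr_dirty;
	uint64_t nr_writeback;
	uint64_t nr_evicted;
	uint64_t nr_recently_evicted;
};

int main(int argc, char **argv)
{
	struct cachestat_range range = { .off = 0, .len = 0 };
	struct cachestat cs;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* No glibc wrapper is assumed; invoke the raw syscall (flags must be 0). */
	if (syscall(__NR_cachestat, fd, &range, &cs, 0) != 0) {
		perror("cachestat");
		close(fd);
		return 1;
	}

	printf("cached %llu, dirty %llu, writeback %llu, evicted %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted);
	close(fd);
	return 0;
}

All counts are reported in pages; on a kernel without cachestat the call fails with ENOSYS.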