diff options
Diffstat (limited to 'mm/filemap.c')
| -rw-r--r-- | mm/filemap.c | 209 |
1 files changed, 168 insertions, 41 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 9710f43a89ac..1d6b3a369077 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -45,6 +45,7 @@ #include <linux/migrate.h> #include <linux/pipe_fs_i.h> #include <linux/splice.h> +#include <linux/rcupdate_wait.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -113,16 +114,25 @@ * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) - * ->private_lock (page_remove_rmap->set_page_dirty) - * ->i_pages lock (page_remove_rmap->set_page_dirty) - * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) - * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) + * ->private_lock (folio_remove_rmap_pte->set_page_dirty) + * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) + * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) + * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) + * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ +static void mapping_set_update(struct xa_state *xas, + struct address_space *mapping) +{ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + xas_set_update(xas, workingset_update_node); + xas_set_lru(xas, &shadow_nodes); +} + static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { @@ -842,7 +852,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - int huge = folio_test_hugetlb(folio); + bool huge = folio_test_hugetlb(folio); bool charged = false; long nr = 1; @@ -1353,7 +1363,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { @@ -1530,7 +1540,7 @@ EXPORT_SYMBOL(folio_end_private_2); * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { @@ -1543,8 +1553,8 @@ EXPORT_SYMBOL(folio_wait_private_2); * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a - * fatal signal is received by the calling task. + * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is + * received by the calling task. * * Return: * - 0 if successful. @@ -1623,7 +1633,7 @@ EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); - int ret = 0; + int ret; wait->folio = folio; wait->bit_nr = PG_locked; @@ -1911,8 +1921,6 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order == 1) - order = 0; if (order > 0) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); @@ -2173,7 +2181,7 @@ update_start: if (nr) { folio = fbatch->folios[nr - 1]; - *start = folio->index + folio_nr_pages(folio); + *start = folio_next_index(folio); } out: rcu_read_unlock(); @@ -2678,6 +2686,7 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) return filemap_write_and_wait_range(mapping, pos, end); } +EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { @@ -2705,6 +2714,7 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } +EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine @@ -3171,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, return fpin; } +static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + pte_t *ptep; + + /* + * We might have COW'ed a pagecache folio and might now have an mlocked + * anon folio mapped. The original pagecache folio is not mlocked and + * might have been evicted. During a read+clear/modify/write update of + * the PTE, such as done in do_numa_page()/change_pte_range(), we + * temporarily clear the PTE under PT lock and might detect it here as + * "none" when not holding the PT lock. + * + * Not rechecking the PTE under PT lock could result in an unexpected + * major fault in an mlock'ed region. Recheck only for this special + * scenario while holding the PT lock, to not degrade non-mlocked + * scenarios. Recheck the PTE without PT lock firstly, thereby reducing + * the number of times we hold PT lock. + */ + if (!(vma->vm_flags & VM_LOCKED)) + return 0; + + if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) + return 0; + + ptep = pte_offset_map(vmf->pmd, vmf->address); + if (unlikely(!ptep)) + return VM_FAULT_NOPAGE; + + if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { + ret = VM_FAULT_NOPAGE; + } else { + spin_lock(vmf->ptl); + if (unlikely(!pte_none(ptep_get(ptep)))) + ret = VM_FAULT_NOPAGE; + spin_unlock(vmf->ptl); + } + pte_unmap(ptep); + return ret; +} + /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault @@ -3226,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) mapping_locked = true; } } else { + ret = filemap_fault_recheck_pte_none(vmf); + if (unlikely(ret)) + return ret; + /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); @@ -3371,7 +3427,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, } } - if (pmd_none(*vmf->pmd)) + if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; @@ -3443,7 +3499,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(vmf->pte[count])) + if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; @@ -4078,6 +4134,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) } EXPORT_SYMBOL(filemap_release_folio); +/** + * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache + * @inode: The inode to flush + * @flush: Set to write back rather than simply invalidate. + * @start: First byte to in range. + * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start + * onwards. + * + * Invalidate all the folios on an inode that contribute to the specified + * range, possibly writing them back first. Whilst the operation is + * undertaken, the invalidate lock is held to prevent new folios from being + * installed. + */ +int filemap_invalidate_inode(struct inode *inode, bool flush, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t first = start >> PAGE_SHIFT; + pgoff_t last = end >> PAGE_SHIFT; + pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; + + if (!mapping || !mapping->nrpages || end < start) + goto out; + + /* Prevent new folios from being added to the inode. */ + filemap_invalidate_lock(mapping); + + if (!mapping->nrpages) + goto unlock; + + unmap_mapping_pages(mapping, first, nr, false); + + /* Write back the data if we're asked to. */ + if (flush) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + filemap_fdatawrite_wbc(mapping, &wbc); + } + + /* Wait for writeback to complete on all folios and discard. */ + truncate_inode_pages_range(mapping, start, end); + +unlock: + filemap_invalidate_unlock(mapping); +out: + return filemap_check_errors(mapping); +} +EXPORT_SYMBOL_GPL(filemap_invalidate_inode); + #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping @@ -4099,28 +4209,40 @@ static void filemap_cachestat(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, folio, last_index) { + int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; + /* + * Don't deref the folio. It is not pinned, and might + * get freed (and reused) underneath us. + * + * We *could* pin it, but that would be expensive for + * what should be a fast and lightweight syscall. + * + * Instead, derive all information of interest from + * the rcu-protected xarray. + */ + if (xas_retry(&xas, folio)) continue; + order = xa_get_order(xas.xa, xas.xa_index); + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ - int order = xa_get_order(xas.xa, xas.xa_index); - - nr_pages = 1 << order; - folio_first_index = round_down(xas.xa_index, 1 << order); - folio_last_index = folio_first_index + nr_pages - 1; - - /* Folios might straddle the range boundaries, only count covered pages */ - if (folio_first_index < first_index) - nr_pages -= first_index - folio_first_index; - - if (folio_last_index > last_index) - nr_pages -= folio_last_index - last_index; cs->nr_evicted += nr_pages; @@ -4129,7 +4251,23 @@ static void filemap_cachestat(struct address_space *mapping, /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); + /* swapin error results in poisoned entry */ + if (non_swap_entry(swp)) + goto resched; + + /* + * Getting a swap entry from the shmem + * inode means we beat + * shmem_unuse(). rcu_read_lock() + * ensures swapoff waits for us before + * freeing the swapper space. However, + * we can race with swapping and + * invalidation, so there might not be + * a shadow in the swapcache (yet). + */ shadow = get_shadow_from_swap_cache(swp); + if (!shadow) + goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset)) @@ -4138,24 +4276,13 @@ static void filemap_cachestat(struct address_space *mapping, goto resched; } - nr_pages = folio_nr_pages(folio); - folio_first_index = folio_pgoff(folio); - folio_last_index = folio_first_index + nr_pages - 1; - - /* Folios might straddle the range boundaries, only count covered pages */ - if (folio_first_index < first_index) - nr_pages -= first_index - folio_first_index; - - if (folio_last_index > last_index) - nr_pages -= folio_last_index - last_index; - /* page is in cache */ cs->nr_cache += nr_pages; - if (folio_test_dirty(folio)) + if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; - if (folio_test_writeback(folio)) + if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: |