Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	209
1 file changed, 168 insertions(+), 41 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 9710f43a89ac..1d6b3a369077 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -45,6 +45,7 @@
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
+#include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -113,16 +114,25 @@
* ->i_pages lock (try_to_unmap_one)
* ->lruvec->lru_lock (follow_page->mark_page_accessed)
* ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
- * ->private_lock (page_remove_rmap->set_page_dirty)
- * ->i_pages lock (page_remove_rmap->set_page_dirty)
- * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
- * ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
+ * ->private_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
+ * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
*/
+static void mapping_set_update(struct xa_state *xas,
+ struct address_space *mapping)
+{
+ if (dax_mapping(mapping) || shmem_mapping(mapping))
+ return;
+ xas_set_update(xas, workingset_update_node);
+ xas_set_lru(xas, &shadow_nodes);
+}
+
static void page_cache_delete(struct address_space *mapping,
struct folio *folio, void *shadow)
{
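
The helper added above registers the workingset shadow-node bookkeeping on an XArray operation state, except for DAX and shmem mappings, which manage their entries separately. As a rough, hypothetical sketch (not part of the patch) of how a deletion path wires it up before storing a shadow entry: the example_replace_with_shadow() name is invented; the xas_* calls are the stock XArray API.

static void example_replace_with_shadow(struct address_space *mapping,
					struct folio *folio, void *shadow)
{
	/* Caller is assumed to hold the i_pages lock. */
	XA_STATE(xas, &mapping->i_pages, folio->index);

	mapping_set_update(&xas, mapping);	/* no-op for DAX/shmem */
	xas_set_order(&xas, folio->index, folio_order(folio));
	xas_store(&xas, shadow);		/* leave a shadow entry behind */
	xas_init_marks(&xas);
}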
@@ -842,7 +852,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- int huge = folio_test_hugetlb(folio);
+ bool huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
@@ -1353,7 +1363,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
@@ -1530,7 +1540,7 @@ EXPORT_SYMBOL(folio_end_private_2);
* folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
* @folio: The folio to wait on.
*
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
+ * Wait for PG_private_2 to be cleared on a folio.
*/
void folio_wait_private_2(struct folio *folio)
{
@@ -1543,8 +1553,8 @@ EXPORT_SYMBOL(folio_wait_private_2);
* folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
* @folio: The folio to wait on.
*
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
- * fatal signal is received by the calling task.
+ * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
+ * received by the calling task.
*
* Return:
* - 0 if successful.
@@ -1623,7 +1633,7 @@ EXPORT_SYMBOL_GPL(__folio_lock_killable);
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
struct wait_queue_head *q = folio_waitqueue(folio);
- int ret = 0;
+ int ret;
wait->folio = folio;
wait->bit_nr = PG_locked;
@@ -1911,8 +1921,6 @@ no_page:
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order == 1)
- order = 0;
if (order > 0)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
@@ -2173,7 +2181,7 @@ update_start:
if (nr) {
folio = fbatch->folios[nr - 1];
- *start = folio->index + folio_nr_pages(folio);
+ *start = folio_next_index(folio);
}
out:
rcu_read_unlock();
@@ -2678,6 +2686,7 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
return filemap_write_and_wait_range(mapping, pos, end);
}
+EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
@@ -2705,6 +2714,7 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
+EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**
* generic_file_read_iter - generic filesystem read routine
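
Both kiocb helpers above are now exported for modular filesystems. A hedged sketch of how a module's direct-I/O paths might use them; the example_dio_read()/example_dio_write() wrappers are hypothetical, only the two exported helpers are real.

/* Hypothetical direct-I/O prologues, for illustration only. */
static ssize_t example_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	/* Push dirty pagecache over the read range to disk first. */
	int ret = kiocb_write_and_wait(iocb, iov_iter_count(to));

	if (ret)
		return ret;
	return 0;	/* ...issue the direct read here... */
}

static ssize_t example_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	/* Write back and invalidate cached folios over the write range. */
	int ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));

	if (ret)
		return ret;
	return 0;	/* ...issue the direct write here... */
}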
@@ -3171,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
return fpin;
}
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+ pte_t *ptep;
+
+ /*
+ * We might have COW'ed a pagecache folio and might now have an mlocked
+ * anon folio mapped. The original pagecache folio is not mlocked and
+ * might have been evicted. During a read+clear/modify/write update of
+ * the PTE, such as done in do_numa_page()/change_pte_range(), we
+ * temporarily clear the PTE under PT lock and might detect it here as
+ * "none" when not holding the PT lock.
+ *
+ * Not rechecking the PTE under PT lock could result in an unexpected
+ * major fault in an mlock'ed region. Recheck only for this special
+ * scenario while holding the PT lock, to not degrade non-mlocked
+ * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
+ * the number of times we hold PT lock.
+ */
+ if (!(vma->vm_flags & VM_LOCKED))
+ return 0;
+
+ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+ return 0;
+
+ ptep = pte_offset_map(vmf->pmd, vmf->address);
+ if (unlikely(!ptep))
+ return VM_FAULT_NOPAGE;
+
+ if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+ ret = VM_FAULT_NOPAGE;
+ } else {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_none(ptep_get(ptep))))
+ ret = VM_FAULT_NOPAGE;
+ spin_unlock(vmf->ptl);
+ }
+ pte_unmap(ptep);
+ return ret;
+}
+
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
@@ -3226,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
mapping_locked = true;
}
} else {
+ ret = filemap_fault_recheck_pte_none(vmf);
+ if (unlikely(ret))
+ return ret;
+
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
@@ -3371,7 +3427,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
}
}
- if (pmd_none(*vmf->pmd))
+ if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
return false;
@@ -3443,7 +3499,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
*/
- if (!pte_none(vmf->pte[count]))
+ if (!pte_none(ptep_get(&vmf->pte[count])))
goto skip;
count++;
@@ -4078,6 +4134,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
}
EXPORT_SYMBOL(filemap_release_folio);
+/**
+ * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
+ * @inode: The inode to flush
+ * @flush: Set to write back rather than simply invalidate.
+ * @start: First byte in range.
+ * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
+ * onwards.
+ *
+ * Invalidate all the folios on an inode that contribute to the specified
+ * range, possibly writing them back first. Whilst the operation is
+ * undertaken, the invalidate lock is held to prevent new folios from being
+ * installed.
+ */
+int filemap_invalidate_inode(struct inode *inode, bool flush,
+ loff_t start, loff_t end)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t first = start >> PAGE_SHIFT;
+ pgoff_t last = end >> PAGE_SHIFT;
+ pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;
+
+ if (!mapping || !mapping->nrpages || end < start)
+ goto out;
+
+ /* Prevent new folios from being added to the inode. */
+ filemap_invalidate_lock(mapping);
+
+ if (!mapping->nrpages)
+ goto unlock;
+
+ unmap_mapping_pages(mapping, first, nr, false);
+
+ /* Write back the data if we're asked to. */
+ if (flush) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+
+ filemap_fdatawrite_wbc(mapping, &wbc);
+ }
+
+ /* Wait for writeback to complete on all folios and discard. */
+ truncate_inode_pages_range(mapping, start, end);
+
+unlock:
+ filemap_invalidate_unlock(mapping);
+out:
+ return filemap_check_errors(mapping);
+}
+EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
+
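
A hedged usage sketch for the new export; the example_drop_file_cache() wrapper is hypothetical, the call itself follows the kernel-doc above.

/* Hypothetical caller, for illustration only. */
static int example_drop_file_cache(struct inode *inode, bool write_back_first)
{
	/*
	 * Optionally flush, then invalidate, the inode's entire pagecache:
	 * LLONG_MAX selects everything from byte 0 onwards.
	 */
	return filemap_invalidate_inode(inode, write_back_first, 0, LLONG_MAX);
}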
#ifdef CONFIG_CACHESTAT_SYSCALL
/**
* filemap_cachestat() - compute the page cache statistics of a mapping
@@ -4099,28 +4209,40 @@ static void filemap_cachestat(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
+ int order;
unsigned long nr_pages;
pgoff_t folio_first_index, folio_last_index;
+ /*
+ * Don't deref the folio. It is not pinned, and might
+ * get freed (and reused) underneath us.
+ *
+ * We *could* pin it, but that would be expensive for
+ * what should be a fast and lightweight syscall.
+ *
+ * Instead, derive all information of interest from
+ * the rcu-protected xarray.
+ */
+
if (xas_retry(&xas, folio))
continue;
+ order = xa_get_order(xas.xa, xas.xa_index);
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
if (xa_is_value(folio)) {
/* page is evicted */
void *shadow = (void *)folio;
bool workingset; /* not used */
- int order = xa_get_order(xas.xa, xas.xa_index);
-
- nr_pages = 1 << order;
- folio_first_index = round_down(xas.xa_index, 1 << order);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
cs->nr_evicted += nr_pages;
@@ -4129,7 +4251,23 @@ static void filemap_cachestat(struct address_space *mapping,
/* shmem file - in swap cache */
swp_entry_t swp = radix_to_swp_entry(folio);
+ /* swapin error results in poisoned entry */
+ if (non_swap_entry(swp))
+ goto resched;
+
+ /*
+ * Getting a swap entry from the shmem
+ * inode means we beat
+ * shmem_unuse(). rcu_read_lock()
+ * ensures swapoff waits for us before
+ * freeing the swapper space. However,
+ * we can race with swapping and
+ * invalidation, so there might not be
+ * a shadow in the swapcache (yet).
+ */
shadow = get_shadow_from_swap_cache(swp);
+ if (!shadow)
+ goto resched;
}
#endif
if (workingset_test_recent(shadow, true, &workingset))
@@ -4138,24 +4276,13 @@ static void filemap_cachestat(struct address_space *mapping,
goto resched;
}
- nr_pages = folio_nr_pages(folio);
- folio_first_index = folio_pgoff(folio);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
-
/* page is in cache */
cs->nr_cache += nr_pages;
- if (folio_test_dirty(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
cs->nr_dirty += nr_pages;
- if (folio_test_writeback(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
cs->nr_writeback += nr_pages;
resched:
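
The per-folio counts accumulated here feed the cachestat() system call. A hedged userspace sketch of reading them back, assuming a libc/uapi recent enough to provide SYS_cachestat and the struct cachestat definitions in <linux/mman.h>; the file path is made up.

/* Hypothetical userspace consumer of the statistics computed above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mman.h>		/* struct cachestat, struct cachestat_range */

int main(void)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* len 0: to EOF */
	struct cachestat cs;
	int fd = open("/tmp/example-file", O_RDONLY);

	if (fd < 0)
		return 1;
	/* No glibc wrapper is assumed; invoke the syscall directly. */
	if (syscall(SYS_cachestat, fd, &range, &cs, 0) == 0)
		printf("cached %llu dirty %llu writeback %llu evicted %llu\n",
		       (unsigned long long)cs.nr_cache,
		       (unsigned long long)cs.nr_dirty,
		       (unsigned long long)cs.nr_writeback,
		       (unsigned long long)cs.nr_evicted);
	close(fd);
	return 0;
}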