about | summary | refs | log | tree | commit | diff
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- mm/filemap.c 185
1 files changed, 120 insertions, 65 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 1d6b3a369077..d62150418b91 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -168,7 +168,7 @@ static void filemap_unaccount_folio(struct address_space *mapping,
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
if (mapping_exiting(mapping) && !folio_test_large(folio)) {
- int mapcount = page_mapcount(&folio->page);
+ int mapcount = folio_mapcount(folio);
if (folio_ref_count(folio) >= mapcount + 2) {
/*
@@ -177,7 +177,7 @@ static void filemap_unaccount_folio(struct address_space *mapping,
* and we'd rather not leak it: if we're wrong,
* another bad page check should catch it later.
*/
- page_mapcount_reset(&folio->page);
+ atomic_set(&folio->_mapcount, -1);
folio_ref_sub(folio, mapcount);
}
}
@@ -852,23 +852,18 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- bool huge = folio_test_hugetlb(folio);
- bool charged = false;
- long nr = 1;
+ void *alloced_shadow = NULL;
+ int alloced_order = 0;
+ bool huge;
+ long nr;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- if (!huge) {
- int error = mem_cgroup_charge(folio, NULL, gfp);
- if (error)
- return error;
- charged = true;
- }
-
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
xas_set_order(&xas, index, folio_order(folio));
+ huge = folio_test_hugetlb(folio);
nr = folio_nr_pages(folio);
gfp &= GFP_RECLAIM_MASK;
@@ -876,13 +871,10 @@ noinline int __filemap_add_folio(struct address_space *mapping,
folio->mapping = mapping;
folio->index = xas.xa_index;
- do {
- unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ for (;;) {
+ int order = -1, split_order = 0;
void *entry, *old = NULL;
- if (order > folio_order(folio))
- xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
- order, gfp);
xas_lock_irq(&xas);
xas_for_each_conflict(&xas, entry) {
old = entry;
@@ -890,19 +882,33 @@ noinline int __filemap_add_folio(struct address_space *mapping,
xas_set_err(&xas, -EEXIST);
goto unlock;
}
+ /*
+ * If a larger entry exists,
+ * it will be the first and only entry iterated.
+ */
+ if (order == -1)
+ order = xas_get_order(&xas);
+ }
+
+ /* entry may have changed before we re-acquire the lock */
+ if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
+ xas_destroy(&xas);
+ alloced_order = 0;
}
if (old) {
- if (shadowp)
- *shadowp = old;
- /* entry may have been split before we acquired lock */
- order = xa_get_order(xas.xa, xas.xa_index);
- if (order > folio_order(folio)) {
+ if (order > 0 && order > folio_order(folio)) {
/* How to handle large swap entries? */
BUG_ON(shmem_mapping(mapping));
+ if (!alloced_order) {
+ split_order = order;
+ goto unlock;
+ }
xas_split(&xas, old, order);
xas_reset(&xas);
}
+ if (shadowp)
+ *shadowp = old;
}
xas_store(&xas, folio);
@@ -918,9 +924,24 @@ noinline int __filemap_add_folio(struct address_space *mapping,
__lruvec_stat_mod_folio(folio,
NR_FILE_THPS, nr);
}
+
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp));
+
+ /* split needed, alloc here and retry. */
+ if (split_order) {
+ xas_split_alloc(&xas, old, split_order, gfp);
+ if (xas_error(&xas))
+ goto error;
+ alloced_shadow = old;
+ alloced_order = split_order;
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (!xas_nomem(&xas, gfp))
+ break;
+ }
if (xas_error(&xas))
goto error;
@@ -928,8 +949,6 @@ unlock:
trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
- if (charged)
- mem_cgroup_uncharge(folio);
folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
folio_put_refs(folio, nr);
@@ -943,11 +962,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
void *shadow = NULL;
int ret;
+ ret = mem_cgroup_charge(folio, NULL, gfp);
+ if (ret)
+ return ret;
+
__folio_set_locked(folio);
ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ mem_cgroup_uncharge(folio);
__folio_clear_locked(folio);
- else {
+ } else {
/*
* The folio might have been evicted from cache only
* recently, in which case it should be activated like
@@ -966,7 +990,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
EXPORT_SYMBOL_GPL(filemap_add_folio);
#ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
int n;
struct folio *folio;
@@ -976,14 +1000,14 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- folio = __folio_alloc_node(gfp, order, n);
+ folio = __folio_alloc_node_noprof(gfp, order, n);
} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
return folio;
}
- return folio_alloc(gfp, order);
+ return folio_alloc_noprof(gfp, order);
}
-EXPORT_SYMBOL(filemap_alloc_folio);
+EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif
/*
@@ -1728,12 +1752,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
- break;
+ return xas.xa_index;
if (xas.xa_index == 0)
- break;
+ return 0;
}
- return xas.xa_index;
+ return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);
@@ -1786,7 +1810,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* C. Return the page to the page allocator
*
* This means that any page may have its reference count temporarily
- * increased by a speculative page cache (or fast GUP) lookup as it can
+ * increased by a speculative page cache (or GUP-fast) lookup as it can
* be allocated by another user before the RCU grace period expires.
* Because the refcount temporarily acquired here may end up being the
* last refcount on the page, any page allocation must be freeable by
@@ -1823,7 +1847,7 @@ repeat:
if (!folio || xa_is_value(folio))
goto out;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto repeat;
if (unlikely(folio != xas_reload(&xas))) {
@@ -1977,7 +2001,7 @@ retry:
if (!folio || xa_is_value(folio))
return folio;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto reset;
if (unlikely(folio != xas_reload(xas))) {
@@ -2157,7 +2181,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
if (xa_is_value(folio))
goto update_start;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
@@ -2289,7 +2313,7 @@ static void filemap_get_read_batch(struct address_space *mapping,
break;
if (xa_is_sibling(folio))
break;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
@@ -3100,7 +3124,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
- if (vm_flags & VM_HUGEPAGE) {
+ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
ra->size = HPAGE_PMD_NR;
@@ -3207,7 +3231,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return 0;
- ptep = pte_offset_map(vmf->pmd, vmf->address);
+ ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
if (unlikely(!ptep))
return VM_FAULT_NOPAGE;
@@ -3448,7 +3473,7 @@ static struct folio *next_uptodate_folio(struct xa_state *xas,
continue;
if (folio_test_locked(folio))
continue;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
continue;
/* Has the page moved or been split? */
if (unlikely(folio != xas_reload(xas)))
@@ -3481,7 +3506,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned int *mmap_miss)
+ unsigned long *rss, unsigned int *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
@@ -3492,7 +3517,15 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
if (PageHWPoison(page + count))
goto skip;
- (*mmap_miss)++;
+ /*
+ * If there are too many folios that are recently evicted
+ * in a file, they will probably continue to be evicted.
+ * In such a situation, read-ahead is only a waste of IO.
+ * Don't decrease mmap_miss in this scenario to make sure
+ * we can stop read-ahead.
+ */
+ if (!folio_test_workingset(folio))
+ (*mmap_miss)++;
/*
* NOTE: If there're PTE markers, we'll leave them to be
@@ -3507,6 +3540,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
+ *rss += count;
folio_ref_add(folio, count);
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
@@ -3521,6 +3555,7 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
+ *rss += count;
folio_ref_add(folio, count);
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
@@ -3533,7 +3568,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned int *mmap_miss)
+ unsigned long *rss, unsigned int *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3541,7 +3576,9 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
if (PageHWPoison(page))
return ret;
- (*mmap_miss)++;
+ /* See comment of filemap_map_folio_range() */
+ if (!folio_test_workingset(folio))
+ (*mmap_miss)++;
/*
* NOTE: If there're PTE markers, we'll leave them to be
@@ -3555,6 +3592,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
ret = VM_FAULT_NOPAGE;
set_pte_range(vmf, folio, page, 1, addr);
+ (*rss)++;
folio_ref_inc(folio);
return ret;
@@ -3571,7 +3609,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct folio *folio;
vm_fault_t ret = 0;
- unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved;
+ unsigned long rss = 0;
+ unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
rcu_read_lock();
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
@@ -3590,6 +3629,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
folio_put(folio);
goto out;
}
+
+ folio_type = mm_counter_file(folio);
do {
unsigned long end;
@@ -3601,15 +3642,16 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
if (!folio_test_large(folio))
ret |= filemap_map_order0_folio(vmf,
- folio, addr, &mmap_miss);
+ folio, addr, &rss, &mmap_miss);
else
ret |= filemap_map_folio_range(vmf, folio,
xas.xa_index - folio->index, addr,
- nr_pages, &mmap_miss);
+ nr_pages, &rss, &mmap_miss);
folio_unlock(folio);
folio_put(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
+ add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
rcu_read_unlock();
@@ -3940,21 +3982,24 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
loff_t pos = iocb->ki_pos;
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
+ size_t chunk = mapping_max_folio_size(mapping);
long status = 0;
ssize_t written = 0;
do {
struct page *page;
- unsigned long offset; /* Offset into pagecache page */
- unsigned long bytes; /* Bytes to write to page */
+ struct folio *folio;
+ size_t offset; /* Offset into folio */
+ size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
void *fsdata = NULL;
- offset = (pos & (PAGE_SIZE - 1));
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_count(i));
+ bytes = iov_iter_count(i);
+retry:
+ offset = pos & (chunk - 1);
+ bytes = min(chunk - offset, bytes);
+ balance_dirty_pages_ratelimited(mapping);
-again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
@@ -3976,11 +4021,16 @@ again:
if (unlikely(status < 0))
break;
+ folio = page_folio(page);
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
+
if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
- copied = copy_page_from_iter_atomic(page, offset, bytes, i);
- flush_dcache_page(page);
+ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
+ flush_dcache_folio(folio);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
@@ -3998,14 +4048,16 @@ again:
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
- if (copied)
+ if (chunk > PAGE_SIZE)
+ chunk /= 2;
+ if (copied) {
bytes = copied;
- goto again;
+ goto retry;
+ }
+ } else {
+ pos += status;
+ written += status;
}
- pos += status;
- written += status;
-
- balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
if (!written)
@@ -4207,6 +4259,9 @@ static void filemap_cachestat(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, first_index);
struct folio *folio;
+ /* Flush stats (and potentially sleep) outside the RCU read section. */
+ mem_cgroup_flush_stats_ratelimited(NULL);
+
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
int order;
@@ -4270,7 +4325,7 @@ static void filemap_cachestat(struct address_space *mapping,
goto resched;
}
#endif
- if (workingset_test_recent(shadow, true, &workingset))
+ if (workingset_test_recent(shadow, true, &workingset, false))
cs->nr_recently_evicted += nr_pages;
goto resched;