Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c        8
-rw-r--r--  mm/damon/core-test.h  10
-rw-r--r--  mm/damon/core.c        1
-rw-r--r--  mm/hugetlb.c          75
-rw-r--r--  mm/ksm.c               2
-rw-r--r--  mm/memory-failure.c   31
-rw-r--r--  mm/memory.c           31
-rw-r--r--  mm/mempolicy.c        15
-rw-r--r--  mm/mlock.c             9
-rw-r--r--  mm/mmap.c              1
-rw-r--r--  mm/pagewalk.c          5
-rw-r--r--  mm/shmem.c             9
-rw-r--r--  mm/swapfile.c          8
-rw-r--r--  mm/zsmalloc.c         14
14 files changed, 143 insertions, 76 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index dbc9f86b1934..eacca2794e47 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -912,11 +912,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/*
* Check if the pageblock has already been marked skipped.
- * Only the aligned PFN is checked as the caller isolates
+ * Only the first PFN is checked as the caller isolates
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && pageblock_aligned(low_pfn)) {
+ if (!valid_page && (pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
folio = NULL;
@@ -2002,7 +2003,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (pageblock_aligned(low_pfn) &&
+ if ((pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
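
A note on the mm/compaction.c hunks above: a zone's first PFN is not guaranteed to be pageblock-aligned, so a scan that starts at cc->zone->zone_start_pfn would otherwise never consult the pageblock skip bit for that first, partial block. A minimal standalone sketch of the predicate being added (userspace C with a made-up pageblock size and PFNs, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9                       /* hypothetical: 512 pages per block */
#define PAGEBLOCK_NR    (1UL << PAGEBLOCK_ORDER)

static bool pageblock_aligned(unsigned long pfn)
{
        return (pfn & (PAGEBLOCK_NR - 1)) == 0;
}

/* Check the skip bit at a pageblock boundary, or at the very first PFN of the
 * zone, which may sit in the middle of a pageblock. */
static bool should_check_skip(unsigned long pfn, unsigned long zone_start_pfn)
{
        return pageblock_aligned(pfn) || pfn == zone_start_pfn;
}

int main(void)
{
        unsigned long zone_start_pfn = 0x23456; /* deliberately not block-aligned */

        printf("at zone start: %d\n", should_check_skip(zone_start_pfn, zone_start_pfn));
        printf("mid block:     %d\n", should_check_skip(zone_start_pfn + 1, zone_start_pfn));
        return 0;
}
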
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index c11210124344..bb07721909e1 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -320,25 +320,25 @@ static void damon_test_update_monitoring_result(struct kunit *test)
static void damon_test_set_attrs(struct kunit *test)
{
- struct damon_ctx ctx;
+ struct damon_ctx *c = damon_new_ctx();
struct damon_attrs valid_attrs = {
.min_nr_regions = 10, .max_nr_regions = 1000,
.sample_interval = 5000, .aggr_interval = 100000,};
struct damon_attrs invalid_attrs;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0);
invalid_attrs = valid_attrs;
invalid_attrs.min_nr_regions = 1;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
invalid_attrs = valid_attrs;
invalid_attrs.max_nr_regions = 9;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
invalid_attrs = valid_attrs;
invalid_attrs.aggr_interval = 4999;
- KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
}
static struct kunit_case damon_test_cases[] = {
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 91cff7f2997e..eb9580942a5c 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -273,6 +273,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
return NULL;
filter->type = type;
filter->matching = matching;
+ INIT_LIST_HEAD(&filter->list);
return filter;
}
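
The two damon changes share one theme: hand fully initialised objects to generic helpers. The core-test.h hunk switches the kunit case from an on-stack, never-initialised damon_ctx to one built by damon_new_ctx(), and the core.c hunk initialises the new filter's list field so later list manipulation cannot follow garbage pointers. A userspace sketch of that invariant (simplified list helpers, not the kernel's <linux/list.h>):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void init_list_head(struct list_head *h)
{
        h->next = h;
        h->prev = h;
}

static void list_del(struct list_head *e)
{
        e->next->prev = e->prev;        /* dereferences garbage if e was never initialised */
        e->prev->next = e->next;
}

int main(void)
{
        struct list_head filter_list;

        init_list_head(&filter_list);   /* what damos_new_filter() now does for filter->list */
        list_del(&filter_list);         /* harmless: the node only points at itself */
        printf("ok\n");
        return 0;
}
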
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64a3239b6407..6da626bfb52e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1579,9 +1579,37 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+ struct folio *folio)
+{
+ lockdep_assert_held(&hugetlb_lock);
+
+ /*
+ * Very subtle
+ *
+ * For non-gigantic pages set the destructor to the normal compound
+ * page dtor. This is needed in case someone takes an additional
+ * temporary ref to the page, and freeing is delayed until they drop
+ * their reference.
+ *
+ * For gigantic pages set the destructor to the null dtor. This
+ * destructor will never be called. Before freeing the gigantic
+ * page destroy_compound_gigantic_folio will turn the folio into a
+ * simple group of pages. After this the destructor does not
+ * apply.
+ *
+ */
+ if (hstate_is_gigantic(h))
+ folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+ else
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+
/*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page. Otherwise, wait until after allocating vmemmap
+ * to update dtor.
*
* A reference is held on the folio, except in the case of demote.
*
@@ -1612,31 +1640,19 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
}
/*
- * Very subtle
- *
- * For non-gigantic pages set the destructor to the normal compound
- * page dtor. This is needed in case someone takes an additional
- * temporary ref to the page, and freeing is delayed until they drop
- * their reference.
- *
- * For gigantic pages set the destructor to the null dtor. This
- * destructor will never be called. Before freeing the gigantic
- * page destroy_compound_gigantic_folio will turn the folio into a
- * simple group of pages. After this the destructor does not
- * apply.
- *
- * This handles the case where more than one ref is held when and
- * after update_and_free_hugetlb_folio is called.
- *
- * In the case of demote we do not ref count the page as it will soon
- * be turned into a page of smaller size.
+ * We can only clear the hugetlb destructor after allocating vmemmap
+ * pages. Otherwise, someone (memory error handling) may try to write
+ * to tail struct pages.
+ */
+ if (!folio_test_hugetlb_vmemmap_optimized(folio))
+ __clear_hugetlb_destructor(h, folio);
+
+ /*
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
*/
if (!demote)
folio_ref_unfreeze(folio, 1);
- if (hstate_is_gigantic(h))
- folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
- else
- folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
@@ -1705,6 +1721,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
{
int i;
struct page *subpage;
+ bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
@@ -1735,6 +1752,16 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
if (unlikely(folio_test_hwpoison(folio)))
folio_clear_hugetlb_hwpoison(folio);
+ /*
+ * If vmemmap pages were allocated above, then we need to clear the
+ * hugetlb destructor under the hugetlb lock.
+ */
+ if (clear_dtor) {
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
for (i = 0; i < pages_per_huge_page(h); i++) {
subpage = folio_page(folio, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
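
The mm/hugetlb.c change splits destructor clearing out of __remove_hugetlb_folio(): while a folio is still vmemmap-optimized its tail struct pages are not safely writable, so the destructor update is deferred until __update_and_free_hugetlb_folio() has restored the vmemmap, and is then redone under hugetlb_lock. A standalone sketch of that deferred, flag-gated pattern (illustrative types and names, not the kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct object {
        bool vmemmap_optimized;         /* tail metadata is not writable yet */
        bool dtor_cleared;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void clear_dtor(struct object *o)
{
        /* must only run once the tail metadata is writable again */
        o->dtor_cleared = true;
}

static void remove_object(struct object *o)
{
        pthread_mutex_lock(&lock);
        if (!o->vmemmap_optimized)
                clear_dtor(o);          /* safe to do immediately */
        pthread_mutex_unlock(&lock);
}

static void free_object(struct object *o)
{
        bool deferred = o->vmemmap_optimized;

        o->vmemmap_optimized = false;   /* stand-in for restoring the vmemmap pages */
        if (deferred) {                 /* finish the step skipped earlier, under the lock */
                pthread_mutex_lock(&lock);
                clear_dtor(o);
                pthread_mutex_unlock(&lock);
        }
}

int main(void)
{
        struct object o = { .vmemmap_optimized = true };

        remove_object(&o);
        free_object(&o);
        printf("dtor cleared: %d\n", o.dtor_cleared);
        return 0;
}
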
diff --git a/mm/ksm.c b/mm/ksm.c
index ba266359da55..d20d7662419b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2784,6 +2784,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
+ if (PageHWPoison(page))
+ return ERR_PTR(-EHWPOISON);
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */
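
The mm/ksm.c hunk makes ksm_might_need_to_copy() refuse to copy from a hwpoisoned page and report that with an encoded error pointer instead. A userspace sketch of the pattern (ERR_PTR/IS_ERR/PTR_ERR below are simplified stand-ins for the kernel helpers, and the page layout is made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EHWPOISON 133                   /* Linux value, defined here for the sketch */

static inline void *ERR_PTR(long err)       { return (void *)err; }
static inline bool  IS_ERR(const void *p)   { return (uintptr_t)p >= (uintptr_t)-4095; }
static inline long  PTR_ERR(const void *p)  { return (long)(intptr_t)p; }

struct page { bool hwpoison; char data[64]; };

static struct page *maybe_copy(struct page *src, struct page *dst)
{
        if (src->hwpoison)
                return ERR_PTR(-EHWPOISON);     /* never read from the poisoned page */
        *dst = *src;
        return dst;
}

int main(void)
{
        struct page bad = { .hwpoison = true }, copy;
        struct page *p = maybe_copy(&bad, &copy);

        if (IS_ERR(p))
                printf("copy refused: %ld\n", PTR_ERR(p));
        return 0;
}
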
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e245191e6b04..9a285038d765 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2466,7 +2466,7 @@ int unpoison_memory(unsigned long pfn)
{
struct folio *folio;
struct page *p;
- int ret = -EBUSY;
+ int ret = -EBUSY, ghp;
unsigned long count = 1;
bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -2487,7 +2487,7 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (!folio_test_hwpoison(folio)) {
+ if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
@@ -2499,6 +2499,13 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
+ if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+ goto unlock_mutex;
+
+ /*
+ * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+ * in folio_mapped() has to be done after folio_test_slab() is checked.
+ */
if (folio_mapped(folio)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
@@ -2511,32 +2518,28 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
- goto unlock_mutex;
-
- ret = get_hwpoison_page(p, MF_UNPOISON);
- if (!ret) {
+ ghp = get_hwpoison_page(p, MF_UNPOISON);
+ if (!ghp) {
if (PageHuge(p)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
- if (count == 0) {
- ret = -EBUSY;
+ if (count == 0)
goto unlock_mutex;
- }
}
ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
- } else if (ret < 0) {
- if (ret == -EHWPOISON) {
+ } else if (ghp < 0) {
+ if (ghp == -EHWPOISON) {
ret = put_page_back_buddy(p) ? 0 : -EBUSY;
- } else
+ } else {
+ ret = ghp;
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
pfn, &unpoison_rs);
+ }
} else {
if (PageHuge(p)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
if (count == 0) {
- ret = -EBUSY;
folio_put(folio);
goto unlock_mutex;
}
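
In the mm/memory-failure.c hunks, unpoison_memory() stops reusing 'ret' for the return value of get_hwpoison_page(): the helper's result now lives in 'ghp', so a real failure code is propagated instead of being overwritten, and the slab/page-table/reserved check moves ahead of folio_mapped() because _mapcount is overloaded for slab pages. A small sketch of the variable-separation point (illustrative helper and error codes, not the kernel function):

#include <stdio.h>

#define EBUSY 16
#define EIO    5

static int grab(int pfn)                /* pretend helper: fails for odd PFNs */
{
        return (pfn & 1) ? -EIO : 1;
}

static int unpoison(int pfn)
{
        int ret = -EBUSY;
        int ghp = grab(pfn);            /* kept separate from 'ret' */

        if (ghp < 0)
                ret = ghp;              /* propagate the real failure code */
        else
                ret = 0;
        return ret;
}

int main(void)
{
        printf("%d %d\n", unpoison(2), unpoison(3));
        return 0;
}
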
diff --git a/mm/memory.c b/mm/memory.c
index 01f39e8144ef..1ec1ef3418bf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5393,27 +5393,28 @@ retry:
if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
goto inval;
- /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
- if (!vma->anon_vma && !vma_is_tcp(vma))
- goto inval;
-
if (!vma_start_read(vma))
goto inval;
/*
+ * find_mergeable_anon_vma uses adjacent vmas which are not locked.
+ * This check must happen after vma_start_read(); otherwise, a
+ * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
+ * from its anon_vma.
+ */
+ if (unlikely(!vma->anon_vma && !vma_is_tcp(vma)))
+ goto inval_end_read;
+
+ /*
* Due to the possibility of userfault handler dropping mmap_lock, avoid
* it for now and fall back to page fault handling under mmap_lock.
*/
- if (userfaultfd_armed(vma)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (userfaultfd_armed(vma))
+ goto inval_end_read;
/* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
/* Check if the VMA got isolated after we found it */
if (vma->detached) {
@@ -5425,6 +5426,9 @@ retry:
rcu_read_unlock();
return vma;
+
+inval_end_read:
+ vma_end_read(vma);
inval:
rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
@@ -5701,6 +5705,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
/* Avoid triggering the temporary warning in __get_user_pages */
if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
return 0;
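
The lock_vma_under_rcu() rework in mm/memory.c enforces a lock-then-revalidate order: the anon_vma, userfaultfd and address-range checks all run only after vma_start_read() succeeds, and every failure path funnels through a single inval_end_read label that drops the lock. A standalone sketch of that order (a pthread rwlock stands in for the per-VMA lock; fields are simplified):

#include <pthread.h>
#include <stdio.h>

struct vma {
        pthread_rwlock_t lock;          /* stand-in for the per-VMA lock */
        void *anon_vma;                 /* may be torn down concurrently */
        unsigned long start, end;
};

static struct vma *lookup_locked(struct vma *v, unsigned long addr)
{
        if (pthread_rwlock_tryrdlock(&v->lock))
                return NULL;            /* contended: fall back to the mmap_lock path */

        /* Revalidate only *after* the lock is held: earlier snapshots may be stale. */
        if (!v->anon_vma || addr < v->start || addr >= v->end) {
                pthread_rwlock_unlock(&v->lock);
                return NULL;
        }
        return v;                       /* returned with the read lock held */
}

int main(void)
{
        struct vma v = { .anon_vma = &v, .start = 0x1000, .end = 0x2000 };
        struct vma *got;

        pthread_rwlock_init(&v.lock, NULL);
        got = lookup_locked(&v, 0x1800);
        printf("locked: %d\n", got != NULL);
        if (got)
                pthread_rwlock_unlock(&got->lock);
        return 0;
}
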
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index edc25195f5bd..c53f8beeb507 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -384,8 +384,10 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
- for_each_vma(vmi, vma)
+ for_each_vma(vmi, vma) {
+ vma_start_write(vma);
mpol_rebind_policy(vma->vm_policy, new);
+ }
mmap_write_unlock(mm);
}
@@ -768,6 +770,8 @@ static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *old;
struct mempolicy *new;
+ vma_assert_write_locked(vma);
+
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
@@ -1313,6 +1317,14 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
+ /*
+ * Lock the VMAs before scanning for pages to migrate, to ensure we don't
+ * miss a concurrently inserted page.
+ */
+ vma_iter_init(&vmi, mm, start);
+ for_each_vma_range(vmi, vma, end)
+ vma_start_write(vma);
+
ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
@@ -1538,6 +1550,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
break;
}
+ vma_start_write(vma);
new->home_node = home_node;
err = mbind_range(&vmi, vma, &prev, start, end, new);
mpol_put(new);
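
The mm/mempolicy.c hunks write-lock each affected VMA (vma_start_write()) before its policy is rebound, replaced, or given a new home node, and before queue_pages_range() scans for pages to migrate, so a page faulted in concurrently under the per-VMA lock cannot be missed. A rough sketch of the lock-before-scan idea (a pthread rwlock stands in for the per-VMA lock; this is not how mbind() is actually structured):

#include <pthread.h>
#include <stdio.h>

struct area {
        pthread_rwlock_t lock;          /* stand-in for the per-VMA lock */
        int nr_pages;
};

static void bind_range(struct area *a)
{
        pthread_rwlock_wrlock(&a->lock);        /* roughly what vma_start_write() provides */
        /* a queue_pages_range()-style scan now sees a stable view of the area */
        printf("scanned %d pages\n", a->nr_pages);
        pthread_rwlock_unlock(&a->lock);
}

int main(void)
{
        struct area a = { .nr_pages = 4 };

        pthread_rwlock_init(&a.lock, NULL);
        bind_range(&a);
        return 0;
}
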
diff --git a/mm/mlock.c b/mm/mlock.c
index d7db94519884..0a0c996c5c21 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -477,7 +477,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
{
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
- int error;
VMA_ITERATOR(vmi, current->mm, start);
VM_BUG_ON(offset_in_page(start));
@@ -498,6 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
nstart = start;
tmp = vma->vm_start;
for_each_vma_range(vmi, vma, end) {
+ int error;
vm_flags_t newflags;
if (vma->vm_start != tmp)
@@ -511,14 +511,15 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
tmp = end;
error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
if (error)
- break;
+ return error;
+ tmp = vma_iter_end(&vmi);
nstart = tmp;
}
- if (vma_iter_end(&vmi) < end)
+ if (tmp < end)
return -ENOMEM;
- return error;
+ return 0;
}
/*
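
The apply_vma_lock_flags() fix in mm/mlock.c returns a per-VMA error immediately and tracks coverage with tmp = vma_iter_end(&vmi), so the final check reports -ENOMEM only when the walked VMAs stop short of the requested end. A standalone sketch of that gap-detection loop (made-up range data, not the VMA iterator API):

#include <stdio.h>

#define ENOMEM 12

struct range { unsigned long start, end; };

static int apply(const struct range *r, int n, unsigned long start, unsigned long end)
{
        unsigned long tmp = start;

        for (int i = 0; i < n && r[i].start < end; i++) {
                if (r[i].start != tmp)
                        return -ENOMEM;         /* hole before this range */
                /* per-range work goes here; its error would be returned directly */
                tmp = r[i].end < end ? r[i].end : end;
        }
        return tmp < end ? -ENOMEM : 0;         /* hole after the last range walked */
}

int main(void)
{
        struct range contiguous[] = { { 0, 10 }, { 10, 25 } };
        struct range gappy[]      = { { 0, 10 }, { 12, 25 } };

        printf("%d %d\n", apply(contiguous, 2, 0, 20), apply(gappy, 2, 0, 20));
        return 0;
}
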
diff --git a/mm/mmap.c b/mm/mmap.c
index 3eda23c9ebe7..3937479d0e07 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -615,6 +615,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst,
* anon pages imported.
*/
if (src->anon_vma && !dst->anon_vma) {
+ vma_start_write(dst);
dst->anon_vma = src->anon_vma;
return anon_vma_clone(dst, src);
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 64437105fe0d..2022333805d3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -48,8 +48,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (walk->no_vma) {
/*
* pte_offset_map() might apply user-specific validation.
+ * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
+ * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
+ * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
*/
- if (walk->mm == &init_mm)
+ if (walk->mm == &init_mm || addr >= TASK_SIZE)
pte = pte_offset_kernel(pmd, addr);
else
pte = pte_offset_map(pmd, addr);
diff --git a/mm/shmem.c b/mm/shmem.c
index 2f2e0e618072..f5af4b943e42 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2796,7 +2796,8 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (*ppos >= i_size_read(inode))
break;
- error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+ SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
@@ -2805,7 +2806,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (folio) {
folio_unlock(folio);
- if (folio_test_hwpoison(folio)) {
+ if (folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) &&
+ folio_test_has_hwpoisoned(folio))) {
error = -EIO;
break;
}
@@ -2841,7 +2844,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
folio_put(folio);
folio = NULL;
} else {
- n = splice_zeropage_into_pipe(pipe, *ppos, len);
+ n = splice_zeropage_into_pipe(pipe, *ppos, part);
}
if (!n)
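
Besides the extra has_hwpoisoned check for large folios, the mm/shmem.c splice path now passes 'part' (the bytes that actually belong to the current page and precede EOF) rather than the caller's whole remaining 'len' to splice_zeropage_into_pipe(). A simplified model of that clamping (sample page size and offsets; not the exact shmem_file_splice_read() arithmetic):

#include <stdio.h>

#define PAGE_SIZE_DEMO 4096UL           /* sample page size */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long part_for(unsigned long pos, unsigned long len, unsigned long isize)
{
        /* bytes left in the current page, capped by the request ... */
        unsigned long part = min_ul(len, PAGE_SIZE_DEMO - (pos & (PAGE_SIZE_DEMO - 1)));

        /* ... and by the end of the file */
        return min_ul(part, isize - pos);
}

int main(void)
{
        /* 100 bytes into a page, 8 KiB requested, EOF 3000 bytes away */
        printf("part = %lu\n", part_for(4196, 8192, 7196));
        return 0;
}
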
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8e6dde68b389..b15112b1f1a8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1746,7 +1746,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte, old_pte;
- bool hwposioned = false;
+ bool hwpoisoned = PageHWPoison(page);
int ret = 1;
swapcache = page;
@@ -1754,7 +1754,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
else if (unlikely(PTR_ERR(page) == -EHWPOISON))
- hwposioned = true;
+ hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
@@ -1765,11 +1765,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
old_pte = ptep_get(pte);
- if (unlikely(hwposioned || !PageUptodate(page))) {
+ if (unlikely(hwpoisoned || !PageUptodate(page))) {
swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- if (hwposioned) {
+ if (hwpoisoned) {
swp_entry = make_hwpoison_entry(swapcache);
page = swapcache;
} else {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 3f057970504e..32916d28d9d9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1798,6 +1798,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
+ struct zs_pool *pool;
struct zspage *zspage;
/*
@@ -1807,9 +1808,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
- migrate_write_lock(zspage);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
inc_zspage_isolation(zspage);
- migrate_write_unlock(zspage);
+ spin_unlock(&pool->lock);
return true;
}
@@ -1875,12 +1877,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
kunmap_atomic(s_addr);
replace_sub_page(class, zspage, newpage, page);
+ dec_zspage_isolation(zspage);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release the pool's lock.
*/
spin_unlock(&pool->lock);
- dec_zspage_isolation(zspage);
migrate_write_unlock(zspage);
get_page(newpage);
@@ -1897,14 +1899,16 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
static void zs_page_putback(struct page *page)
{
+ struct zs_pool *pool;
struct zspage *zspage;
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
- migrate_write_lock(zspage);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
dec_zspage_isolation(zspage);
- migrate_write_unlock(zspage);
+ spin_unlock(&pool->lock);
}
static const struct movable_operations zsmalloc_mops = {
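
The mm/zsmalloc.c hunks switch zs_page_isolate() and zs_page_putback() from the per-zspage migrate write lock to the pool-wide spinlock, and zs_page_migrate() now decrements the isolation count before pool->lock is released, so the counter is only ever touched under that one lock. A standalone sketch of the locking rule (a pthread mutex and illustrative structs, not the zsmalloc types):

#include <pthread.h>
#include <stdio.h>

struct pool   { pthread_mutex_t lock; };
struct zspage { struct pool *pool; int isolated; };

static void page_isolate(struct zspage *z)
{
        pthread_mutex_lock(&z->pool->lock);
        z->isolated++;
        pthread_mutex_unlock(&z->pool->lock);
}

static void page_migrate_done(struct zspage *z)
{
        pthread_mutex_lock(&z->pool->lock);
        /* replace_sub_page()-style bookkeeping would happen here */
        z->isolated--;                  /* still holding pool->lock */
        pthread_mutex_unlock(&z->pool->lock);
}

int main(void)
{
        struct pool p;
        struct zspage z = { .pool = &p };

        pthread_mutex_init(&p.lock, NULL);
        page_isolate(&z);
        page_migrate_done(&z);
        printf("isolated: %d\n", z.isolated);
        return 0;
}
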