Diffstat (limited to 'mm')
-rw-r--r--   mm/compaction.c      |  8
-rw-r--r--   mm/damon/core-test.h | 10
-rw-r--r--   mm/damon/core.c      |  1
-rw-r--r--   mm/hugetlb.c         | 75
-rw-r--r--   mm/ksm.c             |  2
-rw-r--r--   mm/memory-failure.c  | 31
-rw-r--r--   mm/memory.c          | 31
-rw-r--r--   mm/mempolicy.c       | 15
-rw-r--r--   mm/mlock.c           |  9
-rw-r--r--   mm/mmap.c            |  1
-rw-r--r--   mm/pagewalk.c        |  5
-rw-r--r--   mm/shmem.c           |  9
-rw-r--r--   mm/swapfile.c        |  8
-rw-r--r--   mm/zsmalloc.c        | 14
14 files changed, 143 insertions, 76 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index dbc9f86b1934..eacca2794e47 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -912,11 +912,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		/*
 		 * Check if the pageblock has already been marked skipped.
-		 * Only the aligned PFN is checked as the caller isolates
+		 * Only the first PFN is checked as the caller isolates
 		 * COMPACT_CLUSTER_MAX at a time so the second call must
 		 * not falsely conclude that the block should be skipped.
 		 */
-		if (!valid_page && pageblock_aligned(low_pfn)) {
+		if (!valid_page && (pageblock_aligned(low_pfn) ||
+				    low_pfn == cc->zone->zone_start_pfn)) {
 			if (!isolation_suitable(cc, page)) {
 				low_pfn = end_pfn;
 				folio = NULL;
@@ -2002,7 +2003,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 		 * before making it "skip" so other compaction instances do
 		 * not scan the same block.
 		 */
-		if (pageblock_aligned(low_pfn) &&
+		if ((pageblock_aligned(low_pfn) ||
+		     low_pfn == cc->zone->zone_start_pfn) &&
 		    !fast_find_block && !isolation_suitable(cc, page))
 			continue;
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index c11210124344..bb07721909e1 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -320,25 +320,25 @@ static void damon_test_update_monitoring_result(struct kunit *test)
 static void damon_test_set_attrs(struct kunit *test)
 {
-	struct damon_ctx ctx;
+	struct damon_ctx *c = damon_new_ctx();
 	struct damon_attrs valid_attrs = {
 		.min_nr_regions = 10, .max_nr_regions = 1000,
 		.sample_interval = 5000, .aggr_interval = 100000,};
 	struct damon_attrs invalid_attrs;
-	KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0);
+	KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0);
 	invalid_attrs = valid_attrs;
 	invalid_attrs.min_nr_regions = 1;
-	KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+	KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
 	invalid_attrs = valid_attrs;
 	invalid_attrs.max_nr_regions = 9;
-	KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+	KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
 	invalid_attrs = valid_attrs;
 	invalid_attrs.aggr_interval = 4999;
-	KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+	KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL);
 }
 static struct kunit_case damon_test_cases[] = {
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 91cff7f2997e..eb9580942a5c 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -273,6 +273,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		return NULL;
 	filter->type = type;
 	filter->matching = matching;
+	INIT_LIST_HEAD(&filter->list);
 	return filter;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64a3239b6407..6da626bfb52e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1579,9 +1579,37 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
 						unsigned int order) { }
 #endif
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+						struct folio *folio)
+{
+	lockdep_assert_held(&hugetlb_lock);
+
+	/*
+	 * Very subtle
+	 *
+	 * For non-gigantic pages set the destructor to the normal compound
+	 * page dtor. This is needed in case someone takes an additional
+	 * temporary ref to the page, and freeing is delayed until they drop
+	 * their reference.
+	 *
+	 * For gigantic pages set the destructor to the null dtor. This
+	 * destructor will never be called. Before freeing the gigantic
+	 * page destroy_compound_gigantic_folio will turn the folio into a
+	 * simple group of pages. After this the destructor does not
+	 * apply.
+	 *
+	 */
+	if (hstate_is_gigantic(h))
+		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+	else
+		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+
 /*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page. Otherwise, wait until after allocating vmemmap
+ * to update dtor.
  *
  * A reference is held on the folio, except in the case of demote.
  *
@@ -1612,31 +1640,19 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
 	}
 	/*
-	 * Very subtle
-	 *
-	 * For non-gigantic pages set the destructor to the normal compound
-	 * page dtor. This is needed in case someone takes an additional
-	 * temporary ref to the page, and freeing is delayed until they drop
-	 * their reference.
-	 *
-	 * For gigantic pages set the destructor to the null dtor. This
-	 * destructor will never be called. Before freeing the gigantic
-	 * page destroy_compound_gigantic_folio will turn the folio into a
-	 * simple group of pages. After this the destructor does not
-	 * apply.
-	 *
-	 * This handles the case where more than one ref is held when and
-	 * after update_and_free_hugetlb_folio is called.
-	 *
-	 * In the case of demote we do not ref count the page as it will soon
-	 * be turned into a page of smaller size.
+	 * We can only clear the hugetlb destructor after allocating vmemmap
+	 * pages. Otherwise, someone (memory error handling) may try to write
+	 * to tail struct pages.
+	 */
+	if (!folio_test_hugetlb_vmemmap_optimized(folio))
+		__clear_hugetlb_destructor(h, folio);
+
+	/*
+	 * In the case of demote we do not ref count the page as it will soon
+	 * be turned into a page of smaller size.
 	 */
 	if (!demote)
 		folio_ref_unfreeze(folio, 1);
-	if (hstate_is_gigantic(h))
-		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
-	else
-		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[nid]--;
@@ -1705,6 +1721,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 {
 	int i;
 	struct page *subpage;
+	bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
@@ -1735,6 +1752,16 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 	if (unlikely(folio_test_hwpoison(folio)))
 		folio_clear_hugetlb_hwpoison(folio);
+	/*
+	 * If vmemmap pages were allocated above, then we need to clear the
+	 * hugetlb destructor under the hugetlb lock.
+	 */
+	if (clear_dtor) {
+		spin_lock_irq(&hugetlb_lock);
+		__clear_hugetlb_destructor(h, folio);
+		spin_unlock_irq(&hugetlb_lock);
+	}
+
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		subpage = folio_page(folio, i);
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2784,6 +2784,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
 			anon_vma->root == vma->anon_vma->root) {
 		return page;		/* still no need to copy it */
 	}
+	if (PageHWPoison(page))
+		return ERR_PTR(-EHWPOISON);
 	if (!PageUptodate(page))
 		return page;		/* let do_swap_page report the error */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e245191e6b04..9a285038d765 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2466,7 +2466,7 @@ int unpoison_memory(unsigned long pfn)
 {
 	struct folio *folio;
 	struct page *p;
-	int ret = -EBUSY;
+	int ret = -EBUSY, ghp;
 	unsigned long count = 1;
 	bool huge = false;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -2487,7 +2487,7 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
-	if (!folio_test_hwpoison(folio)) {
+	if (!PageHWPoison(p)) {
 		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
 				 pfn, &unpoison_rs);
 		goto unlock_mutex;
@@ -2499,6 +2499,13 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
+	if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+		goto unlock_mutex;
+
+	/*
+	 * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+	 * in folio_mapped() has to be done after folio_test_slab() is checked.
+	 */
 	if (folio_mapped(folio)) {
 		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
@@ -2511,32 +2518,28 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
-	if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
-		goto unlock_mutex;
-
-	ret = get_hwpoison_page(p, MF_UNPOISON);
-	if (!ret) {
+	ghp = get_hwpoison_page(p, MF_UNPOISON);
+	if (!ghp) {
 		if (PageHuge(p)) {
 			huge = true;
 			count = folio_free_raw_hwp(folio, false);
-			if (count == 0) {
-				ret = -EBUSY;
+			if (count == 0)
 				goto unlock_mutex;
-			}
 		}
 		ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
-	} else if (ret < 0) {
-		if (ret == -EHWPOISON) {
+	} else if (ghp < 0) {
+		if (ghp == -EHWPOISON) {
 			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
-		} else
+		} else {
+			ret = ghp;
 			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
 					 pfn, &unpoison_rs);
+		}
 	} else {
 		if (PageHuge(p)) {
 			huge = true;
 			count = folio_free_raw_hwp(folio, false);
 			if (count == 0) {
-				ret = -EBUSY;
 				folio_put(folio);
 				goto unlock_mutex;
 			}
diff --git a/mm/memory.c b/mm/memory.c
index 01f39e8144ef..1ec1ef3418bf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5393,27 +5393,28 @@ retry:
 	if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
 		goto inval;
-	/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
-	if (!vma->anon_vma && !vma_is_tcp(vma))
-		goto inval;
-
 	if (!vma_start_read(vma))
 		goto inval;
 	/*
+	 * find_mergeable_anon_vma uses adjacent vmas which are not locked.
+	 * This check must happen after vma_start_read(); otherwise, a
+	 * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
+	 * from its anon_vma.
+	 */
+	if (unlikely(!vma->anon_vma && !vma_is_tcp(vma)))
+		goto inval_end_read;
+
+	/*
 	 * Due to the possibility of userfault handler dropping mmap_lock, avoid
 	 * it for now and fall back to page fault handling under mmap_lock.
 	 */
-	if (userfaultfd_armed(vma)) {
-		vma_end_read(vma);
-		goto inval;
-	}
+	if (userfaultfd_armed(vma))
+		goto inval_end_read;
 	/* Check since vm_start/vm_end might change before we lock the VMA */
-	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-		vma_end_read(vma);
-		goto inval;
-	}
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		goto inval_end_read;
 	/* Check if the VMA got isolated after we found it */
 	if (vma->detached) {
@@ -5425,6 +5426,9 @@ retry:
 	rcu_read_unlock();
 	return vma;
+
+inval_end_read:
+	vma_end_read(vma);
 inval:
 	rcu_read_unlock();
 	count_vm_vma_lock_event(VMA_LOCK_ABORT);
@@ -5701,6 +5705,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
 	if (mmap_read_lock_killable(mm))
 		return 0;
+	/* Untag the address before looking up the VMA */
+	addr = untagged_addr_remote(mm, addr);
+
 	/* Avoid triggering the temporary warning in __get_user_pages */
 	if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
 		return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index edc25195f5bd..c53f8beeb507 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -384,8 +384,10 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 	VMA_ITERATOR(vmi, mm, 0);
 	mmap_write_lock(mm);
-	for_each_vma(vmi, vma)
+	for_each_vma(vmi, vma) {
+		vma_start_write(vma);
 		mpol_rebind_policy(vma->vm_policy, new);
+	}
 	mmap_write_unlock(mm);
 }
@@ -768,6 +770,8 @@ static int vma_replace_policy(struct vm_area_struct *vma,
 	struct mempolicy *old;
 	struct mempolicy *new;
+	vma_assert_write_locked(vma);
+
 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
@@ -1313,6 +1317,14 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
+	/*
+	 * Lock the VMAs before scanning for pages to migrate, to ensure we don't
+	 * miss a concurrently inserted page.
+	 */
+	vma_iter_init(&vmi, mm, start);
+	for_each_vma_range(vmi, vma, end)
+		vma_start_write(vma);
+
 	ret = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
@@ -1538,6 +1550,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
 			break;
 		}
+		vma_start_write(vma);
 		new->home_node = home_node;
 		err = mbind_range(&vmi, vma, &prev, start, end, new);
 		mpol_put(new);
diff --git a/mm/mlock.c b/mm/mlock.c
index d7db94519884..0a0c996c5c21 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -477,7 +477,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
 {
 	unsigned long nstart, end, tmp;
 	struct vm_area_struct *vma, *prev;
-	int error;
 	VMA_ITERATOR(vmi, current->mm, start);
 	VM_BUG_ON(offset_in_page(start));
@@ -498,6 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
 	nstart = start;
 	tmp = vma->vm_start;
 	for_each_vma_range(vmi, vma, end) {
+		int error;
 		vm_flags_t newflags;
 		if (vma->vm_start != tmp)
@@ -511,14 +511,15 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
 			tmp = end;
 		error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
 		if (error)
-			break;
+			return error;
+		tmp = vma_iter_end(&vmi);
 		nstart = tmp;
 	}
-	if (vma_iter_end(&vmi) < end)
+	if (tmp < end)
 		return -ENOMEM;
-	return error;
+	return 0;
 }
 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index 3eda23c9ebe7..3937479d0e07 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -615,6 +615,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst,
 	 * anon pages imported.
 	 */
 	if (src->anon_vma && !dst->anon_vma) {
+		vma_start_write(dst);
 		dst->anon_vma = src->anon_vma;
 		return anon_vma_clone(dst, src);
 	}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 64437105fe0d..2022333805d3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -48,8 +48,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (walk->no_vma) {
 		/*
 		 * pte_offset_map() might apply user-specific validation.
+		 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
+		 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
+		 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
 		 */
-		if (walk->mm == &init_mm)
+		if (walk->mm == &init_mm || addr >= TASK_SIZE)
 			pte = pte_offset_kernel(pmd, addr);
 		else
 			pte = pte_offset_map(pmd, addr);
diff --git a/mm/shmem.c b/mm/shmem.c
index 2f2e0e618072..f5af4b943e42 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2796,7 +2796,8 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		if (*ppos >= i_size_read(inode))
 			break;
-		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+					SGP_READ);
 		if (error) {
 			if (error == -EINVAL)
 				error = 0;
@@ -2805,7 +2806,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		if (folio) {
 			folio_unlock(folio);
-			if (folio_test_hwpoison(folio)) {
+			if (folio_test_hwpoison(folio) ||
+			    (folio_test_large(folio) &&
+			     folio_test_has_hwpoisoned(folio))) {
 				error = -EIO;
 				break;
 			}
@@ -2841,7 +2844,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 			folio_put(folio);
 			folio = NULL;
 		} else {
-			n = splice_zeropage_into_pipe(pipe, *ppos, len);
+			n = splice_zeropage_into_pipe(pipe, *ppos, part);
 		}
 		if (!n)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8e6dde68b389..b15112b1f1a8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1746,7 +1746,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *swapcache;
 	spinlock_t *ptl;
 	pte_t *pte, new_pte, old_pte;
-	bool hwposioned = false;
+	bool hwpoisoned = PageHWPoison(page);
 	int ret = 1;
 	swapcache = page;
@@ -1754,7 +1754,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	if (unlikely(!page))
 		return -ENOMEM;
 	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
-		hwposioned = true;
+		hwpoisoned = true;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
@@ -1765,11 +1765,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	old_pte = ptep_get(pte);
-	if (unlikely(hwposioned || !PageUptodate(page))) {
+	if (unlikely(hwpoisoned || !PageUptodate(page))) {
 		swp_entry_t swp_entry;
 		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-		if (hwposioned) {
+		if (hwpoisoned) {
 			swp_entry = make_hwpoison_entry(swapcache);
 			page = swapcache;
 		} else {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 3f057970504e..32916d28d9d9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1798,6 +1798,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 {
+	struct zs_pool *pool;
 	struct zspage *zspage;
 	/*
@@ -1807,9 +1808,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 	VM_BUG_ON_PAGE(PageIsolated(page), page);
 	zspage = get_zspage(page);
-	migrate_write_lock(zspage);
+	pool = zspage->pool;
+	spin_lock(&pool->lock);
 	inc_zspage_isolation(zspage);
-	migrate_write_unlock(zspage);
+	spin_unlock(&pool->lock);
 	return true;
 }
@@ -1875,12 +1877,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	kunmap_atomic(s_addr);
 	replace_sub_page(class, zspage, newpage, page);
+	dec_zspage_isolation(zspage);
 	/*
 	 * Since we complete the data copy and set up new zspage structure,
 	 * it's okay to release the pool's lock.
 	 */
 	spin_unlock(&pool->lock);
-	dec_zspage_isolation(zspage);
 	migrate_write_unlock(zspage);
 	get_page(newpage);
@@ -1897,14 +1899,16 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 static void zs_page_putback(struct page *page)
 {
+	struct zs_pool *pool;
 	struct zspage *zspage;
 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
 	zspage = get_zspage(page);
-	migrate_write_lock(zspage);
+	pool = zspage->pool;
+	spin_lock(&pool->lock);
 	dec_zspage_isolation(zspage);
-	migrate_write_unlock(zspage);
+	spin_unlock(&pool->lock);
 }
 static const struct movable_operations zsmalloc_mops = {