Diffstat (limited to 'mm')
-rw-r--r--	mm/damon/paddr.c	5
-rw-r--r--	mm/huge_memory.c	6
-rw-r--r--	mm/hugetlb.c	14
-rw-r--r--	mm/kfence/Makefile	2
-rw-r--r--	mm/kfence/core.c	42
-rw-r--r--	mm/ksm.c	11
-rw-r--r--	mm/memory.c	16
-rw-r--r--	mm/migrate.c	185
-rw-r--r--	mm/mincore.c	2
-rw-r--r--	mm/mmap.c	10
-rw-r--r--	mm/mprotect.c	2
-rw-r--r--	mm/page_alloc.c	3
-rw-r--r--	mm/slab.c	2
-rw-r--r--	mm/swapfile.c	3
-rw-r--r--	mm/vmalloc.c	36
15 files changed, 200 insertions, 139 deletions
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 6c655d9b5639..dd9c33fbe805 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -130,7 +130,6 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
 			accessed = false;
 		else
 			accessed = true;
-		folio_put(folio);
 		goto out;
 	}
 
@@ -144,10 +143,10 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
 
 	if (need_lock)
 		folio_unlock(folio);
-	folio_put(folio);
 
 out:
 	*folio_sz = folio_size(folio);
+	folio_put(folio);
 	return accessed;
 }
 
@@ -281,8 +280,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
 			folio_mark_accessed(folio);
 		else
 			folio_deactivate(folio);
-		folio_put(folio);
 		applied += folio_nr_pages(folio);
+		folio_put(folio);
 	}
 	return applied * PAGE_SIZE;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4fc43859e59a..032fb0ef9cd1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2037,7 +2037,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgtable_t pgtable;
-	pmd_t _pmd;
+	pmd_t _pmd, old_pmd;
 	int i;
 
 	/*
@@ -2048,7 +2048,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	 *
 	 * See Documentation/mm/mmu_notifier.rst
 	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
+	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
@@ -2057,6 +2057,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 		pte_t *pte, entry;
 		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
 		entry = pte_mkspecial(entry);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pte_mkuffd_wp(entry);
 		pte = pte_offset_map(&_pmd, haddr);
 		VM_BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, haddr, pte, entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..245038a9fe4e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		       struct folio *pagecache_folio, spinlock_t *ptl)
 {
 	const bool unshare = flags & FAULT_FLAG_UNSHARE;
-	pte_t pte;
+	pte_t pte = huge_ptep_get(ptep);
 	struct hstate *h = hstate_vma(vma);
 	struct page *old_page;
 	struct folio *new_folio;
@@ -5488,6 +5488,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 
 	/*
+	 * Never handle CoW for uffd-wp protected pages.  It should be only
+	 * handled when the uffd-wp protection is removed.
+	 *
+	 * Note that only the CoW optimization path (in hugetlb_no_page())
+	 * can trigger this, because hugetlb_fault() will always resolve
+	 * uffd-wp bit first.
+	 */
+	if (!unshare && huge_pte_uffd_wp(pte))
+		return 0;
+
+	/*
 	 * hugetlb does not support FOLL_FORCE-style write faults that keep the
 	 * PTE mapped R/O such as maybe_mkwrite() would do.
 	 */
@@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	}
 
-	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
 
 	delayacct_wpcopy_start();
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index 0bb95728a784..2de2a58d11a1 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -2,5 +2,5 @@
 
 obj-y := core.o report.o
 
-CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls
 obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5349c37a5dac..1065e0568d05 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void)
 	 * enters __slab_free() slow-path.
 	 */
 	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
-		struct slab *slab = page_slab(&pages[i]);
+		struct slab *slab = page_slab(nth_page(pages, i));
 
 		if (!i || (i % 2))
 			continue;
 
-		/* Verify we do not have a compound head page. */
-		if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
-			return addr;
-
 		__folio_set_slab(slab_folio(slab));
 #ifdef CONFIG_MEMCG
 		slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
@@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void)
 
 		/* Protect the right redzone. */
 		if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
-			return addr;
+			goto reset_slab;
 
 		addr += 2 * PAGE_SIZE;
 	}
 
 	return 0;
+
+reset_slab:
+	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+		struct slab *slab = page_slab(nth_page(pages, i));
+
+		if (!i || (i % 2))
+			continue;
+#ifdef CONFIG_MEMCG
+		slab->memcg_data = 0;
+#endif
+		__folio_clear_slab(slab_folio(slab));
+	}
+
+	return addr;
 }
 
 static bool __init kfence_init_pool_early(void)
@@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void)
 	 * fails for the first page, and therefore expect addr==__kfence_pool in
 	 * most failure cases.
 	 */
-	for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
-		struct slab *slab = virt_to_slab(p);
-
-		if (!slab)
-			continue;
-#ifdef CONFIG_MEMCG
-		slab->memcg_data = 0;
-#endif
-		__folio_clear_slab(slab_folio(slab));
-	}
 	memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
 	__kfence_pool = NULL;
 	return false;
@@ -726,10 +726,14 @@ static const struct seq_operations objects_sops = {
 };
 DEFINE_SEQ_ATTRIBUTE(objects);
 
-static int __init kfence_debugfs_init(void)
+static int kfence_debugfs_init(void)
 {
-	struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
+	struct dentry *kfence_dir;
+
+	if (!READ_ONCE(kfence_enabled))
+		return 0;
 
+	kfence_dir = debugfs_create_dir("kfence", NULL);
 	debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
 	debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
 	return 0;
@@ -883,6 +887,8 @@ static int kfence_init_late(void)
 	}
 
 	kfence_init_enable();
+	kfence_debugfs_init();
+
 	return 0;
 }
 
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -988,9 +988,15 @@ static int unmerge_and_remove_all_rmap_items(void)
 
 		mm = mm_slot->slot.mm;
 		mmap_read_lock(mm);
+
+		/*
+		 * Exit right away if mm is exiting to avoid lockdep issue in
+		 * the maple tree
+		 */
+		if (ksm_test_exit(mm))
+			goto mm_exiting;
+
 		for_each_vma(vmi, vma) {
-			if (ksm_test_exit(mm))
-				break;
 			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
 				continue;
 			err = unmerge_ksm_pages(vma,
@@ -999,6 +1005,7 @@ static int unmerge_and_remove_all_rmap_items(void)
 				goto error;
 		}
 
+mm_exiting:
 		remove_trailing_rmap_items(&mm_slot->rmap_list);
 		mmap_read_unlock(mm);
 
diff --git a/mm/memory.c b/mm/memory.c
index f456f3b5049c..01a23ad48a04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct mmu_notifier_range range;
 
-	if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
+	/*
+	 * We need a reference to lock the folio because we don't hold
+	 * the PTL so a racing thread can remove the device-exclusive
+	 * entry and unmap it. If the folio is free the entry must
+	 * have been removed already. If it happens to have already
+	 * been re-allocated after being freed all we do is lock and
+	 * unlock it.
+	 */
+	if (!folio_try_get(folio))
+		return 0;
+
+	if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+		folio_put(folio);
 		return VM_FAULT_RETRY;
+	}
 	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
 				vma->vm_mm, vmf->address & PAGE_MASK,
 				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
@@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	folio_unlock(folio);
+	folio_put(folio);
 
 	mmu_notifier_invalidate_range_end(&range);
 	return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 98f1c11197a8..db3f154446af 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1112,9 +1112,8 @@ static void migrate_folio_done(struct folio *src,
 /* Obtain the lock on page, remove all ptes. */
 static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
 			       unsigned long private, struct folio *src,
-			       struct folio **dstp, int force, bool avoid_force_lock,
-			       enum migrate_mode mode, enum migrate_reason reason,
-			       struct list_head *ret)
+			       struct folio **dstp, enum migrate_mode mode,
+			       enum migrate_reason reason, struct list_head *ret)
 {
 	struct folio *dst;
 	int rc = -EAGAIN;
@@ -1144,7 +1143,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
 	dst->private = NULL;
 
 	if (!folio_trylock(src)) {
-		if (!force || mode == MIGRATE_ASYNC)
+		if (mode == MIGRATE_ASYNC)
 			goto out;
 
 		/*
@@ -1163,17 +1162,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
 		if (current->flags & PF_MEMALLOC)
 			goto out;
 
-		/*
-		 * We have locked some folios and are going to wait to lock
-		 * this folio.  To avoid a potential deadlock, let's bail
-		 * out and not do that. The locked folios will be moved and
-		 * unlocked, then we can wait to lock this folio.
-		 */
-		if (avoid_force_lock) {
-			rc = -EDEADLOCK;
-			goto out;
-		}
-
 		folio_lock(src);
 	}
 	locked = true;
@@ -1193,8 +1181,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
 			rc = -EBUSY;
 			goto out;
 		}
-		if (!force)
-			goto out;
 		folio_wait_writeback(src);
 	}
 
@@ -1253,7 +1239,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
 		/* Establish migration ptes */
 		VM_BUG_ON_FOLIO(folio_test_anon(src) &&
 			       !folio_test_ksm(src) && !anon_vma, src);
-		try_to_migrate(src, TTU_BATCH_FLUSH);
+		try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
 		page_was_mapped = 1;
 	}
 
@@ -1267,7 +1253,7 @@ out:
 	 * A folio that has not been unmapped will be restored to
 	 * right list unless we want to retry.
 	 */
-	if (rc == -EAGAIN || rc == -EDEADLOCK)
+	if (rc == -EAGAIN)
 		ret = NULL;
 
 	migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
@@ -1508,6 +1494,9 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
 #define NR_MAX_BATCHED_MIGRATION	512
 #endif
 #define NR_MAX_MIGRATE_PAGES_RETRY	10
+#define NR_MAX_MIGRATE_ASYNC_RETRY	3
+#define NR_MAX_MIGRATE_SYNC_RETRY					\
+	(NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
 
 struct migrate_pages_stats {
 	int nr_succeeded;	/* Normal and large folios migrated successfully, in
@@ -1618,13 +1607,19 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
 /*
  * migrate_pages_batch() first unmaps folios in the from list as many as
  * possible, then move the unmapped folios.
+ *
+ * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a
+ * lock or bit when we have locked more than one folio.  Which may cause
+ * deadlock (e.g., for loop device).  So, if mode != MIGRATE_ASYNC, the
+ * length of the from list must be <= 1.
  */
 static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 		free_page_t put_new_page, unsigned long private,
 		enum migrate_mode mode, int reason, struct list_head *ret_folios,
-		struct migrate_pages_stats *stats)
+		struct list_head *split_folios, struct migrate_pages_stats *stats,
+		int nr_pass)
 {
-	int retry;
+	int retry = 1;
 	int large_retry = 1;
 	int thp_retry = 1;
 	int nr_failed = 0;
@@ -1634,21 +1629,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
 	bool is_large = false;
 	bool is_thp = false;
 	struct folio *folio, *folio2, *dst = NULL, *dst2;
-	int rc, rc_saved, nr_pages;
-	LIST_HEAD(split_folios);
+	int rc, rc_saved = 0, nr_pages;
 	LIST_HEAD(unmap_folios);
 	LIST_HEAD(dst_folios);
 	bool nosplit = (reason == MR_NUMA_MISPLACED);
-	bool no_split_folio_counting = false;
-	bool avoid_force_lock;
 
-retry:
-	rc_saved = 0;
-	avoid_force_lock = false;
-	retry = 1;
-	for (pass = 0;
-	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
-	     pass++) {
+	VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
+			!list_empty(from) && !list_is_singular(from));
+
+	for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
 		retry = 0;
 		large_retry = 0;
 		thp_retry = 0;
@@ -1679,7 +1668,7 @@ retry:
 			if (!thp_migration_supported() && is_thp) {
 				nr_large_failed++;
 				stats->nr_thp_failed++;
-				if (!try_split_folio(folio, &split_folios)) {
+				if (!try_split_folio(folio, split_folios)) {
 					stats->nr_thp_split++;
 					continue;
 				}
@@ -1689,15 +1678,13 @@ retry:
 			}
 
 			rc = migrate_folio_unmap(get_new_page, put_new_page, private,
-						 folio, &dst, pass > 2, avoid_force_lock,
-						 mode, reason, ret_folios);
+						 folio, &dst, mode, reason, ret_folios);
 			/*
 			 * The rules are:
 			 *	Success: folio will be freed
 			 *	Unmap: folio will be put on unmap_folios list,
 			 *	       dst folio put on dst_folios list
 			 *	-EAGAIN: stay on the from list
-			 *	-EDEADLOCK: stay on the from list
 			 *	-ENOMEM: stay on the from list
 			 *	Other errno: put on ret_folios list
 			 */
@@ -1712,7 +1699,7 @@ retry:
 					stats->nr_thp_failed += is_thp;
 					/* Large folio NUMA faulting doesn't split to retry. */
 					if (!nosplit) {
-						int ret = try_split_folio(folio, &split_folios);
+						int ret = try_split_folio(folio, split_folios);
 
 						if (!ret) {
 							stats->nr_thp_split += is_thp;
@@ -1729,18 +1716,11 @@ retry:
 							break;
 						}
 					}
-				} else if (!no_split_folio_counting) {
+				} else {
 					nr_failed++;
 				}
 
 				stats->nr_failed_pages += nr_pages + nr_retry_pages;
-				/*
-				 * There might be some split folios of fail-to-migrate large
-				 * folios left in split_folios list. Move them to ret_folios
-				 * list so that they could be put back to the right list by
-				 * the caller otherwise the folio refcnt will be leaked.
-				 */
-				list_splice_init(&split_folios, ret_folios);
 				/* nr_failed isn't updated for not used */
 				nr_large_failed += large_retry;
 				stats->nr_thp_failed += thp_retry;
@@ -1749,19 +1729,11 @@ retry:
 					goto out;
 				else
 					goto move;
-			case -EDEADLOCK:
-				/*
-				 * The folio cannot be locked for potential deadlock.
-				 * Go move (and unlock) all locked folios.  Then we can
-				 * try again.
-				 */
-				rc_saved = rc;
-				goto move;
 			case -EAGAIN:
 				if (is_large) {
 					large_retry++;
 					thp_retry += is_thp;
-				} else if (!no_split_folio_counting) {
+				} else {
 					retry++;
 				}
 				nr_retry_pages += nr_pages;
@@ -1771,11 +1743,6 @@ retry:
 				stats->nr_thp_succeeded += is_thp;
 				break;
 			case MIGRATEPAGE_UNMAP:
-				/*
-				 * We have locked some folios, don't force lock
-				 * to avoid deadlock.
-				 */
-				avoid_force_lock = true;
 				list_move_tail(&folio->lru, &unmap_folios);
 				list_add_tail(&dst->lru, &dst_folios);
 				break;
@@ -1789,7 +1756,7 @@ retry:
 				if (is_large) {
 					nr_large_failed++;
 					stats->nr_thp_failed += is_thp;
-				} else if (!no_split_folio_counting) {
+				} else {
 					nr_failed++;
 				}
 
@@ -1807,9 +1774,7 @@ move:
 	try_to_unmap_flush();
 
 	retry = 1;
-	for (pass = 0;
-	     pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
-	     pass++) {
+	for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
 		retry = 0;
 		large_retry = 0;
 		thp_retry = 0;
@@ -1838,7 +1803,7 @@ move:
 				if (is_large) {
 					large_retry++;
 					thp_retry += is_thp;
-				} else if (!no_split_folio_counting) {
+				} else {
 					retry++;
 				}
 				nr_retry_pages += nr_pages;
@@ -1851,7 +1816,7 @@ move:
 				if (is_large) {
 					nr_large_failed++;
 					stats->nr_thp_failed += is_thp;
-				} else if (!no_split_folio_counting) {
+				} else {
 					nr_failed++;
 				}
 
@@ -1888,30 +1853,52 @@ out:
 		dst2 = list_next_entry(dst, lru);
 	}
 
-	/*
-	 * Try to migrate split folios of fail-to-migrate large folios, no
-	 * nr_failed counting in this round, since all split folios of a
-	 * large folio is counted as 1 failure in the first round.
-	 */
-	if (rc >= 0 && !list_empty(&split_folios)) {
-		/*
-		 * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
-		 * retries) to ret_folios to avoid migrating them again.
-		 */
-		list_splice_init(from, ret_folios);
-		list_splice_init(&split_folios, from);
-		no_split_folio_counting = true;
-		goto retry;
-	}
+	return rc;
+}
 
+static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason, struct list_head *ret_folios,
+		struct list_head *split_folios, struct migrate_pages_stats *stats)
+{
+	int rc, nr_failed = 0;
+	LIST_HEAD(folios);
+	struct migrate_pages_stats astats;
+
+	memset(&astats, 0, sizeof(astats));
+	/* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
+	rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
+				 reason, &folios, split_folios, &astats,
+				 NR_MAX_MIGRATE_ASYNC_RETRY);
+	stats->nr_succeeded += astats.nr_succeeded;
+	stats->nr_thp_succeeded += astats.nr_thp_succeeded;
+	stats->nr_thp_split += astats.nr_thp_split;
+	if (rc < 0) {
+		stats->nr_failed_pages += astats.nr_failed_pages;
+		stats->nr_thp_failed += astats.nr_thp_failed;
+		list_splice_tail(&folios, ret_folios);
+		return rc;
+	}
+	stats->nr_thp_failed += astats.nr_thp_split;
+	nr_failed += astats.nr_thp_split;
 	/*
-	 * We have unlocked all locked folios, so we can force lock now, let's
-	 * try again.
+	 * Fall back to migrate all failed folios one by one synchronously. All
+	 * failed folios except split THPs will be retried, so their failure
+	 * isn't counted
 	 */
-	if (rc == -EDEADLOCK)
-		goto retry;
+	list_splice_tail_init(&folios, from);
+	while (!list_empty(from)) {
+		list_move(from->next, &folios);
+		rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
+					 private, mode, reason, ret_folios,
+					 split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
+		list_splice_tail_init(&folios, ret_folios);
+		if (rc < 0)
+			return rc;
+		nr_failed += rc;
+	}
 
-	return rc;
+	return nr_failed;
 }
 
 /*
@@ -1949,6 +1936,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 	struct folio *folio, *folio2;
 	LIST_HEAD(folios);
 	LIST_HEAD(ret_folios);
+	LIST_HEAD(split_folios);
 	struct migrate_pages_stats stats;
 
 	trace_mm_migrate_pages_start(mode, reason);
@@ -1959,6 +1947,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 				     mode, reason, &stats, &ret_folios);
 	if (rc_gather < 0)
 		goto out;
+
again:
 	nr_pages = 0;
 	list_for_each_entry_safe(folio, folio2, from, lru) {
@@ -1969,20 +1958,36 @@ again:
 		}
 		nr_pages += folio_nr_pages(folio);
-		if (nr_pages > NR_MAX_BATCHED_MIGRATION)
+		if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
 			break;
 	}
 
-	if (nr_pages > NR_MAX_BATCHED_MIGRATION)
-		list_cut_before(&folios, from, &folio->lru);
+	if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+		list_cut_before(&folios, from, &folio2->lru);
 	else
 		list_splice_init(from, &folios);
-	rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
-				 mode, reason, &ret_folios, &stats);
+	if (mode == MIGRATE_ASYNC)
+		rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
+					 mode, reason, &ret_folios, &split_folios, &stats,
+					 NR_MAX_MIGRATE_PAGES_RETRY);
+	else
+		rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
+					mode, reason, &ret_folios, &split_folios, &stats);
 	list_splice_tail_init(&folios, &ret_folios);
 	if (rc < 0) {
 		rc_gather = rc;
+		list_splice_tail(&split_folios, &ret_folios);
 		goto out;
 	}
+	if (!list_empty(&split_folios)) {
+		/*
+		 * Failure isn't counted since all split folios of a large folio
+		 * is counted as 1 failure already.  And, we only try to migrate
+		 * with minimal effort, force MIGRATE_ASYNC mode and retry once.
+		 */
+		migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
+				    MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
+		list_splice_tail_init(&split_folios, &ret_folios);
+	}
 	rc_gather += rc;
 	if (!list_empty(from))
 		goto again;
diff --git a/mm/mincore.c b/mm/mincore.c
index cd69b9db0081..d359650b0f75 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -33,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 	 * Hugepages under user process are always in RAM and never
 	 * swapped out, but theoretically it needs to be checked.
 	 */
-	present = pte && !huge_pte_none(huge_ptep_get(pte));
+	present = pte && !huge_pte_none_mostly(huge_ptep_get(pte));
 	for (; addr != end; vec++, addr += PAGE_SIZE)
 		*vec = present;
 	walk->private = vec;
diff --git a/mm/mmap.c b/mm/mmap.c
index 740b54be3ed4..ff68a67a2a7c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	int count = 0;
 	int error = -ENOMEM;
 	MA_STATE(mas_detach, &mt_detach, 0, 0);
-	mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+	mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
 	mt_set_external_lock(&mt_detach, &mm->mmap_lock);
 
 	/*
@@ -2621,12 +2621,7 @@ cannot_expand:
 
 	if (map_deny_write_exec(vma, vma->vm_flags)) {
 		error = -EACCES;
-		if (file)
-			goto close_and_free_vma;
-		else if (vma->vm_file)
-			goto unmap_and_free_vma;
-		else
-			goto free_vma;
+		goto close_and_free_vma;
 	}
 
 	/* Allow architectures to sanity-check the vm_flags */
@@ -3042,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm)
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 	mmap_write_lock(mm);
+	mt_clear_in_rcu(&mm->mm_mt);
 	free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
 		      USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 231929f119d9..13e84d8c0797 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -805,7 +805,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
 
 		if (map_deny_write_exec(vma, newflags)) {
 			error = -EACCES;
-			goto out;
+			break;
 		}
 
 		/* Allow architectures to sanity-check the new flags */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac1fc986af44..7136c36c5d01 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1398,6 +1398,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 			unsigned int order, bool check_free, fpi_t fpi_flags)
 {
 	int bad = 0;
+	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
 	bool init = want_init_on_free();
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1470,7 +1471,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	 * With hardware tag-based KASAN, memory tags must be set before the
 	 * page becomes unavailable via debug_pagealloc or arch_free_page.
 	 */
-	if (!should_skip_kasan_poison(page, fpi_flags)) {
+	if (!skip_kasan_poison) {
 		kasan_poison_pages(page, order, init);
 
 		/* Memory is already initialized if KASAN did it internally. */
diff --git a/mm/slab.c b/mm/slab.c
index dabc2a671fc6..edbe722fb906 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -839,7 +839,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
 	return 0;
 }
 
-#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
+#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
 /*
  * Allocates and initializes node for a node on each slab cache, used for
  * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62ba2bf577d7..2c718f45745f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
 {
 	int nid;
 
+	assert_spin_locked(&p->lock);
 	for_each_node(nid)
 		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
 }
@@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
-	del_from_avail_list(p);
 	spin_lock(&p->lock);
+	del_from_avail_list(p);
 	if (p->prio < 0) {
 		struct swap_info_struct *si = p;
 		int nid;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ef910bf349e1..a50072066221 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2883,6 +2883,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 		unsigned int order, unsigned int nr_pages, struct page **pages)
 {
 	unsigned int nr_allocated = 0;
+	gfp_t alloc_gfp = gfp;
+	bool nofail = false;
 	struct page *page;
 	int i;
 
@@ -2893,6 +2895,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 	 * more permissive.
 	 */
 	if (!order) {
+		/* bulk allocator doesn't support nofail req. officially */
 		gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
 
 		while (nr_allocated < nr_pages) {
@@ -2931,20 +2934,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			if (nr != nr_pages_request)
 				break;
 		}
+	} else if (gfp & __GFP_NOFAIL) {
+		/*
+		 * Higher order nofail allocations are really expensive and
+		 * potentially dangerous (pre-mature OOM, disruptive reclaim
+		 * and compaction etc.
+		 */
+		alloc_gfp &= ~__GFP_NOFAIL;
+		nofail = true;
 	}
 
 	/* High-order pages or fallback path if "bulk" fails. */
-
 	while (nr_allocated < nr_pages) {
 		if (fatal_signal_pending(current))
 			break;
 
 		if (nid == NUMA_NO_NODE)
-			page = alloc_pages(gfp, order);
+			page = alloc_pages(alloc_gfp, order);
 		else
-			page = alloc_pages_node(nid, gfp, order);
-		if (unlikely(!page))
-			break;
+			page = alloc_pages_node(nid, alloc_gfp, order);
+		if (unlikely(!page)) {
+			if (!nofail)
+				break;
+
+			/* fall back to the zero order allocations */
+			alloc_gfp |= __GFP_NOFAIL;
+			order = 0;
+			continue;
+		}
+
 		/*
 		 * Higher order allocations must be able to be treated as
 		 * indepdenent small pages by callers (as they can with
@@ -3024,9 +3042,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	 * allocation request, free them via vfree() if any.
 	 */
 	if (area->nr_pages != nr_small_pages) {
-		warn_alloc(gfp_mask, NULL,
-			"vmalloc error: size %lu, page order %u, failed to allocate pages",
-			area->nr_pages * PAGE_SIZE, page_order);
+		/* vm_area_alloc_pages() can also fail due to a fatal signal */
+		if (!fatal_signal_pending(current))
+			warn_alloc(gfp_mask, NULL,
+				"vmalloc error: size %lu, page order %u, failed to allocate pages",
+				area->nr_pages * PAGE_SIZE, page_order);
 		goto fail;
 	}