Diffstat (limited to 'mm/migrate.c')
| -rw-r--r-- | mm/migrate.c | 406 |
1 file changed, 263 insertions, 143 deletions
diff --git a/mm/migrate.c b/mm/migrate.c index cf25b00f03c8..18ce840914f0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@  #include <linux/ptrace.h>  #include <linux/oom.h>  #include <linux/memory.h> +#include <linux/random.h>  #include <asm/tlbflush.h> @@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,  			pte = pte_mkhuge(pte);  			pte = arch_make_huge_pte(pte, shift, vma->vm_flags); -			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);  			if (PageAnon(new))  				hugepage_add_anon_rmap(new, vma, pvmw.address);  			else  				page_dup_rmap(new, true); +			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);  		} else  #endif  		{ -			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); -  			if (PageAnon(new))  				page_add_anon_rmap(new, vma, pvmw.address, false);  			else  				page_add_file_rmap(new, false); +			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);  		}  		if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))  			mlock_vma_page(new); @@ -291,7 +291,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,  {  	pte_t pte;  	swp_entry_t entry; -	struct page *page; +	struct folio *folio;  	spin_lock(ptl);  	pte = *ptep; @@ -302,18 +302,17 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,  	if (!is_migration_entry(entry))  		goto out; -	page = pfn_swap_entry_to_page(entry); -	page = compound_head(page); +	folio = page_folio(pfn_swap_entry_to_page(entry));  	/*  	 * Once page cache replacement of page migration started, page_count -	 * is zero; but we must not call put_and_wait_on_page_locked() without -	 * a ref. Use get_page_unless_zero(), and just fault again if it fails. +	 * is zero; but we must not call folio_put_wait_locked() without +	 * a ref. Use folio_try_get(), and just fault again if it fails.  	 */ -	if (!get_page_unless_zero(page)) +	if (!folio_try_get(folio))  		goto out;  	pte_unmap_unlock(ptep, ptl); -	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); +	folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);  	return;  out:  	pte_unmap_unlock(ptep, ptl); @@ -338,16 +337,16 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,  void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)  {  	spinlock_t *ptl; -	struct page *page; +	struct folio *folio;  	ptl = pmd_lock(mm, pmd);  	if (!is_pmd_migration_entry(*pmd))  		goto unlock; -	page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); -	if (!get_page_unless_zero(page)) +	folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); +	if (!folio_try_get(folio))  		goto unlock;  	spin_unlock(ptl); -	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); +	folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);  	return;  unlock:  	spin_unlock(ptl); @@ -434,14 +433,6 @@ int folio_migrate_mapping(struct address_space *mapping,  	}  	xas_store(&xas, newfolio); -	if (nr > 1) { -		int i; - -		for (i = 1; i < nr; i++) { -			xas_next(&xas); -			xas_store(&xas, newfolio); -		} -	}  	/*  	 * Drop cache reference from old page by unfreezing @@ -1093,80 +1084,6 @@ out:  	return rc;  } - -/* - * node_demotion[] example: - * - * Consider a system with two sockets.  Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node.  The - * CPUs are placed in the node with the "fast" memory.  
The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - *	Socket A: 0, 1, 2 - *	Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1.  When Node 1 fills up, it should be migrated to - * Node 2.  The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - *	0 -> 1 -> 2 -> stop - *	3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - *	{  1, // Node 0 migrates to 1 - *	   2, // Node 1 migrates to 2 - *	  -1, // Node 2 does not migrate - *	   4, // Node 3 migrates to 4 - *	   5, // Node 4 migrates to 5 - *	  -1} // Node 5 does not migrate - */ - -/* - * Writes to this array occur without locking.  Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -static int node_demotion[MAX_NUMNODES] __read_mostly = -	{[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE}; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ -int next_demotion_node(int node) -{ -	int target; - -	/* -	 * node_demotion[] is updated without excluding this -	 * function from running.  RCU doesn't provide any -	 * compiler barriers, so the READ_ONCE() is required -	 * to avoid compiler reordering or read merging. -	 * -	 * Make sure to use RCU over entire code blocks if -	 * node_demotion[] reads need to be consistent. -	 */ -	rcu_read_lock(); -	target = READ_ONCE(node_demotion[node]); -	rcu_read_unlock(); - -	return target; -} -  /*   * Obtain the lock on page, remove all ptes and migrate the page   * to the newly allocated page in newpage. @@ -1422,7 +1339,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,   * @mode:		The migration mode that specifies the constraints for   *			page migration, if any.   * @reason:		The reason for page migration. - * @ret_succeeded:	Set to the number of pages migrated successfully if + * @ret_succeeded:	Set to the number of normal pages migrated successfully if   *			the caller passes a non-NULL pointer.   *   * The function returns after 10 attempts or if no pages are movable any more @@ -1430,7 +1347,9 @@ static inline int try_split_thp(struct page *page, struct page **page2,   * It is caller's responsibility to call putback_movable_pages() to return pages   * to the LRU or free list only if ret != 0.   * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully.   
*/  int migrate_pages(struct list_head *from, new_page_t get_new_page,  		free_page_t put_new_page, unsigned long private, @@ -1439,6 +1358,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,  	int retry = 1;  	int thp_retry = 1;  	int nr_failed = 0; +	int nr_failed_pages = 0;  	int nr_succeeded = 0;  	int nr_thp_succeeded = 0;  	int nr_thp_failed = 0; @@ -1450,13 +1370,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,  	int swapwrite = current->flags & PF_SWAPWRITE;  	int rc, nr_subpages;  	LIST_HEAD(ret_pages); +	LIST_HEAD(thp_split_pages);  	bool nosplit = (reason == MR_NUMA_MISPLACED); +	bool no_subpage_counting = false;  	trace_mm_migrate_pages_start(mode, reason);  	if (!swapwrite)  		current->flags |= PF_SWAPWRITE; +thp_subpage_migration:  	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {  		retry = 0;  		thp_retry = 0; @@ -1469,7 +1392,7 @@ retry:  			 * during migration.  			 */  			is_thp = PageTransHuge(page) && !PageHuge(page); -			nr_subpages = thp_nr_pages(page); +			nr_subpages = compound_nr(page);  			cond_resched();  			if (PageHuge(page)) @@ -1505,18 +1428,20 @@ retry:  			case -ENOSYS:  				/* THP migration is unsupported */  				if (is_thp) { -					if (!try_split_thp(page, &page2, from)) { +					nr_thp_failed++; +					if (!try_split_thp(page, &page2, &thp_split_pages)) {  						nr_thp_split++;  						goto retry;  					} -					nr_thp_failed++; -					nr_failed += nr_subpages; +					nr_failed_pages += nr_subpages;  					break;  				}  				/* Hugetlb migration is unsupported */ -				nr_failed++; +				if (!no_subpage_counting) +					nr_failed++; +				nr_failed_pages += nr_subpages;  				break;  			case -ENOMEM:  				/* @@ -1525,16 +1450,19 @@ retry:  				 * THP NUMA faulting doesn't split THP to retry.  				 */  				if (is_thp && !nosplit) { -					if (!try_split_thp(page, &page2, from)) { +					nr_thp_failed++; +					if (!try_split_thp(page, &page2, &thp_split_pages)) {  						nr_thp_split++;  						goto retry;  					} -					nr_thp_failed++; -					nr_failed += nr_subpages; +					nr_failed_pages += nr_subpages;  					goto out;  				} -				nr_failed++; + +				if (!no_subpage_counting) +					nr_failed++; +				nr_failed_pages += nr_subpages;  				goto out;  			case -EAGAIN:  				if (is_thp) { @@ -1544,12 +1472,11 @@ retry:  				retry++;  				break;  			case MIGRATEPAGE_SUCCESS: +				nr_succeeded += nr_subpages;  				if (is_thp) {  					nr_thp_succeeded++; -					nr_succeeded += nr_subpages;  					break;  				} -				nr_succeeded++;  				break;  			default:  				/* @@ -1560,17 +1487,37 @@ retry:  				 */  				if (is_thp) {  					nr_thp_failed++; -					nr_failed += nr_subpages; +					nr_failed_pages += nr_subpages;  					break;  				} -				nr_failed++; + +				if (!no_subpage_counting) +					nr_failed++; +				nr_failed_pages += nr_subpages;  				break;  			}  		}  	} -	nr_failed += retry + thp_retry; +	nr_failed += retry;  	nr_thp_failed += thp_retry; -	rc = nr_failed; +	/* +	 * Try to migrate subpages of fail-to-migrate THPs, no nr_failed +	 * counting in this round, since all subpages of a THP is counted +	 * as 1 failure in the first round. +	 */ +	if (!list_empty(&thp_split_pages)) { +		/* +		 * Move non-migrated pages (after 10 retries) to ret_pages +		 * to avoid migrating them again. 
+		 */ +		list_splice_init(from, &ret_pages); +		list_splice_init(&thp_split_pages, from); +		no_subpage_counting = true; +		retry = 1; +		goto thp_subpage_migration; +	} + +	rc = nr_failed + nr_thp_failed;  out:  	/*  	 * Put the permanent failure page back to migration list, they @@ -1579,11 +1526,11 @@ out:  	list_splice(&ret_pages, from);  	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); -	count_vm_events(PGMIGRATE_FAIL, nr_failed); +	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);  	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);  	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);  	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); -	trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, +	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,  			       nr_thp_failed, nr_thp_split, mode, reason);  	if (!swapwrite) @@ -2525,8 +2472,7 @@ static bool migrate_vma_check_page(struct page *page)  static void migrate_vma_unmap(struct migrate_vma *migrate)  {  	const unsigned long npages = migrate->npages; -	const unsigned long start = migrate->start; -	unsigned long addr, i, restore = 0; +	unsigned long i, restore = 0;  	bool allow_drain = true;  	lru_add_drain(); @@ -2572,7 +2518,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)  		}  	} -	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { +	for (i = 0; i < npages && restore; i++) {  		struct page *page = migrate_pfn_to_page(migrate->src[i]);  		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) @@ -2970,14 +2916,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate)  EXPORT_SYMBOL(migrate_vma_finalize);  #endif /* CONFIG_DEVICE_PRIVATE */ +/* + * node_demotion[] example: + * + * Consider a system with two sockets.  Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node.  The + * CPUs are placed in the node with the "fast" memory.  The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + *	Socket A: 0, 1, 2 + *	Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1.  When Node 1 fills up, it should be migrated to + * Node 2.  The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + *	0 -> 1 -> 2 -> stop + *	3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + *	{  nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + *	{  nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + *	{  nr=0, nodes[0]=-1 }, // Node 2 does not migrate + *	{  nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + *	{  nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + *	{  nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + *	0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + *	{ nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + *	{ nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + *	{ nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + +/* + * Writes to this array occur without locking.  
Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES	(MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES	DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { +	unsigned short nr; +	short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ +	struct demotion_nodes *nd; +	unsigned short target_nr, index; +	int target; + +	if (!node_demotion) +		return NUMA_NO_NODE; + +	nd = &node_demotion[node]; + +	/* +	 * node_demotion[] is updated without excluding this +	 * function from running.  RCU doesn't provide any +	 * compiler barriers, so the READ_ONCE() is required +	 * to avoid compiler reordering or read merging. +	 * +	 * Make sure to use RCU over entire code blocks if +	 * node_demotion[] reads need to be consistent. +	 */ +	rcu_read_lock(); +	target_nr = READ_ONCE(nd->nr); + +	switch (target_nr) { +	case 0: +		target = NUMA_NO_NODE; +		goto out; +	case 1: +		index = 0; +		break; +	default: +		/* +		 * If there are multiple target nodes, just select one +		 * target node randomly. +		 * +		 * In addition, we can also use round-robin to select +		 * target node, but we should introduce another variable +		 * for node_demotion[] to record last selected target node, +		 * that may cause cache ping-pong due to the changing of +		 * last target node. Or introducing per-cpu data to avoid +		 * caching issue, which seems more complicated. So selecting +		 * target node randomly seems better until now. +		 */ +		index = get_random_int() % target_nr; +		break; +	} + +	target = READ_ONCE(nd->nodes[index]); + +out: +	rcu_read_unlock(); +	return target; +} +  #if defined(CONFIG_HOTPLUG_CPU)  /* Disable reclaim-based migration. */  static void __disable_all_migrate_targets(void)  { -	int node; +	int node, i; + +	if (!node_demotion) +		return; -	for_each_online_node(node) -		node_demotion[node] = NUMA_NO_NODE; +	for_each_online_node(node) { +		node_demotion[node].nr = 0; +		for (i = 0; i < DEMOTION_TARGET_NODES; i++) +			node_demotion[node].nodes[i] = NUMA_NO_NODE; +	}  }  static void disable_all_migrate_targets(void) @@ -3004,26 +3088,40 @@ static void disable_all_migrate_targets(void)   * Failing here is OK.  It might just indicate   * being at the end of a chain.   */ -static int establish_migrate_target(int node, nodemask_t *used) +static int establish_migrate_target(int node, nodemask_t *used, +				    int best_distance)  { -	int migration_target; +	int migration_target, index, val; +	struct demotion_nodes *nd; -	/* -	 * Can not set a migration target on a -	 * node with it already set. -	 * -	 * No need for READ_ONCE() here since this -	 * in the write path for node_demotion[]. -	 * This should be the only thread writing. 
-	 */ -	if (node_demotion[node] != NUMA_NO_NODE) +	if (!node_demotion)  		return NUMA_NO_NODE; +	nd = &node_demotion[node]; +  	migration_target = find_next_best_node(node, used);  	if (migration_target == NUMA_NO_NODE)  		return NUMA_NO_NODE; -	node_demotion[node] = migration_target; +	/* +	 * If the node has been set a migration target node before, +	 * which means it's the best distance between them. Still +	 * check if this node can be demoted to other target nodes +	 * if they have a same best distance. +	 */ +	if (best_distance != -1) { +		val = node_distance(node, migration_target); +		if (val > best_distance) +			return NUMA_NO_NODE; +	} + +	index = nd->nr; +	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, +		      "Exceeds maximum demotion target nodes\n")) +		return NUMA_NO_NODE; + +	nd->nodes[index] = migration_target; +	nd->nr++;  	return migration_target;  } @@ -3039,7 +3137,9 @@ static int establish_migrate_target(int node, nodemask_t *used)   *   * The difference here is that cycles must be avoided.  If   * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node.   *   * This function can run simultaneously with readers of   * node_demotion[].  However, it can not run simultaneously @@ -3051,7 +3151,7 @@ static void __set_migration_target_nodes(void)  	nodemask_t next_pass	= NODE_MASK_NONE;  	nodemask_t this_pass	= NODE_MASK_NONE;  	nodemask_t used_targets = NODE_MASK_NONE; -	int node; +	int node, best_distance;  	/*  	 * Avoid any oddities like cycles that could occur @@ -3080,18 +3180,33 @@ again:  	 * multiple source nodes to share a destination.  	 */  	nodes_or(used_targets, used_targets, this_pass); -	for_each_node_mask(node, this_pass) { -		int target_node = establish_migrate_target(node, &used_targets); -		if (target_node == NUMA_NO_NODE) -			continue; +	for_each_node_mask(node, this_pass) { +		best_distance = -1;  		/* -		 * Visit targets from this pass in the next pass. -		 * Eventually, every node will have been part of -		 * a pass, and will become set in 'used_targets'. +		 * Try to set up the migration path for the node, and the target +		 * migration nodes can be multiple, so doing a loop to find all +		 * the target nodes if they all have a best node distance.  		 */ -		node_set(target_node, next_pass); +		do { +			int target_node = +				establish_migrate_target(node, &used_targets, +							 best_distance); + +			if (target_node == NUMA_NO_NODE) +				break; + +			if (best_distance == -1) +				best_distance = node_distance(node, target_node); + +			/* +			 * Visit targets from this pass in the next pass. +			 * Eventually, every node will have been part of +			 * a pass, and will become set in 'used_targets'. +			 */ +			node_set(target_node, next_pass); +		} while (1);  	}  	/*  	 * 'next_pass' contains nodes which became migration @@ -3192,6 +3307,11 @@ static int __init migrate_on_reclaim_init(void)  {  	int ret; +	node_demotion = kmalloc_array(nr_node_ids, +				      sizeof(struct demotion_nodes), +				      GFP_KERNEL); +	WARN_ON(!node_demotion); +  	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",  					NULL, migration_offline_cpu);  	/*  |
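
The migrate_pages() hunks above change what gets counted where: nr_failed now counts failed units (a normal page, a THP, or a hugetlb page each count once), the new nr_failed_pages feeds PGMIGRATE_FAIL with base pages, and subpages of a split THP are retried in a second round with no_subpage_counting set so they do not inflate nr_failed again. A standalone model of that bookkeeping is sketched below; struct fake_page, its fields, and the sample outcomes are invented for illustration and are not kernel code.

```c
/* Userspace model of the migrate_pages() accounting after the patch.
 * "struct fake_page" and the outcomes below are made up for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

struct fake_page {
	int nr_subpages;   /* 1 for a normal page, 512 for a 2MB THP */
	bool is_thp;
	bool migrated;     /* outcome of the (fake) migration attempt */
	bool split;        /* THP was split; subpages retried separately */
};

int main(void)
{
	struct fake_page pages[] = {
		{ .nr_subpages = 1,   .is_thp = false, .migrated = true  },
		{ .nr_subpages = 512, .is_thp = true,  .migrated = false,
		  .split = true },	/* split: subpages may still migrate */
		{ .nr_subpages = 1,   .is_thp = false, .migrated = false },
	};
	int nr_failed = 0;        /* failed units: pages, THPs, hugetlb    */
	int nr_failed_pages = 0;  /* failed base pages (PGMIGRATE_FAIL)    */
	int nr_succeeded = 0;     /* migrated base pages (PGMIGRATE_SUCCESS) */
	int nr_thp_failed = 0, nr_thp_split = 0;

	for (unsigned i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
		struct fake_page *p = &pages[i];

		if (p->migrated) {
			nr_succeeded += p->nr_subpages;
			continue;
		}
		if (p->is_thp) {
			/* A split THP still counts as one failed THP ... */
			nr_thp_failed++;
			if (p->split) {
				nr_thp_split++;
				/* ... and its subpages go through a second
				 * round with no_subpage_counting set, so
				 * they never bump nr_failed again. */
				continue;
			}
			nr_failed_pages += p->nr_subpages;
			continue;
		}
		nr_failed++;
		nr_failed_pages += p->nr_subpages;
	}

	/* Return value after the patch: failed units, not base pages. */
	printf("rc=%d succeeded=%d failed_pages=%d thp_failed=%d thp_split=%d\n",
	       nr_failed + nr_thp_failed, nr_succeeded, nr_failed_pages,
	       nr_thp_failed, nr_thp_split);
	return 0;
}
```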
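
The node_demotion[] rework above turns the single per-node target into a small struct demotion_nodes holding up to DEMOTION_TARGET_NODES equally preferred targets, and next_demotion_node() picks one of them at random. The following is a compilable userspace sketch of that lookup, with a made-up two-target topology, rand() standing in for get_random_int(), and the kernel's RCU/READ_ONCE protection omitted.

```c
/* Userspace model of the multi-target demotion lookup added above.
 * The topology (node 0 demotes to nodes 1 and 2) is a made-up example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX_NUMNODES		8
#define DEMOTION_TARGET_NODES	15
#define NUMA_NO_NODE		(-1)

struct demotion_nodes {
	unsigned short nr;                   /* number of valid targets */
	short nodes[DEMOTION_TARGET_NODES];  /* equally distant targets */
};

static struct demotion_nodes node_demotion[MAX_NUMNODES];

static int next_demotion_node(int node)
{
	struct demotion_nodes *nd = &node_demotion[node];
	unsigned short index;

	if (!nd->nr)
		return NUMA_NO_NODE;

	/* One target: take it.  Several targets: pick one at random,
	 * mirroring the get_random_int() % target_nr in the patch. */
	index = (nd->nr == 1) ? 0 : (unsigned short)(rand() % nd->nr);
	return nd->nodes[index];
}

int main(void)
{
	srand((unsigned)time(NULL));

	/* Node 0: fast memory, demotes to two equally slow nodes 1 and 2. */
	node_demotion[0].nr = 2;
	node_demotion[0].nodes[0] = 1;
	node_demotion[0].nodes[1] = 2;
	/* Nodes 1 and 2 are terminal (nr == 0). */

	for (int i = 0; i < 5; i++)
		printf("demote from node 0 -> node %d\n", next_demotion_node(0));
	printf("demote from node 1 -> %d (NUMA_NO_NODE)\n", next_demotion_node(1));
	return 0;
}
```

As the comment in the patch explains, random selection was preferred over round-robin because round-robin would need extra per-node or per-CPU state for the last chosen target.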
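
establish_migrate_target() now takes a best_distance argument, and __set_migration_target_nodes() loops until the next candidate is farther away than the first one found, so a source node can pick up every target at the same shortest distance. Below is a small model of that loop under an invented three-node distance matrix; find_next_best_node() is approximated by a linear nearest-unused scan, which is only a rough stand-in for the kernel helper.

```c
/* Model of the "collect all targets at the best distance" loop added to
 * __set_migration_target_nodes().  The distance matrix is fabricated:
 * node 0 is fast memory, nodes 1 and 2 are equally distant slow memory.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_NODES	3
#define NUMA_NO_NODE	(-1)

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20 },
	{ 20, 10, 30 },
	{ 20, 30, 10 },
};

/* Crude stand-in for find_next_best_node(): nearest node not yet used.
 * Like the kernel helper, it marks the returned node as used. */
static int find_next_best_node(int node, bool used[NR_NODES])
{
	int best = NUMA_NO_NODE;

	for (int n = 0; n < NR_NODES; n++) {
		if (used[n])
			continue;
		if (best == NUMA_NO_NODE ||
		    node_distance[node][n] < node_distance[node][best])
			best = n;
	}
	if (best != NUMA_NO_NODE)
		used[best] = true;
	return best;
}

int main(void)
{
	bool used[NR_NODES] = { [0] = true };	/* source node is not a target */
	int node = 0, best_distance = -1;

	do {
		int target = find_next_best_node(node, used);

		if (target == NUMA_NO_NODE)
			break;
		/* Only accept targets no farther than the first (best) one. */
		if (best_distance != -1 &&
		    node_distance[node][target] > best_distance)
			break;
		if (best_distance == -1)
			best_distance = node_distance[node][target];
		printf("node %d demotes to node %d (distance %d)\n",
		       node, target, node_distance[node][target]);
	} while (1);

	return 0;
}
```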