Diffstat (limited to 'mm/vmscan.c')
| -rw-r--r-- | mm/vmscan.c | 1088 |
1 file changed, 695 insertions, 393 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..9c1c5e8b24b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,8 @@  #include <linux/ctype.h>  #include <linux/debugfs.h>  #include <linux/khugepaged.h> +#include <linux/rculist_nulls.h> +#include <linux/random.h>  #include <asm/tlbflush.h>  #include <asm/div64.h> @@ -135,12 +137,6 @@ struct scan_control {  	/* Always discard instead of demoting to lower tier memory */  	unsigned int no_demotion:1; -#ifdef CONFIG_LRU_GEN -	/* help kswapd make better choices among multiple memcgs */ -	unsigned int memcgs_need_aging:1; -	unsigned long last_reclaimed; -#endif -  	/* Allocation order */  	s8 order; @@ -449,6 +445,11 @@ static bool cgroup_reclaim(struct scan_control *sc)  	return sc->target_mem_cgroup;  } +static bool global_reclaim(struct scan_control *sc) +{ +	return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); +} +  /**   * writeback_throttling_sane - is the usual dirty throttling mechanism available?   * @sc: scan_control in question @@ -499,6 +500,11 @@ static bool cgroup_reclaim(struct scan_control *sc)  	return false;  } +static bool global_reclaim(struct scan_control *sc) +{ +	return true; +} +  static bool writeback_throttling_sane(struct scan_control *sc)  {  	return true; @@ -741,6 +747,8 @@ EXPORT_SYMBOL(register_shrinker);   */  void unregister_shrinker(struct shrinker *shrinker)  { +	struct dentry *debugfs_entry; +  	if (!(shrinker->flags & SHRINKER_REGISTERED))  		return; @@ -749,9 +757,11 @@ void unregister_shrinker(struct shrinker *shrinker)  	shrinker->flags &= ~SHRINKER_REGISTERED;  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)  		unregister_memcg_shrinker(shrinker); -	shrinker_debugfs_remove(shrinker); +	debugfs_entry = shrinker_debugfs_remove(shrinker);  	up_write(&shrinker_rwsem); +	debugfs_remove_recursive(debugfs_entry); +  	kfree(shrinker->nr_deferred);  	shrinker->nr_deferred = NULL;  } @@ -905,7 +915,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,  		}  		/* Call non-slab shrinkers even though kmem is disabled */ -		if (!memcg_kmem_enabled() && +		if (!memcg_kmem_online() &&  		    !(shrinker->flags & SHRINKER_NONSLAB))  			continue; @@ -1920,7 +1930,7 @@ retry:  			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {  				/*  				 * Immediately reclaim when written back. -				 * Similar in principle to deactivate_page() +				 * Similar in principle to folio_deactivate()  				 * except we already have the folio isolated  				 * and know it's dirty  				 */ @@ -2327,12 +2337,12 @@ move:   * (2) The lru_lock must not be held.   * (3) Interrupts must be enabled.   * - * Return: 0 if the folio was removed from an LRU list. - * -EBUSY if the folio was not on an LRU list. + * Return: true if the folio was removed from an LRU list. + * false if the folio was not on an LRU list.   
*/ -int folio_isolate_lru(struct folio *folio) +bool folio_isolate_lru(struct folio *folio)  { -	int ret = -EBUSY; +	bool ret = false;  	VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); @@ -2343,7 +2353,7 @@ int folio_isolate_lru(struct folio *folio)  		lruvec = folio_lruvec_lock_irq(folio);  		lruvec_del_folio(lruvec, folio);  		unlock_page_lruvec_irq(lruvec); -		ret = 0; +		ret = true;  	}  	return ret; @@ -3176,6 +3186,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);  		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\  			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define get_memcg_gen(seq)	((seq) % MEMCG_NR_GENS) +#define get_memcg_bin(bin)	((bin) % MEMCG_NR_BINS) +  static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)  {  	struct pglist_data *pgdat = NODE_DATA(nid); @@ -3201,6 +3214,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);  	struct pglist_data *pgdat = lruvec_pgdat(lruvec); +	if (!sc->may_swap) +		return 0; +  	if (!can_demote(pgdat->node_id, sc) &&  	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)  		return 0; @@ -3215,13 +3231,105 @@ static int get_nr_gens(struct lruvec *lruvec, int type)  static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)  { -	/* see the comment on lru_gen_struct */ +	/* see the comment on lru_gen_folio */  	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&  	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&  	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;  }  /****************************************************************************** + *                          Bloom filters + ******************************************************************************/ + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + *    freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + *    of inserted entries in the other filter, to reduce the memory overhead on + *    small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. 
+ */ +#define BLOOM_FILTER_SHIFT	15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ +	return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ +	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + +	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + +	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); +	key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ +	int key[2]; +	unsigned long *filter; +	int gen = filter_gen_from_seq(seq); + +	filter = READ_ONCE(lruvec->mm_state.filters[gen]); +	if (!filter) +		return true; + +	get_item_key(item, key); + +	return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ +	int key[2]; +	unsigned long *filter; +	int gen = filter_gen_from_seq(seq); + +	filter = READ_ONCE(lruvec->mm_state.filters[gen]); +	if (!filter) +		return; + +	get_item_key(item, key); + +	if (!test_bit(key[0], filter)) +		set_bit(key[0], filter); +	if (!test_bit(key[1], filter)) +		set_bit(key[1], filter); +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ +	unsigned long *filter; +	int gen = filter_gen_from_seq(seq); + +	filter = lruvec->mm_state.filters[gen]; +	if (filter) { +		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); +		return; +	} + +	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), +			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); +	WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + +/******************************************************************************   *                          mm_struct list   ******************************************************************************/ @@ -3323,13 +3431,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm)  	if (mem_cgroup_disabled())  		return; +	/* migration can happen before addition */ +	if (!mm->lru_gen.memcg) +		return; +  	rcu_read_lock();  	memcg = mem_cgroup_from_task(task);  	rcu_read_unlock();  	if (memcg == mm->lru_gen.memcg)  		return; -	VM_WARN_ON_ONCE(!mm->lru_gen.memcg);  	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));  	lru_gen_del_mm(mm); @@ -3337,94 +3448,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm)  }  #endif -/* - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of - * bits in a bitmap, k is the number of hash functions and n is the number of - * inserted items. - * - * Page table walkers use one of the two filters to reduce their search space. - * To get rid of non-leaf entries that no longer have enough leaf entries, the - * aging uses the double-buffering technique to flip to the other filter each - * time it produces a new generation. For non-leaf entries that have enough - * leaf entries, the aging carries them over to the next generation in - * walk_pmd_range(); the eviction also report them when walking the rmap - * in lru_gen_look_around(). - * - * For future optimizations: - * 1. It's not necessary to keep both filters all the time. The spare one can be - *    freed after the RCU grace period and reallocated if needed again. - * 2. And when reallocating, it's worth scaling its size according to the number - *    of inserted entries in the other filter, to reduce the memory overhead on - *    small systems and false positives on large systems. - * 3. Jenkins' hash function is an alternative to Knuth's. 
- */ -#define BLOOM_FILTER_SHIFT	15 - -static inline int filter_gen_from_seq(unsigned long seq) -{ -	return seq % NR_BLOOM_FILTERS; -} - -static void get_item_key(void *item, int *key) -{ -	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); - -	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); - -	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -	key[1] = hash >> BLOOM_FILTER_SHIFT; -} - -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -{ -	unsigned long *filter; -	int gen = filter_gen_from_seq(seq); - -	filter = lruvec->mm_state.filters[gen]; -	if (filter) { -		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -		return; -	} - -	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), -			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -	WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -} - -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ -	int key[2]; -	unsigned long *filter; -	int gen = filter_gen_from_seq(seq); - -	filter = READ_ONCE(lruvec->mm_state.filters[gen]); -	if (!filter) -		return; - -	get_item_key(item, key); - -	if (!test_bit(key[0], filter)) -		set_bit(key[0], filter); -	if (!test_bit(key[1], filter)) -		set_bit(key[1], filter); -} - -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ -	int key[2]; -	unsigned long *filter; -	int gen = filter_gen_from_seq(seq); - -	filter = READ_ONCE(lruvec->mm_state.filters[gen]); -	if (!filter) -		return true; - -	get_item_key(item, key); - -	return test_bit(key[0], filter) && test_bit(key[1], filter); -} -  static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)  {  	int i; @@ -3612,7 +3635,7 @@ struct ctrl_pos {  static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,  			  struct ctrl_pos *pos)  { -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	int hist = lru_hist_from_seq(lrugen->min_seq[type]);  	pos->refaulted = lrugen->avg_refaulted[type][tier] + @@ -3627,7 +3650,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,  static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)  {  	int hist, tier; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;  	unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; @@ -3704,7 +3727,7 @@ static int folio_update_gen(struct folio *folio, int gen)  static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)  {  	int type = folio_is_file_lru(folio); -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);  	unsigned long new_flags, old_flags = READ_ONCE(folio->flags); @@ -3749,7 +3772,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,  static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)  {  	int gen, type, zone; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	walk->batched = 0; @@ -3782,7 +3805,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal  	if (is_vm_hugetlb_page(vma))  		return true; -	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) +	if (!vma_has_recency(vma)) +		return true; + +	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))  		return true;  	if (vma == get_gate_vma(vma->vm_mm)) @@ -3977,8 +4003,8 @@ restart:  }  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -				  struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, +				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)  {  	int i;  	pmd_t *pmd; @@ -3991,18 +4017,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area  	VM_WARN_ON_ONCE(pud_leaf(*pud));  	/* try to batch at most 1+MIN_LRU_BATCH+1 entries */ -	if (*start == -1) { -		*start = next; +	if (*first == -1) { +		*first = addr; +		bitmap_zero(bitmap, MIN_LRU_BATCH);  		return;  	} -	i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); +	i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);  	if (i && i <= MIN_LRU_BATCH) {  		__set_bit(i - 1, bitmap);  		return;  	} -	pmd = pmd_offset(pud, *start); +	pmd = pmd_offset(pud, *first);  	ptl = pmd_lockptr(args->mm, pmd);  	if (!spin_trylock(ptl)) @@ -4013,15 +4040,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area  	do {  		unsigned long pfn;  		struct folio *folio; -		unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + +		/* don't round down the first address */ +		addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first;  		pfn = get_pmd_pfn(pmd[i], vma, addr);  		if (pfn == -1)  			goto next;  		if (!pmd_trans_huge(pmd[i])) { -			if (arch_has_hw_nonleaf_pmd_young() && -			    get_cap(LRU_GEN_NONLEAF_YOUNG)) +			if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))  				pmdp_test_and_clear_young(vma, addr, pmd + i);  			goto next;  		} @@ -4050,12 +4078,11 @@ next:  	arch_leave_lazy_mmu_mode();  	spin_unlock(ptl);  done: -	*start = -1; -	bitmap_zero(bitmap, MIN_LRU_BATCH); +	*first = -1;  }  #else -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -				  struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, +				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)  {  }  #endif @@ -4068,9 +4095,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,  	unsigned long next;  	unsigned long addr;  	struct vm_area_struct *vma; -	unsigned long pos = -1; +	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; +	unsigned long first = -1;  	struct lru_gen_mm_walk *walk = args->private; -	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};  	VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4109,18 +4136,17 @@ restart:  			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))  				continue; -			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); +			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);  			continue;  		}  #endif  		walk->mm_stats[MM_NONLEAF_TOTAL]++; -		if (arch_has_hw_nonleaf_pmd_young() && -		    get_cap(LRU_GEN_NONLEAF_YOUNG)) { +		if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) {  			if (!pmd_young(val))  				continue; -			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); +			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);  		}  		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) @@ -4137,7 +4163,7 @@ restart:  		update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);  	} -	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); +	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);  	if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))  		goto restart; @@ -4227,7 +4253,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_  	} while (err == -EAGAIN);  } -static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)  {  	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; @@ -4235,7 +4261,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)  		VM_WARN_ON_ONCE(walk);  		walk = &pgdat->mm_walk; -	} else if (!pgdat && !walk) { +	} else if (!walk && force_alloc) {  		VM_WARN_ON_ONCE(current_is_kswapd());  		walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -4263,7 +4289,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)  {  	int zone;  	int remaining = MAX_LRU_BATCH; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);  	if (type == LRU_GEN_ANON && !can_swap) @@ -4271,7 +4297,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)  	/* prevent cold/hot inversion if 
force_scan is true */  	for (zone = 0; zone < MAX_NR_ZONES; zone++) { -		struct list_head *head = &lrugen->lists[old_gen][type][zone]; +		struct list_head *head = &lrugen->folios[old_gen][type][zone];  		while (!list_empty(head)) {  			struct folio *folio = lru_to_folio(head); @@ -4282,7 +4308,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)  			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);  			new_gen = folio_inc_gen(lruvec, folio, false); -			list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); +			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);  			if (!--remaining)  				return false; @@ -4299,7 +4325,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)  {  	int gen, type, zone;  	bool success = false; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	DEFINE_MIN_SEQ(lruvec);  	VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -4310,7 +4336,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)  			gen = lru_gen_from_seq(min_seq[type]);  			for (zone = 0; zone < MAX_NR_ZONES; zone++) { -				if (!list_empty(&lrugen->lists[gen][type][zone])) +				if (!list_empty(&lrugen->folios[gen][type][zone]))  					goto next;  			} @@ -4320,7 +4346,7 @@ next:  		;  	} -	/* see the comment on lru_gen_struct */ +	/* see the comment on lru_gen_folio */  	if (can_swap) {  		min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);  		min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); @@ -4342,7 +4368,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)  {  	int prev, next;  	int type, zone; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	spin_lock_irq(&lruvec->lru_lock); @@ -4400,7 +4426,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,  	bool success;  	struct lru_gen_mm_walk *walk;  	struct mm_struct *mm = NULL; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); @@ -4416,12 +4442,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,  	 * handful of PTEs. Spreading the work out over a period of time usually  	 * is less efficient, but it avoids bursty page faults.  	 
*/ -	if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { +	if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {  		success = iterate_mm_list_nowalk(lruvec, max_seq);  		goto done;  	} -	walk = set_mm_walk(NULL); +	walk = set_mm_walk(NULL, true);  	if (!walk) {  		success = iterate_mm_list_nowalk(lruvec, max_seq);  		goto done; @@ -4444,8 +4470,7 @@ done:  		if (sc->priority <= DEF_PRIORITY - 2)  			wait_event_killable(lruvec->mm_state.wait,  					    max_seq < READ_ONCE(lrugen->max_seq)); - -		return max_seq < READ_ONCE(lrugen->max_seq); +		return false;  	}  	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); @@ -4458,97 +4483,56 @@ done:  	return true;  } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, -			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +/****************************************************************************** + *                          working set protection + ******************************************************************************/ + +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)  {  	int gen, type, zone; -	unsigned long old = 0; -	unsigned long young = 0;  	unsigned long total = 0; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	bool can_swap = get_swappiness(lruvec, sc); +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	struct mem_cgroup *memcg = lruvec_memcg(lruvec); +	DEFINE_MAX_SEQ(lruvec); +	DEFINE_MIN_SEQ(lruvec);  	for (type = !can_swap; type < ANON_AND_FILE; type++) {  		unsigned long seq;  		for (seq = min_seq[type]; seq <= max_seq; seq++) { -			unsigned long size = 0; -  			gen = lru_gen_from_seq(seq);  			for (zone = 0; zone < MAX_NR_ZONES; zone++) -				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - -			total += size; -			if (seq == max_seq) -				young += size; -			else if (seq + MIN_NR_GENS == max_seq) -				old += size; +				total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);  		}  	} -	/* try to scrape all its memory if this memcg was deleted */ -	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; - -	/* -	 * The aging tries to be lazy to reduce the overhead, while the eviction -	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the -	 * ideal number of generations is MIN_NR_GENS+1. -	 */ -	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) -		return true; -	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) -		return false; - -	/* -	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) -	 * of the total number of pages for each generation. A reasonable range -	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The -	 * aging cares about the upper bound of hot pages, while the eviction -	 * cares about the lower bound of cold pages. -	 */ -	if (young * MIN_NR_GENS > total) -		return true; -	if (old * (MIN_NR_GENS + 2) < total) -		return true; - -	return false; +	/* whether the size is big enough to be helpful */ +	return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total;  } -static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, +				  unsigned long min_ttl)  { -	bool need_aging; -	unsigned long nr_to_scan; -	int swappiness = get_swappiness(lruvec, sc); +	int gen; +	unsigned long birth;  	struct mem_cgroup *memcg = lruvec_memcg(lruvec); -	DEFINE_MAX_SEQ(lruvec);  	DEFINE_MIN_SEQ(lruvec); -	VM_WARN_ON_ONCE(sc->memcg_low_reclaim); +	/* see the comment on lru_gen_folio */ +	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); +	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); -	mem_cgroup_calculate_protection(NULL, memcg); - -	if (mem_cgroup_below_min(NULL, memcg)) +	if (time_is_after_jiffies(birth + min_ttl))  		return false; -	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); - -	if (min_ttl) { -		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); -		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - -		if (time_is_after_jiffies(birth + min_ttl)) -			return false; - -		/* the size is likely too small to be helpful */ -		if (!nr_to_scan && sc->priority != DEF_PRIORITY) -			return false; -	} +	if (!lruvec_is_sizable(lruvec, sc)) +		return false; -	if (need_aging) -		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); +	mem_cgroup_calculate_protection(NULL, memcg); -	return true; +	return !mem_cgroup_below_min(NULL, memcg);  }  /* to protect the working set of the last N jiffies */ @@ -4557,46 +4541,30 @@ static unsigned long lru_gen_min_ttl __read_mostly;  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)  {  	struct mem_cgroup *memcg; -	bool success = false;  	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);  	VM_WARN_ON_ONCE(!current_is_kswapd()); -	sc->last_reclaimed = sc->nr_reclaimed; - -	/* -	 * To reduce the chance of going into the aging path, which can be -	 * costly, optimistically skip it if the flag below was cleared in the -	 * eviction path. This improves the overall performance when multiple -	 * memcgs are available. -	 */ -	if (!sc->memcgs_need_aging) { -		sc->memcgs_need_aging = true; +	/* check the order to exclude compaction-induced reclaim */ +	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)  		return; -	} - -	set_mm_walk(pgdat);  	memcg = mem_cgroup_iter(NULL, NULL, NULL);  	do {  		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -		if (age_lruvec(lruvec, sc, min_ttl)) -			success = true; +		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { +			mem_cgroup_iter_break(NULL, memcg); +			return; +		}  		cond_resched();  	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -	clear_mm_walk(); - -	/* check the order to exclude compaction-induced reclaim */ -	if (success || !min_ttl || sc->order) -		return; -  	/*  	 * The main goal is to OOM kill if every generation from all memcgs is  	 * younger than min_ttl. However, another possibility is all memcgs are -	 * either below min or empty. +	 * either too small or below min.  	 
*/  	if (mutex_trylock(&oom_lock)) {  		struct oom_control oc = { @@ -4609,6 +4577,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)  	}  } +/****************************************************************************** + *                          rmap/PT walk feedback + ******************************************************************************/ +  /*   * This function exploits spatial locality when shrink_folio_list() walks the   * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If @@ -4619,13 +4591,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)  {  	int i; -	pte_t *pte;  	unsigned long start;  	unsigned long end; -	unsigned long addr;  	struct lru_gen_mm_walk *walk;  	int young = 0; -	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; +	pte_t *pte = pvmw->pte; +	unsigned long addr = pvmw->address;  	struct folio *folio = pfn_folio(pvmw->pfn);  	struct mem_cgroup *memcg = folio_memcg(folio);  	struct pglist_data *pgdat = folio_pgdat(folio); @@ -4642,25 +4613,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)  	/* avoid taking the LRU lock under the PTL when possible */  	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; -	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -	end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; +	start = max(addr & PMD_MASK, pvmw->vma->vm_start); +	end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;  	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { -		if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) +		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)  			end = start + MIN_LRU_BATCH * PAGE_SIZE; -		else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) +		else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)  			start = end - MIN_LRU_BATCH * PAGE_SIZE;  		else { -			start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; -			end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; +			start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; +			end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;  		}  	} -	pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; +	/* folio_update_gen() requires stable folio_memcg() */ +	if (!mem_cgroup_trylock_pages(memcg)) +		return; -	rcu_read_lock();  	arch_enter_lazy_mmu_mode(); +	pte -= (addr - start) / PAGE_SIZE; +  	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {  		unsigned long pfn; @@ -4685,58 +4659,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)  		      !folio_test_swapcache(folio)))  			folio_mark_dirty(folio); +		if (walk) { +			old_gen = folio_update_gen(folio, new_gen); +			if (old_gen >= 0 && old_gen != new_gen) +				update_batch_size(walk, folio, old_gen, new_gen); + +			continue; +		} +  		old_gen = folio_lru_gen(folio);  		if (old_gen < 0)  			folio_set_referenced(folio);  		else if (old_gen != new_gen) -			__set_bit(i, bitmap); +			folio_activate(folio);  	}  	arch_leave_lazy_mmu_mode(); -	rcu_read_unlock(); +	mem_cgroup_unlock_pages();  	/* feedback from rmap walkers to page table walkers */  	if (suitable_to_scan(i, young))  		update_bloom_filter(lruvec, max_seq, pvmw->pmd); +} -	if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { -		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -			folio = pfn_folio(pte_pfn(pte[i])); -			folio_activate(folio); -		} -		return; 
+/****************************************************************************** + *                          memcg LRU + ******************************************************************************/ + +/* see the comment on MEMCG_NR_GENS */ +enum { +	MEMCG_LRU_NOP, +	MEMCG_LRU_HEAD, +	MEMCG_LRU_TAIL, +	MEMCG_LRU_OLD, +	MEMCG_LRU_YOUNG, +}; + +#ifdef CONFIG_MEMCG + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ +	return READ_ONCE(lruvec->lrugen.seg); +} + +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ +	int seg; +	int old, new; +	int bin = get_random_u32_below(MEMCG_NR_BINS); +	struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +	spin_lock(&pgdat->memcg_lru.lock); + +	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + +	seg = 0; +	new = old = lruvec->lrugen.gen; + +	/* see the comment on MEMCG_NR_GENS */ +	if (op == MEMCG_LRU_HEAD) +		seg = MEMCG_LRU_HEAD; +	else if (op == MEMCG_LRU_TAIL) +		seg = MEMCG_LRU_TAIL; +	else if (op == MEMCG_LRU_OLD) +		new = get_memcg_gen(pgdat->memcg_lru.seq); +	else if (op == MEMCG_LRU_YOUNG) +		new = get_memcg_gen(pgdat->memcg_lru.seq + 1); +	else +		VM_WARN_ON_ONCE(true); + +	hlist_nulls_del_rcu(&lruvec->lrugen.list); + +	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) +		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); +	else +		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + +	pgdat->memcg_lru.nr_memcgs[old]--; +	pgdat->memcg_lru.nr_memcgs[new]++; + +	lruvec->lrugen.gen = new; +	WRITE_ONCE(lruvec->lrugen.seg, seg); + +	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) +		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + +	spin_unlock(&pgdat->memcg_lru.lock); +} + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +	int gen; +	int nid; +	int bin = get_random_u32_below(MEMCG_NR_BINS); + +	for_each_node(nid) { +		struct pglist_data *pgdat = NODE_DATA(nid); +		struct lruvec *lruvec = get_lruvec(memcg, nid); + +		spin_lock(&pgdat->memcg_lru.lock); + +		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + +		gen = get_memcg_gen(pgdat->memcg_lru.seq); + +		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); +		pgdat->memcg_lru.nr_memcgs[gen]++; + +		lruvec->lrugen.gen = gen; + +		spin_unlock(&pgdat->memcg_lru.lock);  	} +} -	/* folio_update_gen() requires stable folio_memcg() */ -	if (!mem_cgroup_trylock_pages(memcg)) -		return; +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +	int nid; -	if (!walk) { -		spin_lock_irq(&lruvec->lru_lock); -		new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); +	for_each_node(nid) { +		struct lruvec *lruvec = get_lruvec(memcg, nid); + +		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);  	} +} -	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -		folio = pfn_folio(pte_pfn(pte[i])); -		if (folio_memcg_rcu(folio) != memcg) -			continue; +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +	int gen; +	int nid; -		old_gen = folio_update_gen(folio, new_gen); -		if (old_gen < 0 || old_gen == new_gen) -			continue; +	for_each_node(nid) { +		struct pglist_data *pgdat = NODE_DATA(nid); +		struct lruvec *lruvec = get_lruvec(memcg, nid); -		if (walk) -			update_batch_size(walk, folio, old_gen, new_gen); -		else -			lru_gen_update_size(lruvec, folio, old_gen, new_gen); +		spin_lock(&pgdat->memcg_lru.lock); + +		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + +		gen = lruvec->lrugen.gen; + +		
hlist_nulls_del_rcu(&lruvec->lrugen.list); +		pgdat->memcg_lru.nr_memcgs[gen]--; + +		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) +			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + +		spin_unlock(&pgdat->memcg_lru.lock);  	} +} + +void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ +	/* see the comment on MEMCG_NR_GENS */ +	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) +		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); +} -	if (!walk) -		spin_unlock_irq(&lruvec->lru_lock); +#else /* !CONFIG_MEMCG */ -	mem_cgroup_unlock_pages(); +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ +	return 0;  } +#endif +  /******************************************************************************   *                          the eviction   ******************************************************************************/ @@ -4750,7 +4837,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)  	int delta = folio_nr_pages(folio);  	int refs = folio_lru_refs(folio);  	int tier = lru_tier_from_refs(refs); -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4775,7 +4862,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)  	/* promoted */  	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -		list_move(&folio->lru, &lrugen->lists[gen][type][zone]); +		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);  		return true;  	} @@ -4784,7 +4871,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)  		int hist = lru_hist_from_seq(lrugen->min_seq[type]);  		gen = folio_inc_gen(lruvec, folio, false); -		list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); +		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);  		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],  			   lrugen->protected[hist][type][tier - 1] + delta); @@ -4796,7 +4883,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)  	if (folio_test_locked(folio) || folio_test_writeback(folio) ||  	    (type == LRU_GEN_FILE && folio_test_dirty(folio))) {  		gen = folio_inc_gen(lruvec, folio, true); -		list_move(&folio->lru, &lrugen->lists[gen][type][zone]); +		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);  		return true;  	} @@ -4807,12 +4894,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca  {  	bool success; -	/* unmapping inhibited */ -	if (!sc->may_unmap && folio_mapped(folio)) -		return false; -  	/* swapping inhibited */ -	if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && +	if (!(sc->gfp_mask & __GFP_IO) &&  	    (folio_test_dirty(folio) ||  	     (folio_test_anon(folio) && !folio_test_swapcache(folio))))  		return false; @@ -4850,7 +4933,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,  	int scanned = 0;  	int isolated = 0;  	int remaining = MAX_LRU_BATCH; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);  	VM_WARN_ON_ONCE(!list_empty(list)); @@ -4863,7 +4946,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,  	for (zone = sc->reclaim_idx; zone >= 0; zone--) {  		LIST_HEAD(moved);  		int skipped = 0; -		struct list_head *head = &lrugen->lists[gen][type][zone]; +		struct list_head *head = &lrugen->folios[gen][type][zone];  		while 
(!list_empty(head)) {  			struct folio *folio = lru_to_folio(head); @@ -4909,9 +4992,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,  	__count_vm_events(PGSCAN_ANON + type, isolated);  	/* -	 * There might not be eligible pages due to reclaim_idx, may_unmap and -	 * may_writepage. Check the remaining to prevent livelock if it's not -	 * making progress. +	 * There might not be eligible folios due to reclaim_idx. Check the +	 * remaining to prevent livelock if it's not making progress.  	 */  	return isolated || !remaining ? scanned : 0;  } @@ -5006,8 +5088,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw  	return scanned;  } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -			bool *need_swapping) +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)  {  	int type;  	int scanned; @@ -5096,153 +5177,348 @@ retry:  		goto retry;  	} -	if (need_swapping && type == LRU_GEN_ANON) -		*need_swapping = true; -  	return scanned;  } +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, +			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ +	int gen, type, zone; +	unsigned long old = 0; +	unsigned long young = 0; +	unsigned long total = 0; +	struct lru_gen_folio *lrugen = &lruvec->lrugen; +	struct mem_cgroup *memcg = lruvec_memcg(lruvec); +	DEFINE_MIN_SEQ(lruvec); + +	/* whether this lruvec is completely out of cold folios */ +	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { +		*nr_to_scan = 0; +		return true; +	} + +	for (type = !can_swap; type < ANON_AND_FILE; type++) { +		unsigned long seq; + +		for (seq = min_seq[type]; seq <= max_seq; seq++) { +			unsigned long size = 0; + +			gen = lru_gen_from_seq(seq); + +			for (zone = 0; zone < MAX_NR_ZONES; zone++) +				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + +			total += size; +			if (seq == max_seq) +				young += size; +			else if (seq + MIN_NR_GENS == max_seq) +				old += size; +		} +	} + +	/* try to scrape all its memory if this memcg was deleted */ +	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + +	/* +	 * The aging tries to be lazy to reduce the overhead, while the eviction +	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the +	 * ideal number of generations is MIN_NR_GENS+1. +	 */ +	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) +		return false; + +	/* +	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) +	 * of the total number of pages for each generation. A reasonable range +	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The +	 * aging cares about the upper bound of hot pages, while the eviction +	 * cares about the lower bound of cold pages. +	 */ +	if (young * MIN_NR_GENS > total) +		return true; +	if (old * (MIN_NR_GENS + 2) < total) +		return true; + +	return false; +} +  /*   * For future optimizations:   * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg   *    reclaim.   
*/ -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, -				    bool can_swap, bool *need_aging) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)  {  	unsigned long nr_to_scan;  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);  	DEFINE_MAX_SEQ(lruvec); -	DEFINE_MIN_SEQ(lruvec); -	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || -	    (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && -	     !sc->memcg_low_reclaim)) +	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))  		return 0; -	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); -	if (!*need_aging) +	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))  		return nr_to_scan;  	/* skip the aging path at the default priority */  	if (sc->priority == DEF_PRIORITY) -		goto done; +		return nr_to_scan; -	/* leave the work to lru_gen_age_node() */ -	if (current_is_kswapd()) -		return 0; +	/* skip this lruvec as it's low on cold folios */ +	return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; +} -	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) -		return nr_to_scan; -done: -	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; +static unsigned long get_nr_to_reclaim(struct scan_control *sc) +{ +	/* don't abort memcg reclaim to ensure fairness */ +	if (!global_reclaim(sc)) +		return -1; + +	return max(sc->nr_to_reclaim, compact_gap(sc->order));  } -static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, -			      struct scan_control *sc, bool need_swapping) +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  { -	int i; -	DEFINE_MAX_SEQ(lruvec); +	long nr_to_scan; +	unsigned long scanned = 0; +	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); +	int swappiness = get_swappiness(lruvec, sc); -	if (!current_is_kswapd()) { -		/* age each memcg at most once to ensure fairness */ -		if (max_seq - seq > 1) -			return true; +	/* clean file folios are more likely to exist */ +	if (swappiness && !(sc->gfp_mask & __GFP_IO)) +		swappiness = 1; -		/* over-swapping can increase allocation latency */ -		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) -			return true; +	while (true) { +		int delta; -		/* give this thread a chance to exit and free its memory */ -		if (fatal_signal_pending(current)) { -			sc->nr_reclaimed += MIN_LRU_BATCH; -			return true; -		} +		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); +		if (nr_to_scan <= 0) +			break; -		if (cgroup_reclaim(sc)) -			return false; -	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) -		return false; +		delta = evict_folios(lruvec, sc, swappiness); +		if (!delta) +			break; -	/* keep scanning at low priorities to ensure fairness */ -	if (sc->priority > DEF_PRIORITY - 2) -		return false; +		scanned += delta; +		if (scanned >= nr_to_scan) +			break; -	/* -	 * A minimum amount of work was done under global memory pressure. For -	 * kswapd, it may be overshooting. For direct reclaim, the allocation -	 * may succeed if all suitable zones are somewhat safe. In either case, -	 * it's better to stop now, and restart later if necessary. 
-	 */ -	for (i = 0; i <= sc->reclaim_idx; i++) { -		unsigned long wmark; -		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; +		if (sc->nr_reclaimed >= nr_to_reclaim) +			break; -		if (!managed_zone(zone)) +		cond_resched(); +	} + +	/* whether try_to_inc_max_seq() was successful */ +	return nr_to_scan < 0; +} + +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +{ +	bool success; +	unsigned long scanned = sc->nr_scanned; +	unsigned long reclaimed = sc->nr_reclaimed; +	int seg = lru_gen_memcg_seg(lruvec); +	struct mem_cgroup *memcg = lruvec_memcg(lruvec); +	struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +	/* see the comment on MEMCG_NR_GENS */ +	if (!lruvec_is_sizable(lruvec, sc)) +		return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; + +	mem_cgroup_calculate_protection(NULL, memcg); + +	if (mem_cgroup_below_min(NULL, memcg)) +		return MEMCG_LRU_YOUNG; + +	if (mem_cgroup_below_low(NULL, memcg)) { +		/* see the comment on MEMCG_NR_GENS */ +		if (seg != MEMCG_LRU_TAIL) +			return MEMCG_LRU_TAIL; + +		memcg_memory_event(memcg, MEMCG_LOW); +	} + +	success = try_to_shrink_lruvec(lruvec, sc); + +	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + +	if (!sc->proactive) +		vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, +			   sc->nr_reclaimed - reclaimed); + +	sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; +	current->reclaim_state->reclaimed_slab = 0; + +	return success ? MEMCG_LRU_YOUNG : 0; +} + +#ifdef CONFIG_MEMCG + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ +	int op; +	int gen; +	int bin; +	int first_bin; +	struct lruvec *lruvec; +	struct lru_gen_folio *lrugen; +	struct mem_cgroup *memcg; +	const struct hlist_nulls_node *pos; +	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + +	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); +restart: +	op = 0; +	memcg = NULL; +	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + +	rcu_read_lock(); + +	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { +		if (op) +			lru_gen_rotate_memcg(lruvec, op); + +		mem_cgroup_put(memcg); + +		lruvec = container_of(lrugen, struct lruvec, lrugen); +		memcg = lruvec_memcg(lruvec); + +		if (!mem_cgroup_tryget(memcg)) { +			op = 0; +			memcg = NULL;  			continue; +		} -		wmark = current_is_kswapd() ? 
high_wmark_pages(zone) : low_wmark_pages(zone); -		if (wmark > zone_page_state(zone, NR_FREE_PAGES)) -			return false; +		rcu_read_unlock(); + +		op = shrink_one(lruvec, sc); + +		rcu_read_lock(); + +		if (sc->nr_reclaimed >= nr_to_reclaim) +			break;  	} -	sc->nr_reclaimed += MIN_LRU_BATCH; +	rcu_read_unlock(); -	return true; +	if (op) +		lru_gen_rotate_memcg(lruvec, op); + +	mem_cgroup_put(memcg); + +	if (sc->nr_reclaimed >= nr_to_reclaim) +		return; + +	/* restart if raced with lru_gen_rotate_memcg() */ +	if (gen != get_nulls_value(pos)) +		goto restart; + +	/* try the rest of the bins of the current generation */ +	bin = get_memcg_bin(bin + 1); +	if (bin != first_bin) +		goto restart;  }  static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  {  	struct blk_plug plug; -	bool need_aging = false; -	bool need_swapping = false; -	unsigned long scanned = 0; -	unsigned long reclaimed = sc->nr_reclaimed; -	DEFINE_MAX_SEQ(lruvec); + +	VM_WARN_ON_ONCE(global_reclaim(sc)); +	VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);  	lru_add_drain();  	blk_start_plug(&plug); -	set_mm_walk(lruvec_pgdat(lruvec)); +	set_mm_walk(NULL, sc->proactive); -	while (true) { -		int delta; -		int swappiness; -		unsigned long nr_to_scan; +	if (try_to_shrink_lruvec(lruvec, sc)) +		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); -		if (sc->may_swap) -			swappiness = get_swappiness(lruvec, sc); -		else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) -			swappiness = 1; -		else -			swappiness = 0; +	clear_mm_walk(); -		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); -		if (!nr_to_scan) -			goto done; +	blk_finish_plug(&plug); +} -		delta = evict_folios(lruvec, sc, swappiness, &need_swapping); -		if (!delta) -			goto done; +#else /* !CONFIG_MEMCG */ -		scanned += delta; -		if (scanned >= nr_to_scan) -			break; +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ +	BUILD_BUG(); +} -		if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) -			break; +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +	BUILD_BUG(); +} -		cond_resched(); -	} +#endif + +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ +	int priority; +	unsigned long reclaimable; +	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + +	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) +		return; +	/* +	 * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> +	 * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the +	 * estimated reclaimed_to_scanned_ratio = inactive / total. +	 */ +	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); +	if (get_swappiness(lruvec, sc)) +		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + +	reclaimable /= MEMCG_NR_GENS; + +	/* round down reclaimable and round up sc->nr_to_reclaim */ +	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + +	sc->priority = clamp(priority, 0, DEF_PRIORITY); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +	struct blk_plug plug; +	unsigned long reclaimed = sc->nr_reclaimed; + +	VM_WARN_ON_ONCE(!global_reclaim(sc)); + +	/* +	 * Unmapped clean folios are already prioritized. Scanning for more of +	 * them is likely futile and can cause high reclaim latency when there +	 * is a large number of memcgs. 
+	 */ +	if (!sc->may_writepage || !sc->may_unmap) +		goto done; + +	lru_add_drain(); + +	blk_start_plug(&plug); + +	set_mm_walk(pgdat, sc->proactive); + +	set_initial_priority(pgdat, sc); + +	if (current_is_kswapd()) +		sc->nr_reclaimed = 0; + +	if (mem_cgroup_disabled()) +		shrink_one(&pgdat->__lruvec, sc); +	else +		shrink_many(pgdat, sc); + +	if (current_is_kswapd()) +		sc->nr_reclaimed += reclaimed; -	/* see the comment in lru_gen_age_node() */ -	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) -		sc->memcgs_need_aging = false; -done:  	clear_mm_walk();  	blk_finish_plug(&plug); +done: +	/* kswapd should never fail */ +	pgdat->kswapd_failures = 0;  }  /****************************************************************************** @@ -5251,7 +5527,7 @@ done:  static bool __maybe_unused state_is_valid(struct lruvec *lruvec)  { -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	if (lrugen->enabled) {  		enum lru_list lru; @@ -5264,7 +5540,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec)  		int gen, type, zone;  		for_each_gen_type_zone(gen, type, zone) { -			if (!list_empty(&lrugen->lists[gen][type][zone])) +			if (!list_empty(&lrugen->folios[gen][type][zone]))  				return false;  		}  	} @@ -5309,7 +5585,7 @@ static bool drain_evictable(struct lruvec *lruvec)  	int remaining = MAX_LRU_BATCH;  	for_each_gen_type_zone(gen, type, zone) { -		struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; +		struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];  		while (!list_empty(head)) {  			bool success; @@ -5530,7 +5806,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,  	int i;  	int type, tier;  	int hist = lru_hist_from_seq(seq); -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	for (tier = 0; tier < MAX_NR_TIERS; tier++) {  		seq_printf(m, "            %10d", tier); @@ -5580,7 +5856,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)  	unsigned long seq;  	bool full = !debugfs_real_fops(m->file)->write;  	struct lruvec *lruvec = v; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	int nid = lruvec_pgdat(lruvec)->node_id;  	struct mem_cgroup *memcg = lruvec_memcg(lruvec);  	DEFINE_MAX_SEQ(lruvec); @@ -5677,7 +5953,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co  		if (sc->nr_reclaimed >= nr_to_reclaim)  			return 0; -		if (!evict_folios(lruvec, sc, swappiness, NULL)) +		if (!evict_folios(lruvec, sc, swappiness))  			return 0;  		cond_resched(); @@ -5698,11 +5974,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,  	if (!mem_cgroup_disabled()) {  		rcu_read_lock(); +  		memcg = mem_cgroup_from_id(memcg_id); -#ifdef CONFIG_MEMCG -		if (memcg && !css_tryget(&memcg->css)) +		if (!mem_cgroup_tryget(memcg))  			memcg = NULL; -#endif +  		rcu_read_unlock();  		if (!memcg) @@ -5762,7 +6038,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,  	set_task_reclaim_state(current, &sc.reclaim_state);  	flags = memalloc_noreclaim_save();  	blk_start_plug(&plug); -	if (!set_mm_walk(NULL)) { +	if (!set_mm_walk(NULL, true)) {  		err = -ENOMEM;  		goto done;  	} @@ -5834,7 +6110,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)  {  	int i;  	int gen, type, zone; -	struct lru_gen_struct *lrugen = &lruvec->lrugen; +	struct lru_gen_folio *lrugen = &lruvec->lrugen;  	
lrugen->max_seq = MIN_NR_GENS + 1;  	lrugen->enabled = lru_gen_enabled(); @@ -5843,13 +6119,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)  		lrugen->timestamps[i] = jiffies;  	for_each_gen_type_zone(gen, type, zone) -		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); +		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);  	lruvec->mm_state.seq = MIN_NR_GENS;  	init_waitqueue_head(&lruvec->mm_state.wait);  }  #ifdef CONFIG_MEMCG + +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +	int i, j; + +	spin_lock_init(&pgdat->memcg_lru.lock); + +	for (i = 0; i < MEMCG_NR_GENS; i++) { +		for (j = 0; j < MEMCG_NR_BINS; j++) +			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); +	} +} +  void lru_gen_init_memcg(struct mem_cgroup *memcg)  {  	INIT_LIST_HEAD(&memcg->mm_list.fifo); @@ -5861,19 +6150,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)  	int i;  	int nid; +	VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); +  	for_each_node(nid) {  		struct lruvec *lruvec = get_lruvec(memcg, nid); +		VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);  		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,  					   sizeof(lruvec->lrugen.nr_pages))); +		lruvec->lrugen.list.next = LIST_POISON1; +  		for (i = 0; i < NR_BLOOM_FILTERS; i++) {  			bitmap_free(lruvec->mm_state.filters[i]);  			lruvec->mm_state.filters[i] = NULL;  		}  	}  } -#endif + +#endif /* CONFIG_MEMCG */  static int __init init_lru_gen(void)  { @@ -5900,6 +6195,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc  {  } +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} +  #endif /* CONFIG_LRU_GEN */  static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -5913,7 +6212,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  	bool proportional_reclaim;  	struct blk_plug plug; -	if (lru_gen_enabled()) { +	if (lru_gen_enabled() && !global_reclaim(sc)) {  		lru_gen_shrink_lruvec(lruvec, sc);  		return;  	} @@ -6156,6 +6455,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)  	struct lruvec *target_lruvec;  	bool reclaimable = false; +	if (lru_gen_enabled() && global_reclaim(sc)) { +		lru_gen_shrink_node(pgdat, sc); +		return; +	} +  	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);  again: @@ -6754,8 +7058,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,  					   unsigned long nr_pages,  					   gfp_t gfp_mask, -					   unsigned int reclaim_options, -					   nodemask_t *nodemask) +					   unsigned int reclaim_options)  {  	unsigned long nr_reclaimed;  	unsigned int noreclaim_flag; @@ -6770,7 +7073,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,  		.may_unmap = 1,  		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),  		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), -		.nodemask = nodemask,  	};  	/*  	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put  |
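
A few of the mechanisms in this patch can be illustrated with small userspace sketches. The Bloom filters: the comment in the patch gives m = 1 << 15 bits and k = 2 hash functions, with the two filters double-buffered across generations. The sketch below reproduces only the two-key indexing; the multiplicative hash stands in for the kernel's hash_ptr(), the bit helpers stand in for set_bit()/test_bit(), and a single static filter replaces the per-lruvec, per-generation pair.

```c
/*
 * Minimal userspace sketch of the patch's Bloom filter scheme:
 * m = 1 << 15 bits and k = 2 keys derived from one 30-bit hash.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLOOM_FILTER_SHIFT	15
#define BLOOM_FILTER_BITS	(1UL << BLOOM_FILTER_SHIFT)
#define BITS_PER_LONG		(8 * sizeof(unsigned long))

static unsigned long filter[BLOOM_FILTER_BITS / BITS_PER_LONG];

static void get_item_key(const void *item, int *key)
{
	/* fold the pointer into 2 * BLOOM_FILTER_SHIFT = 30 hash bits */
	uint32_t hash = (uint32_t)((uintptr_t)item * 2654435761u) >> 2;

	key[0] = hash & (BLOOM_FILTER_BITS - 1);
	key[1] = hash >> BLOOM_FILTER_SHIFT;
}

static void set_filter_bit(int nr)
{
	filter[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static bool test_filter_bit(int nr)
{
	return filter[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

/* record that an item (e.g. a non-leaf PMD entry) is worth revisiting */
static void bloom_update(const void *item)
{
	int key[2];

	get_item_key(item, key);
	set_filter_bit(key[0]);
	set_filter_bit(key[1]);
}

/* may report false positives, never false negatives for inserted items */
static bool bloom_test(const void *item)
{
	int key[2];

	get_item_key(item, key);
	return test_filter_bit(key[0]) && test_filter_bit(key[1]);
}

int main(void)
{
	int x;

	bloom_update(&x);
	printf("inserted: %d, not inserted (probably): %d\n",
	       bloom_test(&x), bloom_test(&filter));
	return 0;
}
```

As in the patch, a hit can be a false positive (costing an unnecessary rescan of a PMD range), but an inserted item is never missed.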
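
lru_gen_look_around() exploits spatial locality by scanning the PTEs around a young PTE found during an rmap walk. The window it scans is the containing PMD range, truncated to the VMA and capped at MIN_LRU_BATCH pages centred on the faulting address. The sketch below redoes that clamping arithmetic in isolation; PAGE_SIZE, PMD_SIZE and MIN_LRU_BATCH are assumed values (4 KiB pages, 2 MiB PMDs, MIN_LRU_BATCH taken as BITS_PER_LONG), not something this patch defines.

```c
/* Standalone sketch of the scan-window clamping in lru_gen_look_around(). */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(512 * PAGE_SIZE)	/* assumed: x86-64 */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define MIN_LRU_BATCH	64UL			/* assumed: BITS_PER_LONG */

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

static void look_around_window(unsigned long addr, unsigned long vm_start,
			       unsigned long vm_end, unsigned long *startp,
			       unsigned long *endp)
{
	/* the containing PMD range, truncated to the VMA */
	unsigned long start = max_ul(addr & PMD_MASK, vm_start);
	unsigned long end = min_ul(addr | ~PMD_MASK, vm_end - 1) + 1;

	/* cap at MIN_LRU_BATCH pages, keeping addr inside the window */
	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
			end = start + MIN_LRU_BATCH * PAGE_SIZE;
		else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
			start = end - MIN_LRU_BATCH * PAGE_SIZE;
		else {
			start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
			end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
		}
	}

	*startp = start;
	*endp = end;
}

int main(void)
{
	unsigned long start, end;

	/* a fault 1 MiB into a 16 MiB VMA mapped at 0x700000000000 */
	look_around_window(0x700000100000UL, 0x700000000000UL,
			   0x700001000000UL, &start, &end);
	printf("scan [%#lx, %#lx), %lu pages\n", start, end,
	       (end - start) / PAGE_SIZE);
	return 0;
}
```

With those assumptions, the example fault sits well inside a large VMA, so the window collapses to 64 pages centred on the faulting address.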
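
The new memcg LRU keeps, per node, an array of hlist_nulls lists indexed by [generation][bin]: the generation is memcg_lru.seq modulo MEMCG_NR_GENS, and the bin is picked at random to spread lock and list contention. Global reclaim walks the generation indexed by seq, and rotating a memcg to MEMCG_LRU_YOUNG moves it to seq + 1; once the old generation drains, seq advances. The sketch below models only that counting and sequence logic; the generation and bin counts are assumptions for illustration, and the RCU/hlist_nulls handling, the seg field and the HEAD/TAIL/OLD operations are omitted.

```c
/* Simplified model of the patch's memcg LRU generation/bin bookkeeping. */
#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS	2	/* assumed value for illustration */
#define MEMCG_NR_BINS	8	/* assumed value for illustration */

struct memcg_lru_model {
	unsigned long seq;			/* generation being reclaimed */
	int nr_memcgs[MEMCG_NR_GENS];
};

static int get_memcg_gen(unsigned long seq)
{
	return seq % MEMCG_NR_GENS;
}

static int get_memcg_bin(int bin)
{
	return bin % MEMCG_NR_BINS;
}

/* where lru_gen_online_memcg() would place a new memcg: gen of seq, random bin */
static void place_online(struct memcg_lru_model *lru, int *gen, int *bin)
{
	*gen = get_memcg_gen(lru->seq);
	*bin = get_memcg_bin(rand());
	lru->nr_memcgs[*gen]++;
}

/* what lru_gen_rotate_memcg(MEMCG_LRU_YOUNG) does to the counters */
static void rotate_young(struct memcg_lru_model *lru, int *gen)
{
	int old = *gen;
	int new = get_memcg_gen(lru->seq + 1);

	lru->nr_memcgs[old]--;
	lru->nr_memcgs[new]++;
	*gen = new;

	/* advance the sequence once the old generation drains */
	if (!lru->nr_memcgs[old] && old == get_memcg_gen(lru->seq))
		lru->seq++;
}

int main(void)
{
	struct memcg_lru_model lru = { .seq = 0 };
	int gen, bin;

	place_online(&lru, &gen, &bin);
	printf("placed in gen %d, bin %d\n", gen, bin);

	rotate_young(&lru, &gen);
	printf("rotated to gen %d, seq now %lu\n", gen, lru.seq);
	return 0;
}
```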
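
set_initial_priority() derives the starting reclaim priority from the relation in its comment, ((total / MEMCG_NR_GENS) >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, using the inactive counts as the reclaimable estimate, which reduces to roughly priority = log2(reclaimable / nr_to_reclaim) via fls_long(). The arithmetic below uses made-up page counts; DEF_PRIORITY matches the kernel's 12, while the MEMCG_NR_GENS value is an assumption as above.

```c
/* Worked example of the set_initial_priority() arithmetic from the patch. */
#include <stdio.h>

#define MEMCG_NR_GENS	2	/* assumed value for illustration */
#define DEF_PRIORITY	12

static int fls_long(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int clamp_int(int val, int lo, int hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

int main(void)
{
	unsigned long reclaimable = 1000000;	/* hypothetical inactive pages */
	unsigned long nr_to_reclaim = 1024;	/* hypothetical reclaim target */
	int priority;

	reclaimable /= MEMCG_NR_GENS;

	/* round down reclaimable and round up nr_to_reclaim, as in the patch */
	priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);
	priority = clamp_int(priority, 0, DEF_PRIORITY);

	printf("initial priority: %d\n", priority);	/* prints 8 here */
	return 0;
}
```

With these inputs the sketch settles on priority 8, instead of starting at DEF_PRIORITY and decaying one step per pass.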