diff options
-rw-r--r-- | include/linux/zswap.h | 16 | ||||
-rw-r--r-- | mm/zswap.c | 108 |
2 files changed, 70 insertions, 54 deletions
diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 6cecb4a4f68b..9cd1beef0654 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages; struct zswap_lruvec_state { /* - * Number of pages in zswap that should be protected from the shrinker. - * This number is an estimate of the following counts: + * Number of swapped in pages from disk, i.e. not found in the zswap pool. * - * a) Recent page faults. - * b) Recent insertion to the zswap LRU. This includes new zswap stores, - * as well as recent zswap LRU rotations. - * - * These pages are likely to be warm, and might incur IO if the are written - * to swap. + * This is consumed and subtracted from the lru size in + * zswap_shrinker_count() to penalize past overshrinking that led to disk + * swapins. The idea is that had we considered this many more pages in the + * LRU active/protected and not written them back, we would not have had to + * swap them in. */ - atomic_long_t nr_zswap_protected; + atomic_long_t nr_disk_swapins; }; unsigned long zswap_total_pages(void); diff --git a/mm/zswap.c b/mm/zswap.c index 71b75ff1f3fb..df66ab102d27 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -187,6 +187,10 @@ static struct shrinker *zswap_shrinker; * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. + * referenced - true if the entry recently entered the zswap pool. Unset by the + * writeback logic. The entry is only reclaimed by the writeback + * logic if referenced is unset. See comments in the shrinker + * section for context. 
* pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content @@ -196,6 +200,7 @@ static struct shrinker *zswap_shrinker; struct zswap_entry { swp_entry_t swpentry; unsigned int length; + bool referenced; struct zswap_pool *pool; union { unsigned long handle; @@ -700,11 +705,8 @@ static inline int entry_to_nid(struct zswap_entry *entry) static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; - struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -722,19 +724,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? 
old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -752,7 +741,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) void zswap_lruvec_state_init(struct lruvec *lruvec) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0); } void zswap_folio_swapin(struct folio *folio) @@ -761,7 +750,7 @@ void zswap_folio_swapin(struct folio *folio) if (folio) { lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); } } @@ -1095,6 +1084,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /********************************* * shrinker functions **********************************/ +/* + * The dynamic shrinker is modulated by the following factors: + * + * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving + * the entry a second chance) before rotating it in the LRU list. If the + * entry is considered again by the shrinker, with its referenced bit unset, + * it is written back. The writeback rate as a result is dynamically + * adjusted by the pool activities - if the pool is dominated by new entries + * (i.e. lots of recent zswapouts), these entries will be protected and + * the writeback rate will slow down. On the other hand, if the pool has a + * lot of stagnant entries, these entries will be reclaimed immediately, + * effectively increasing the writeback rate. + * + * 2. Swapins counter: If we observe swapins, it is a sign that we are + * overshrinking and should slow down. We maintain a swapins counter, which + * is consumed and subtracted from the number of eligible objects on the LRU + * in zswap_shrinker_count(). + * + * 3. Compression ratio. The better the workload compresses, the fewer gains we + * can expect from writeback. 
We scale down the number of objects available + * for reclaim by this ratio. + */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { @@ -1105,6 +1116,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* + * Second chance algorithm: if the entry has its referenced bit set, give it + * a second chance. Only clear the referenced bit and rotate it in the + * zswap's LRU list. + */ + if (entry->referenced) { + entry->referenced = false; + return LRU_ROTATE; + } + + /* * As soon as we drop the LRU lock, the entry can be freed by * a concurrent invalidation. This means the following: * @@ -1170,8 +1191,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; + unsigned long shrink_ret; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1180,25 +1200,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap_list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). 
- */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); @@ -1213,7 +1214,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, { struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_t *nr_disk_swapins = + &lruvec->zswap_lruvec_state.nr_disk_swapins; + unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur, + nr_remain; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; @@ -1246,14 +1250,27 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!nr_stored) return 0; - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + if (!nr_freeable) + return 0; + /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. + * Subtract from the lru size the number of pages that were recently swapped + * in from disk. The idea is that had we protected the zswap's LRU by this + * amount of pages, these disk swapins would not have happened. */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); + do { + if (nr_freeable >= nr_disk_swapins_cur) + nr_remain = 0; + else + nr_remain = nr_disk_swapins_cur - nr_freeable; + } while (!atomic_long_try_cmpxchg( + nr_disk_swapins, &nr_disk_swapins_cur, nr_remain)); + + nr_freeable -= nr_disk_swapins_cur - nr_remain; + if (!nr_freeable) + return 0; /* * Scale the number of freeable pages by the memory saving factor. 
@@ -1506,6 +1523,7 @@ bool zswap_store(struct folio *folio) store_entry: entry->swpentry = swp; entry->objcg = objcg; + entry->referenced = true; old = xa_store(tree, offset, entry, GFP_KERNEL); if (xa_is_err(old)) { |