aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/zswap.h16
-rw-r--r--mm/zswap.c108
2 files changed, 70 insertions, 54 deletions
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 6cecb4a4f68b..9cd1beef0654 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages;
struct zswap_lruvec_state {
/*
- * Number of pages in zswap that should be protected from the shrinker.
- * This number is an estimate of the following counts:
+ * Number of swapped in pages from disk, i.e not found in the zswap pool.
*
- * a) Recent page faults.
- * b) Recent insertion to the zswap LRU. This includes new zswap stores,
- * as well as recent zswap LRU rotations.
- *
- * These pages are likely to be warm, and might incur IO if the are written
- * to swap.
+ * This is consumed and subtracted from the lru size in
+ * zswap_shrinker_count() to penalize past overshrinking that led to disk
+ * swapins. The idea is that had we considered this many more pages in the
+ * LRU active/protected and not written them back, we would not have had to
+ * swapped them in.
*/
- atomic_long_t nr_zswap_protected;
+ atomic_long_t nr_disk_swapins;
};
unsigned long zswap_total_pages(void);
diff --git a/mm/zswap.c b/mm/zswap.c
index 71b75ff1f3fb..df66ab102d27 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -187,6 +187,10 @@ static struct shrinker *zswap_shrinker;
* length - the length in bytes of the compressed page data. Needed during
* decompression. For a same value filled page length is 0, and both
* pool and lru are invalid and must be ignored.
+ * referenced - true if the entry recently entered the zswap pool. Unset by the
+ * writeback logic. The entry is only reclaimed by the writeback
+ * logic if referenced is unset. See comments in the shrinker
+ * section for context.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
@@ -196,6 +200,7 @@ static struct shrinker *zswap_shrinker;
struct zswap_entry {
swp_entry_t swpentry;
unsigned int length;
+ bool referenced;
struct zswap_pool *pool;
union {
unsigned long handle;
@@ -700,11 +705,8 @@ static inline int entry_to_nid(struct zswap_entry *entry)
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
- atomic_long_t *nr_zswap_protected;
- unsigned long lru_size, old, new;
int nid = entry_to_nid(entry);
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
/*
* Note that it is safe to use rcu_read_lock() here, even in the face of
@@ -722,19 +724,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
memcg = mem_cgroup_from_entry(entry);
/* will always succeed */
list_lru_add(list_lru, &entry->lru, nid, memcg);
-
- /* Update the protection area */
- lru_size = list_lru_count_one(list_lru, nid, memcg);
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
- old = atomic_long_inc_return(nr_zswap_protected);
- /*
- * Decay to avoid overflow and adapt to changing workloads.
- * This is based on LRU reclaim cost decaying heuristics.
- */
- do {
- new = old > lru_size / 4 ? old / 2 : old;
- } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
rcu_read_unlock();
}
@@ -752,7 +741,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
void zswap_lruvec_state_init(struct lruvec *lruvec)
{
- atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0);
}
void zswap_folio_swapin(struct folio *folio)
@@ -761,7 +750,7 @@ void zswap_folio_swapin(struct folio *folio)
if (folio) {
lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
}
}
@@ -1095,6 +1084,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/*********************************
* shrinker functions
**********************************/
+/*
+ * The dynamic shrinker is modulated by the following factors:
+ *
+ * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving
+ * the entry a second chance) before rotating it in the LRU list. If the
+ * entry is considered again by the shrinker, with its referenced bit unset,
+ * it is written back. The writeback rate as a result is dynamically
+ * adjusted by the pool activities - if the pool is dominated by new entries
+ * (i.e lots of recent zswapouts), these entries will be protected and
+ * the writeback rate will slow down. On the other hand, if the pool has a
+ * lot of stagnant entries, these entries will be reclaimed immediately,
+ * effectively increasing the writeback rate.
+ *
+ * 2. Swapins counter: If we observe swapins, it is a sign that we are
+ * overshrinking and should slow down. We maintain a swapins counter, which
+ * is consumed and subtract from the number of eligible objects on the LRU
+ * in zswap_shrinker_count().
+ *
+ * 3. Compression ratio. The better the workload compresses, the less gains we
+ * can expect from writeback. We scale down the number of objects available
+ * for reclaim by this ratio.
+ */
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
spinlock_t *lock, void *arg)
{
@@ -1105,6 +1116,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
int writeback_result;
/*
+ * Second chance algorithm: if the entry has its referenced bit set, give it
+ * a second chance. Only clear the referenced bit and rotate it in the
+ * zswap's LRU list.
+ */
+ if (entry->referenced) {
+ entry->referenced = false;
+ return LRU_ROTATE;
+ }
+
+ /*
* As soon as we drop the LRU lock, the entry can be freed by
* a concurrent invalidation. This means the following:
*
@@ -1170,8 +1191,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
- struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
- unsigned long shrink_ret, nr_protected, lru_size;
+ unsigned long shrink_ret;
bool encountered_page_in_swapcache = false;
if (!zswap_shrinker_enabled ||
@@ -1180,25 +1200,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
return SHRINK_STOP;
}
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- lru_size = list_lru_shrink_count(&zswap_list_lru, sc);
-
- /*
- * Abort if we are shrinking into the protected region.
- *
- * This short-circuiting is necessary because if we have too many multiple
- * concurrent reclaimers getting the freeable zswap object counts at the
- * same time (before any of them made reasonable progress), the total
- * number of reclaimed objects might be more than the number of unprotected
- * objects (i.e the reclaimers will reclaim into the protected area of the
- * zswap LRU).
- */
- if (nr_protected >= lru_size - sc->nr_to_scan) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
&encountered_page_in_swapcache);
@@ -1213,7 +1214,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
{
struct mem_cgroup *memcg = sc->memcg;
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
- unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+ atomic_long_t *nr_disk_swapins =
+ &lruvec->zswap_lruvec_state.nr_disk_swapins;
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur,
+ nr_remain;
if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
return 0;
@@ -1246,14 +1250,27 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
if (!nr_stored)
return 0;
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
+ if (!nr_freeable)
+ return 0;
+
/*
- * Subtract the lru size by an estimate of the number of pages
- * that should be protected.
+ * Subtract from the lru size the number of pages that are recently swapped
+ * in from disk. The idea is that had we protect the zswap's LRU by this
+ * amount of pages, these disk swapins would not have happened.
*/
- nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+ nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins);
+ do {
+ if (nr_freeable >= nr_disk_swapins_cur)
+ nr_remain = 0;
+ else
+ nr_remain = nr_disk_swapins_cur - nr_freeable;
+ } while (!atomic_long_try_cmpxchg(
+ nr_disk_swapins, &nr_disk_swapins_cur, nr_remain));
+
+ nr_freeable -= nr_disk_swapins_cur - nr_remain;
+ if (!nr_freeable)
+ return 0;
/*
* Scale the number of freeable pages by the memory saving factor.
@@ -1506,6 +1523,7 @@ bool zswap_store(struct folio *folio)
store_entry:
entry->swpentry = swp;
entry->objcg = objcg;
+ entry->referenced = true;
old = xa_store(tree, offset, entry, GFP_KERNEL);
if (xa_is_err(old)) {