aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c120
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/memcontrol.c139
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c2
-rw-r--r--mm/percpu.c23
6 files changed, 254 insertions, 36 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e8e89158adec..d9daa3e422d0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/wait.h>
+#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
/*
- * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
- * locking.
+ * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
+ * reader side locking.
*/
DEFINE_SPINLOCK(bdi_lock);
+static u64 bdi_id_cursor;
+static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
@@ -615,13 +618,12 @@ out_put:
}
/**
- * wb_get_create - get wb for a given memcg, create if necessary
+ * wb_get_lookup - get wb for a given memcg
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
- * @gfp: allocation mask to use
*
- * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
- * create one. The returned wb has its refcount incremented.
+ * Try to get the wb for @memcg_css on @bdi. The returned wb has its
+ * refcount incremented.
*
* This function uses css_get() on @memcg_css and thus expects its refcnt
* to be positive on invocation. IOW, rcu_read_lock() protection on
@@ -638,6 +640,39 @@ out_put:
* each lookup. On mismatch, the existing wb is discarded and a new one is
* created.
*/
+struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css)
+{
+ struct bdi_writeback *wb;
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ rcu_read_lock();
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb) {
+ struct cgroup_subsys_state *blkcg_css;
+
+ /* see whether the blkcg association has changed */
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+ if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
+ wb = NULL;
+ css_put(blkcg_css);
+ }
+ rcu_read_unlock();
+
+ return wb;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. See wb_get_lookup() for more details.
+ */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp)
@@ -650,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
return &bdi->wb;
do {
- rcu_read_lock();
- wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
- if (wb) {
- struct cgroup_subsys_state *blkcg_css;
-
- /* see whether the blkcg association has changed */
- blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
- &io_cgrp_subsys);
- if (unlikely(wb->blkcg_css != blkcg_css ||
- !wb_tryget(wb)))
- wb = NULL;
- css_put(blkcg_css);
- }
- rcu_read_unlock();
+ wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
return wb;
@@ -859,9 +881,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
EXPORT_SYMBOL(bdi_alloc_node);
+static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
+{
+ struct rb_node **p = &bdi_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct backing_dev_info *bdi;
+
+ lockdep_assert_held(&bdi_lock);
+
+ while (*p) {
+ parent = *p;
+ bdi = rb_entry(parent, struct backing_dev_info, rb_node);
+
+ if (bdi->id > id)
+ p = &(*p)->rb_left;
+ else if (bdi->id < id)
+ p = &(*p)->rb_right;
+ else
+ break;
+ }
+
+ if (parentp)
+ *parentp = parent;
+ return p;
+}
+
+/**
+ * bdi_get_by_id - lookup and get bdi from its id
+ * @id: bdi id to lookup
+ *
+ * Find bdi matching @id and get it. Returns NULL if the matching bdi
+ * doesn't exist or is already unregistered.
+ */
+struct backing_dev_info *bdi_get_by_id(u64 id)
+{
+ struct backing_dev_info *bdi = NULL;
+ struct rb_node **p;
+
+ spin_lock_bh(&bdi_lock);
+ p = bdi_lookup_rb_node(id, NULL);
+ if (*p) {
+ bdi = rb_entry(*p, struct backing_dev_info, rb_node);
+ bdi_get(bdi);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return bdi;
+}
+
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
struct device *dev;
+ struct rb_node *parent, **p;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
@@ -877,7 +948,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
set_bit(WB_registered, &bdi->wb.state);
spin_lock_bh(&bdi_lock);
+
+ bdi->id = ++bdi_id_cursor;
+
+ p = bdi_lookup_rb_node(bdi->id, &parent);
+ rb_link_node(&bdi->rb_node, parent, p);
+ rb_insert_color(&bdi->rb_node, &bdi_tree);
+
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+
spin_unlock_bh(&bdi_lock);
trace_writeback_bdi_register(bdi);
@@ -918,6 +997,7 @@ EXPORT_SYMBOL(bdi_register_owner);
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
spin_lock_bh(&bdi_lock);
+ rb_erase(&bdi->rb_node, &bdi_tree);
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..ccede2425c3f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
for (i = 0; i < MAX_NUMNODES; i++) {
if (!khugepaged_node_load[i])
continue;
- if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ec5e12486a7..597d58101872 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -4172,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
#ifdef CONFIG_CGROUP_WRITEBACK
+#include <trace/events/writeback.h>
+
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4255,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
}
}
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4777,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg;
unsigned int size;
int node;
+ int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4820,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4949,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1804f64ff43c..50055d2e4ea8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
+ mem_cgroup_flush_foreign(wb);
+
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9c9194959271..6991ccec9c32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3511,7 +3511,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- RECLAIM_DISTANCE;
+ node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
diff --git a/mm/percpu.c b/mm/percpu.c
index 9821241fdede..7e06a1e58720 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2125,7 +2125,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
void *ptr;
int unit;
- base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+ base_size = ALIGN(struct_size(ai, groups, nr_groups),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
@@ -2220,7 +2220,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
- * perpcu area. This function is to be called from arch percpu area
+ * percpu area. This function is to be called from arch percpu area
* setup path.
*
* @ai contains all information necessary to initialize the first
@@ -2267,12 +2267,9 @@ static void pcpu_dump_alloc_info(const char *lvl,
* share the same vm, but use offset regions in the area allocation map.
* The chunk serving the dynamic region is circulated in the chunk slots
* and available for dynamic allocation like any other chunk.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
*/
-int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
- void *base_addr)
+void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+ void *base_addr)
{
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
size_t static_size, dyn_size;
@@ -2457,7 +2454,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* we're done */
pcpu_base_addr = base_addr;
- return 0;
}
#ifdef CONFIG_SMP
@@ -2710,7 +2706,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
struct pcpu_alloc_info *ai;
size_t size_sum, areas_size;
unsigned long max_distance;
- int group, i, highest_group, rc;
+ int group, i, highest_group, rc = 0;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
@@ -2795,7 +2791,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
- rc = pcpu_setup_first_chunk(ai, base);
+ pcpu_setup_first_chunk(ai, base);
goto out_free;
out_free_areas:
@@ -2839,7 +2835,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
int unit_pages;
size_t pages_size;
struct page **pages;
- int unit, i, j, rc;
+ int unit, i, j, rc = 0;
int upa;
int nr_g0_units;
@@ -2920,7 +2916,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
- rc = pcpu_setup_first_chunk(ai, vm.addr);
+ pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
@@ -3014,8 +3010,7 @@ void __init setup_per_cpu_areas(void)
ai->groups[0].nr_units = 1;
ai->groups[0].cpu_map[0] = 0;
- if (pcpu_setup_first_chunk(ai, fc) < 0)
- panic("Failed to initialize percpu areas.");
+ pcpu_setup_first_chunk(ai, fc);
pcpu_free_alloc_info(ai);
}