Diffstat (limited to 'mm')
-rw-r--r--  mm/damon/dbgfs.c             |  7
-rw-r--r--  mm/damon/sysfs.c             |  4
-rw-r--r--  mm/failslab.c                | 12
-rw-r--r--  mm/huge_memory.c             | 20
-rw-r--r--  mm/hugetlb.c                 | 45
-rw-r--r--  mm/hugetlb_vmemmap.c         |  1
-rw-r--r--  mm/kfence/report.c           | 13
-rw-r--r--  mm/khugepaged.c              | 35
-rw-r--r--  mm/kmemleak.c                | 61
-rw-r--r--  mm/kmsan/instrumentation.c   |  1
-rw-r--r--  mm/kmsan/kmsan.h             |  2
-rw-r--r--  mm/kmsan/shadow.c            |  1
-rw-r--r--  mm/maccess.c                 |  2
-rw-r--r--  mm/madvise.c                 | 12
-rw-r--r--  mm/memcontrol.c              |  2
-rw-r--r--  mm/memory-failure.c          |  5
-rw-r--r--  mm/memory-tiers.c            |  8
-rw-r--r--  mm/memory.c                  |  2
-rw-r--r--  mm/mempolicy.c               | 17
-rw-r--r--  mm/memremap.c                |  1
-rw-r--r--  mm/migrate.c                 |  7
-rw-r--r--  mm/migrate_device.c          |  8
-rw-r--r--  mm/mmap.c                    | 29
-rw-r--r--  mm/page_alloc.c              | 28
-rw-r--r--  mm/page_ext.c                |  2
-rw-r--r--  mm/page_isolation.c          |  2
-rw-r--r--  mm/shmem.c                   | 17
-rw-r--r--  mm/slab_common.c             | 24
-rw-r--r--  mm/swapfile.c                |  8
-rw-r--r--  mm/userfaultfd.c             | 27
-rw-r--r--  mm/vmscan.c                  | 72
-rw-r--r--  mm/zsmalloc.c                |  3
32 files changed, 332 insertions(+), 146 deletions(-)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 6f0ae7d3ae39..b3f454a5c682 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -890,6 +890,7 @@ out:
static int dbgfs_rm_context(char *name)
{
struct dentry *root, *dir, **new_dirs;
+ struct inode *inode;
struct damon_ctx **new_ctxs;
int i, j;
int ret = 0;
@@ -905,6 +906,12 @@ static int dbgfs_rm_context(char *name)
if (!dir)
return -ENOENT;
+ inode = d_inode(dir);
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = -EINVAL;
+ goto out_dput;
+ }
+
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
if (!new_dirs) {
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 9f1219a67e3f..5ce403378c20 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2339,6 +2339,10 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
damon_for_each_scheme(scheme, ctx) {
struct damon_sysfs_stats *sysfs_stats;
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ break;
+
sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
sysfs_stats->nr_tried = scheme->stat.nr_tried;
sysfs_stats->sz_tried = scheme->stat.sz_tried;
diff --git a/mm/failslab.c b/mm/failslab.c
index 58df9789f1d2..ffc420c0e767 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -16,6 +16,8 @@ static struct {
bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ int flags = 0;
+
/* No fault-injection for bootstrap cache */
if (unlikely(s == kmem_cache))
return false;
@@ -30,10 +32,16 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
+ /*
+ * In some cases, it expects to specify __GFP_NOWARN
+ * to avoid printing any information(not just a warning),
+ * thus avoiding deadlocks. See commit 6b9dbedbe349 for
+ * details.
+ */
if (gfpflags & __GFP_NOWARN)
- failslab.attr.no_warn = true;
+ flags |= FAULT_NOWARN;
- return should_fail(&failslab.attr, s->object_size);
+ return should_fail_ex(&failslab.attr, s->object_size, flags);
}
static int __init setup_failslab(char *str)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cc4a5f4791e..811d19b5c4f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,9 +2206,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
- /* NOTE: this may set soft-dirty too on some archs */
- if (dirty)
- entry = pte_mkdirty(entry);
+ /*
+ * NOTE: we don't do pte_mkdirty when dirty==true
+ * because it breaks sparc64 which can sigsegv
+ * random process. Need to revisit when we figure
+ * out what is special with sparc64.
+ */
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
@@ -2455,7 +2458,16 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
- page_tail->private = 0;
+
+ /*
+ * page->private should not be set in tail pages with the exception
+ * of swap cache pages that store the swp_entry_t in tail pages.
+ * Fix up and warn once if private is unexpectedly set.
+ */
+ if (!folio_test_swapcache(page_folio(head))) {
+ VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
+ page_tail->private = 0;
+ }
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b586cdd75930..f1385c3b6c96 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1014,15 +1014,23 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
/*
* Clear vm_private_data
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated in a subsequent call to hugetlb_vm_op_open.
+ * Before clearing, make sure pointer is not associated with vma
+ * as this will leak the structure. This is the case when called
+ * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
+ * been called to allocate a new structure.
* - For MAP_PRIVATE mappings, this is the reserve map which does
* not apply to children. Faults generated by the children are
* not guaranteed to succeed, even if read-only.
- * - For shared mappings this is a per-vma semaphore that may be
- * allocated in a subsequent call to hugetlb_vm_op_open.
*/
- vma->vm_private_data = (void *)0;
- if (!(vma->vm_flags & VM_MAYSHARE))
- return;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock && vma_lock->vma != vma)
+ vma->vm_private_data = NULL;
+ } else
+ vma->vm_private_data = NULL;
}
/*
@@ -1792,6 +1800,7 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
+ __ClearPageReserved(page);
__SetPageHead(page);
for (i = 0; i < nr_pages; i++) {
p = nth_page(page, i);
@@ -1808,7 +1817,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
* on the head page when they need know if put_page() is needed
* after get_user_pages().
*/
- __ClearPageReserved(p);
+ if (i != 0) /* head page cleared above */
+ __ClearPageReserved(p);
/*
* Subtle and very unlikely
*
@@ -2924,11 +2934,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
+ spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
- spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
/* Fall through */
@@ -4601,6 +4611,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
/*
+ * HPAGE_RESV_OWNER indicates a private mapping.
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
@@ -4615,11 +4626,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
/*
* vma_lock structure for sharable mappings is vma specific.
- * Clear old pointer (if copied via vm_area_dup) and create new.
+ * Clear old pointer (if copied via vm_area_dup) and allocate
+ * new structure. Before clearing, make sure vma_lock is not
+ * for this vma.
*/
if (vma->vm_flags & VM_MAYSHARE) {
- vma->vm_private_data = NULL;
- hugetlb_vma_lock_alloc(vma);
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock) {
+ if (vma_lock->vma != vma) {
+ vma->vm_private_data = NULL;
+ hugetlb_vma_lock_alloc(vma);
+ } else
+ pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
+ } else
+ hugetlb_vma_lock_alloc(vma);
}
}
@@ -6092,6 +6113,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba2a2596fb4e..4962dd1ba4a6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "HugeTLB: " fmt
#include <linux/pgtable.h>
+#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 7e496856c2eb..46ecea18c4ca 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -75,18 +75,23 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") ||
!strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
/*
- * In case of tail calls from any of the below
- * to any of the above.
+ * In case of tail calls from any of the below to any of
+ * the above, optimized by the compiler such that the
+ * stack trace would omit the initial entry point below.
*/
fallback = skipnr + 1;
}
- /* Also the *_bulk() variants by only checking prefixes. */
+ /*
+ * The below list should only include the initial entry points
+ * into the slab allocators. Includes the *_bulk() variants by
+ * checking prefixes.
+ */
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
- str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
goto found;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4734315f7940..a8d5ef2a77d2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -97,8 +97,8 @@ struct collapse_control {
/* Num pages scanned per node */
u32 node_load[MAX_NUMNODES];
- /* Last target selected in hpage_collapse_find_target_node() */
- int last_target_node;
+ /* nodemask for allocation fallback */
+ nodemask_t alloc_nmask;
};
/**
@@ -734,7 +734,6 @@ static void khugepaged_alloc_sleep(void)
struct collapse_control khugepaged_collapse_control = {
.is_khugepaged = true,
- .last_target_node = NUMA_NO_NODE,
};
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
@@ -783,16 +782,11 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
target_node = nid;
}
- /* do some balance if several nodes have the same hit record */
- if (target_node <= cc->last_target_node)
- for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == cc->node_load[nid]) {
- target_node = nid;
- break;
- }
+ for_each_online_node(nid) {
+ if (max_value == cc->node_load[nid])
+ node_set(nid, cc->alloc_nmask);
+ }
- cc->last_target_node = target_node;
return target_node;
}
#else
@@ -802,9 +796,10 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
}
#endif
-static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+ nodemask_t *nmask)
{
- *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+ *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
return false;
@@ -955,12 +950,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
struct collapse_control *cc)
{
- /* Only allocate from the target node */
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
- GFP_TRANSHUGE) | __GFP_THISNODE;
+ GFP_TRANSHUGE);
int node = hpage_collapse_find_target_node(cc);
- if (!hpage_collapse_alloc_page(hpage, gfp, node))
+ if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
return SCAN_ALLOC_HUGE_PAGE_FAIL;
if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
return SCAN_CGROUP_CHARGE_FAIL;
@@ -1144,6 +1138,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
goto out;
memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2077,6 +2072,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
present = 0;
swap = 0;
memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
@@ -2157,8 +2153,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
}
}
- trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname,
- present, swap, result);
+ trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
return result;
}
#else
@@ -2576,7 +2571,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
if (!cc)
return -ENOMEM;
cc->is_khugepaged = false;
- cc->last_target_node = NUMA_NO_NODE;
mmgrab(mm);
lru_add_drain_all();
@@ -2602,6 +2596,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
+ nodes_clear(cc->alloc_nmask);
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 37af2dc8dac9..646e2979641f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1461,6 +1461,27 @@ static void scan_gray_list(void)
}
/*
+ * Conditionally call resched() in a object iteration loop while making sure
+ * that the given object won't go away without RCU read lock by performing a
+ * get_object() if !pinned.
+ *
+ * Return: false if can't do a cond_resched() due to get_object() failure
+ * true otherwise
+ */
+static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned)
+{
+ if (!pinned && !get_object(object))
+ return false;
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ if (!pinned)
+ put_object(object);
+ return true;
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1471,7 +1492,7 @@ static void kmemleak_scan(void)
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
- int loop1_cnt = 0;
+ int loop_cnt = 0;
jiffies_last_scan = jiffies;
@@ -1480,7 +1501,6 @@ static void kmemleak_scan(void)
list_for_each_entry_rcu(object, &object_list, object_list) {
bool obj_pinned = false;
- loop1_cnt++;
raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
@@ -1514,24 +1534,11 @@ static void kmemleak_scan(void)
raw_spin_unlock_irq(&object->lock);
/*
- * Do a cond_resched() to avoid soft lockup every 64k objects.
- * Make sure a reference has been taken so that the object
- * won't go away without RCU read lock.
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
*/
- if (!(loop1_cnt & 0xffff)) {
- if (!obj_pinned && !get_object(object)) {
- /* Try the next object instead */
- loop1_cnt--;
- continue;
- }
-
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
-
- if (!obj_pinned)
- put_object(object);
- }
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, obj_pinned))
+ loop_cnt--; /* Try again on next object */
}
rcu_read_unlock();
@@ -1598,8 +1605,16 @@ static void kmemleak_scan(void)
* scan and color them gray until the next scan.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
@@ -1632,8 +1647,16 @@ static void kmemleak_scan(void)
* Scanning result reporting.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 280d15413268..271f135f97a1 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -14,6 +14,7 @@
#include "kmsan.h"
#include <linux/gfp.h>
+#include <linux/kmsan_string.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 7019c46d33a7..a14744205435 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -124,6 +124,8 @@ static __always_inline bool kmsan_in_runtime(void)
{
if ((hardirq_count() >> HARDIRQ_SHIFT) > 1)
return true;
+ if (in_nmi())
+ return true;
return kmsan_get_context()->kmsan_in_runtime;
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 21e3e196ec3c..a787c04e9583 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src)
__memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE);
kmsan_leave_runtime();
}
+EXPORT_SYMBOL(kmsan_copy_page_meta);
void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags)
{
diff --git a/mm/maccess.c b/mm/maccess.c
index 5f4d240f67ec..074f6b086671 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -97,7 +97,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
return src - unsafe_addr;
Efault:
pagefault_enable();
- dst[-1] = '\0';
+ dst[0] = '\0';
return -EFAULT;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 2baa93ca2310..c7105ec6d08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (start & ~huge_page_mask(hstate_vma(vma)))
return false;
- *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+ /*
+ * Madvise callers expect the length to be rounded up to PAGE_SIZE
+ * boundaries, and may be unaware that this VMA uses huge pages.
+ * Avoid unexpected data loss by rounding down the number of
+ * huge pages freed.
+ */
+ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
return true;
}
@@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
+ if (start == end)
+ return 0;
+
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..a1a35c12635e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3026,7 +3026,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || memcg_kmem_bypass())
+ if (!memcg_kmem_enabled())
return NULL;
if (PageMemcgKmem(page)) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index f116b7b6333e..fa8c9d07f9ce 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev)
kfree(tier);
}
-static ssize_t nodes_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t nodelist_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
int ret;
nodemask_t nmask;
@@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev,
mutex_unlock(&memory_tier_lock);
return ret;
}
-static DEVICE_ATTR_RO(nodes);
+static DEVICE_ATTR_RO(nodelist);
static struct attribute *memtier_dev_attrs[] = {
- &dev_attr_nodes.attr,
+ &dev_attr_nodelist.attr,
NULL
};
diff --git a/mm/memory.c b/mm/memory.c
index f88c351aecd4..8a6d5c823f91 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3763,7 +3763,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a937eaec5b68..61aa9aedb728 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -787,17 +787,22 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- MA_STATE(mas, &mm->mm_mt, start - 1, start - 1);
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- prev = mas_find_rev(&mas, 0);
- if (prev && (start < prev->vm_end))
- vma = prev;
- else
- vma = mas_next(&mas, end - 1);
+ prev = mas_prev(&mas, 0);
+ if (unlikely(!prev))
+ mas_set(&mas, start);
+
+ vma = mas_find(&mas, end - 1);
+ if (WARN_ON(!vma))
+ return 0;
+
+ if (start > vma->vm_start)
+ prev = vma;
for (; vma; vma = mas_next(&mas, end - 1)) {
unsigned long vmstart = max(start, vma->vm_start);
diff --git a/mm/memremap.c b/mm/memremap.c
index 421bec3a29ee..08cbf54fe037 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -335,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
+ params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
break;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1379e1912772..dff333593a8a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1582,6 +1582,13 @@ out:
*/
list_splice(&ret_pages, from);
+ /*
+ * Return 0 in case all subpages of fail-to-migrate THPs are
+ * migrated successfully.
+ */
+ if (list_empty(from))
+ rc = 0;
+
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6fa682eef7a0..721b2365dbca 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -357,7 +357,8 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
}
/*
- * Unmaps pages for migration. Returns number of unmapped pages.
+ * Unmaps pages for migration. Returns number of source pfns marked as
+ * migrating.
*/
static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
@@ -373,8 +374,11 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
- if (!page)
+ if (!page) {
+ if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
+ unmapped++;
continue;
+ }
/* ZONE_DEVICE pages are not on LRU */
if (!is_zone_device_page(page)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index bf2122af94e7..74a84eb33b90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -456,7 +456,7 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
* vma_mas_szero() - Set a given range to zero. Used when modifying a
* vm_area_struct start or end.
*
- * @mm: The struct_mm
+ * @mas: The maple tree ma_state
* @start: The start address to zero
* @end: The end address to zero.
*/
@@ -618,7 +618,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *next_next = NULL; /* uninit var warning */
+ struct vm_area_struct *next = find_vma(mm, vma->vm_end);
struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
@@ -2625,14 +2626,14 @@ cannot_expand:
if (error)
goto unmap_and_free_vma;
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
+ /*
+ * Expansion is handled above, merging is handled below.
+ * Drivers should not alter the address of the VMA.
*/
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
+ if (WARN_ON((addr != vma->vm_start))) {
+ error = -EINVAL;
+ goto close_and_free_vma;
+ }
mas_reset(&mas);
/*
@@ -2654,7 +2655,6 @@ cannot_expand:
vm_area_free(vma);
vma = merge;
/* Update vm_flags to pick up the change. */
- addr = vma->vm_start;
vm_flags = vma->vm_flags;
goto unmap_writable;
}
@@ -2674,6 +2674,8 @@ cannot_expand:
error = -EINVAL;
if (file)
goto close_and_free_vma;
+ else if (vma->vm_file)
+ goto unmap_and_free_vma;
else
goto free_vma;
}
@@ -2681,6 +2683,8 @@ cannot_expand:
if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
error = -ENOMEM;
if (file)
+ goto close_and_free_vma;
+ else if (vma->vm_file)
goto unmap_and_free_vma;
else
goto free_vma;
@@ -2751,7 +2755,7 @@ unmap_and_free_vma:
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
- if (vm_flags & VM_SHARED)
+ if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
@@ -2852,6 +2856,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (next->vm_flags != vma->vm_flags)
goto out;
+ if (start + size <= next->vm_end)
+ break;
+
prev = next;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e20ade858e71..6e60657875d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx)
p->mapping = TAIL_MAPPING;
set_compound_head(p, head);
+ set_page_private(p, 0);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -3886,6 +3887,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
+ int flags = 0;
+
if (order < fail_page_alloc.min_order)
return false;
if (gfp_mask & __GFP_NOFAIL)
@@ -3896,10 +3899,11 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
+ /* See comment in __should_failslab() */
if (gfp_mask & __GFP_NOWARN)
- fail_page_alloc.attr.no_warn = true;
+ flags |= FAULT_NOWARN;
- return should_fail(&fail_page_alloc.attr, 1 << order);
+ return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -5784,14 +5788,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
-
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
+ unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct page *page = virt_to_page((void *)addr);
+ struct page *last = page + nr;
+
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
+ while (page < --last)
+ set_page_refcounted(last);
+
+ last = page + (1UL << order);
+ for (page += nr; page < last; page++)
+ __free_pages_ok(page, 0, FPI_TO_TAIL);
}
return (void *)addr;
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index affe80243b6d..ddf1968560f0 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -166,7 +166,7 @@ struct page_ext *page_ext_get(struct page *page)
/**
* page_ext_put() - Working with page extended information is done.
- * @page_ext - Page extended information received from page_ext_get().
+ * @page_ext: Page extended information received from page_ext_get().
*
* The page extended information of the page may not be valid after this
* function is called.
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 04141a9bea70..47fbc1696466 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
if (skip_isolation) {
- int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
VM_BUG_ON(!is_migrate_isolate(mt));
} else {
diff --git a/mm/shmem.c b/mm/shmem.c
index 8280a5cb48df..c1d8b8a1aa3b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!zeropage) { /* COPY */
page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
+ pagefault_enable();
kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 33b1886b06eb..0042fb2730d1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -941,7 +941,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
ret = __kmalloc_large_node(size, flags, node);
- trace_kmalloc(_RET_IP_, ret, size,
+ trace_kmalloc(caller, ret, size,
PAGE_SIZE << get_order(size), flags, node);
return ret;
}
@@ -953,7 +953,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller
ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
ret = kasan_kmalloc(s, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node);
+ trace_kmalloc(caller, ret, size, s->size, flags, node);
return ret;
}
@@ -1010,7 +1010,7 @@ EXPORT_SYMBOL(kfree);
/**
* __ksize -- Report full size of underlying allocation
- * @objp: pointer to the object
+ * @object: pointer to the object
*
* This should only be used internally to query the true size of allocations.
* It is not meant to be a way to discover the usable size of an allocation
@@ -1018,7 +1018,7 @@ EXPORT_SYMBOL(kfree);
* the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
* and/or FORTIFY_SOURCE.
*
- * Return: size of the actual memory used by @objp in bytes
+ * Return: size of the actual memory used by @object in bytes
*/
size_t __ksize(const void *object)
{
@@ -1040,7 +1040,6 @@ size_t __ksize(const void *object)
return slab_ksize(folio_slab(folio)->slab_cache);
}
-#ifdef CONFIG_TRACING
void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
@@ -1064,7 +1063,6 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
return ret;
}
EXPORT_SYMBOL(kmalloc_node_trace);
-#endif /* !CONFIG_TRACING */
#endif /* !CONFIG_SLOB */
gfp_t kmalloc_fix_flags(gfp_t flags)
@@ -1411,20 +1409,6 @@ void kfree_sensitive(const void *p)
}
EXPORT_SYMBOL(kfree_sensitive);
-/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
- *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
size_t ksize(const void *objp)
{
size_t size;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5fc1237a9f21..72e481aacd5d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -973,23 +973,23 @@ done:
scan:
spin_unlock(&si->lock);
while (++offset <= READ_ONCE(si->highest_bit)) {
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
scanned_many = true;
}
+ if (swap_offset_available_and_locked(si, offset))
+ goto checks;
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
scanned_many = true;
}
+ if (swap_offset_available_and_locked(si, offset))
+ goto checks;
offset++;
}
spin_lock(&si->lock);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e24e8a47ce8a..650ab6cfd5f4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -64,7 +64,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
- bool page_in_cache = page->mapping;
+ bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
struct inode *inode;
pgoff_t offset, max_off;
@@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
- page_kaddr = kmap_atomic(page);
+ page_kaddr = kmap_local_page(page);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
@@ -646,11 +663,11 @@ retry:
mmap_read_unlock(dst_mm);
BUG_ON(!page);
- page_kaddr = kmap(page);
+ page_kaddr = kmap_local_page(page);
err = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap(page);
+ kunmap_local(page_kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..026199c047e0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2514,8 +2514,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
- if (stat.nr_unqueued_dirty == nr_taken)
+ if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+ * For cgroupv1 dirty throttling is achieved by waking up
+ * the kernel flusher here and later waiting on folios
+ * which are in writeback to finish (see shrink_folio_list()).
+ *
+ * Flusher may not be able to issue writeback quickly
+ * enough for cgroupv1 writeback throttling to work
+ * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
sc->nr.dirty += stat.nr_dirty;
sc->nr.congested += stat.nr_congested;
@@ -4971,10 +4983,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
int scanned;
int reclaimed;
LIST_HEAD(list);
+ LIST_HEAD(clean);
struct folio *folio;
+ struct folio *next;
enum vm_event_item item;
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
+ bool skip_retry = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -4991,20 +5006,37 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
-
+retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr_reclaimed += reclaimed;
- list_for_each_entry(folio, &list, lru) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
+ list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ if (!folio_evictable(folio)) {
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ continue;
+ }
- /* don't add rejected pages to the oldest generation */
if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio)))
- folio_clear_active(folio);
- else
- folio_set_active(folio);
+ (folio_test_dirty(folio) || folio_test_writeback(folio))) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+ if (folio_test_workingset(folio))
+ folio_set_referenced(folio);
+ continue;
+ }
+
+ if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
+ folio_mapped(folio) || folio_test_locked(folio) ||
+ folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ /* don't add rejected folios to the oldest generation */
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+ BIT(PG_active));
+ continue;
+ }
+
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ list_move(&folio->lru, &clean);
+ sc->nr_scanned -= folio_nr_pages(folio);
}
spin_lock_irq(&lruvec->lru_lock);
@@ -5026,7 +5058,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
mem_cgroup_uncharge_list(&list);
free_unref_page_list(&list);
- sc->nr_reclaimed += reclaimed;
+ INIT_LIST_HEAD(&list);
+ list_splice_init(&clean, &list);
+
+ if (!list_empty(&list)) {
+ skip_retry = true;
+ goto retry;
+ }
if (need_swapping && type == LRU_GEN_ANON)
*need_swapping = true;
@@ -5844,8 +5882,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
if (lru_gen_enabled()) {
lru_gen_shrink_lruvec(lruvec, sc);
@@ -5868,8 +5906,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -5889,7 +5927,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
@@ -5940,8 +5978,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 525758713a55..d03941cace2c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2311,6 +2311,9 @@ void zs_destroy_pool(struct zs_pool *pool)
int fg;
struct size_class *class = pool->size_class[i];
+ if (!class)
+ continue;
+
if (class->index != i)
continue;