Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  3
-rw-r--r--  mm/compaction.c       | 43
-rw-r--r--  mm/filemap.c          |  3
-rw-r--r--  mm/huge_memory.c      | 18
-rw-r--r--  mm/init-mm.c          |  3
-rw-r--r--  mm/kasan/generic.c    | 10
-rw-r--r--  mm/khugepaged.c       |  1
-rw-r--r--  mm/memblock.c         |  5
-rw-r--r--  mm/memcontrol.c       | 29
-rw-r--r--  mm/memory-failure.c   |  2
-rw-r--r--  mm/memory-tiers.c     | 12
-rw-r--r--  mm/memory.c           |  2
-rw-r--r--  mm/memory_hotplug.c   |  8
-rw-r--r--  mm/migrate.c          |  2
-rw-r--r--  mm/mm_init.c          |  6
-rw-r--r--  mm/mmap.c             |  6
-rw-r--r--  mm/page-writeback.c   |  2
-rw-r--r--  mm/page_alloc.c       |  7
-rw-r--r--  mm/percpu.c           |  8
-rw-r--r--  mm/readahead.c        |  4
-rw-r--r--  mm/shmem.c            |  1
-rw-r--r--  mm/swapfile.c         |  1
-rw-r--r--  mm/userfaultfd.c      | 21
23 files changed, 146 insertions(+), 51 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 1902cfe4cc4f..ffc3a2ba3a8c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1258,6 +1258,9 @@ config LOCK_MM_AND_FIND_VMA
bool
depends on !STACK_GROWSUP
+config IOMMU_MM_DATA
+ bool
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/compaction.c b/mm/compaction.c
index 27ada42924d5..4add68d40e8d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -882,6 +882,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
+ bool is_dirty, is_unevictable;
if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
@@ -1079,8 +1080,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!folio_test_lru(folio))
goto isolate_fail_put;
+ is_unevictable = folio_test_unevictable(folio);
+
/* Compaction might skip unevictable pages but CMA takes them */
- if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
+ if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
goto isolate_fail_put;
/*
@@ -1092,26 +1095,42 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
goto isolate_fail_put;
- if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
- bool migrate_dirty;
+ is_dirty = folio_test_dirty(folio);
+
+ if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
+ (mapping && is_unevictable)) {
+ bool migrate_dirty = true;
+ bool is_unmovable;
/*
* Only folios without mappings or that have
- * a ->migrate_folio callback are possible to
- * migrate without blocking. However, we may
- * be racing with truncation, which can free
- * the mapping. Truncation holds the folio lock
- * until after the folio is removed from the page
- * cache so holding it ourselves is sufficient.
+ * a ->migrate_folio callback are possible to migrate
+ * without blocking.
+ *
+ * Folios from unmovable mappings are not migratable.
+ *
+ * However, we can be racing with truncation, which can
+ * free the mapping that we need to check. Truncation
+ * holds the folio lock until after the folio is removed
+ * from the page cache so holding it ourselves is sufficient.
+ *
+ * To avoid locking the folio just to check unmovable,
+ * assume every unmovable folio is also unevictable,
+ * which is a cheaper test. If our assumption goes
+ * wrong, it's not a correctness bug, just potentially
+ * wasted cycles.
*/
if (!folio_trylock(folio))
goto isolate_fail_put;
mapping = folio_mapping(folio);
- migrate_dirty = !mapping ||
- mapping->a_ops->migrate_folio;
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) {
+ migrate_dirty = !mapping ||
+ mapping->a_ops->migrate_folio;
+ }
+ is_unmovable = mapping && mapping_unmovable(mapping);
folio_unlock(folio);
- if (!migrate_dirty)
+ if (!migrate_dirty || is_unmovable)
goto isolate_fail_put;
}
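
The rewritten block folds the dirty-folio and unmovable-mapping checks into a single folio_trylock() section, with the unevictable flag used as a cheap pre-filter for unmovable mappings. A minimal userspace sketch of that decision, assuming simplified stand-ins (may_isolate(), struct fake_mapping) rather than the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct fake_mapping {
	bool has_migrate_folio;		/* stands in for a_ops->migrate_folio */
	bool unmovable;			/* stands in for mapping_unmovable() */
};

/* Decide whether a folio may be isolated; async == ISOLATE_ASYNC_MIGRATE. */
static bool may_isolate(bool async, bool dirty, bool unevictable,
			const struct fake_mapping *mapping)
{
	bool migrate_dirty = true;

	/* Only pay for the locked check when it can change the outcome. */
	if ((async && dirty) || (mapping && unevictable)) {
		if (async && dirty)
			migrate_dirty = !mapping || mapping->has_migrate_folio;
		if (!migrate_dirty || (mapping && mapping->unmovable))
			return false;
	}
	return true;
}

int main(void)
{
	struct fake_mapping movable = { .has_migrate_folio = true };
	struct fake_mapping pinned = { .unmovable = true };

	printf("%d\n", may_isolate(true, true, false, &movable));	/* 1 */
	printf("%d\n", may_isolate(true, true, false, NULL));		/* 1: no mapping */
	printf("%d\n", may_isolate(false, false, true, &pinned));	/* 0: unmovable */
	return 0;
}
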
diff --git a/mm/filemap.c b/mm/filemap.c
index c8dafe70d4cc..750e779c23db 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -45,6 +45,7 @@
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
+#include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -2687,6 +2688,7 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
return filemap_write_and_wait_range(mapping, pos, end);
}
+EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
@@ -2714,6 +2716,7 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
+EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**
* generic_file_read_iter - generic filesystem read routine
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94ef5c02b459..94c958f7ebb5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -37,6 +37,7 @@
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
+#include <linux/compat.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -809,7 +810,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
{
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad, ret;
+ unsigned long len_pad, ret, off_sub;
+
+ if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
+ return 0;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -835,7 +839,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (ret == addr)
return addr;
- ret += (off - ret) & (size - 1);
+ off_sub = (off - ret) & (size - 1);
+
+ if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
+ !off_sub)
+ return ret + size;
+
+ ret += off_sub;
return ret;
}
@@ -2437,7 +2447,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(old_pmd);
folio = page_folio(page);
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
folio_set_referenced(folio);
folio_remove_rmap_pmd(folio, page, vma);
@@ -3563,7 +3573,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
}
if (pmd_dirty(pmdval))
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (pmd_write(pmdval))
entry = make_writable_migration_entry(page_to_pfn(page));
else if (anon_exclusive)
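
The off_sub arithmetic in __thp_get_unmapped_area() shifts the address found in the padded search range so that it shares the file offset's alignment within the huge page size; the new top-down special case keeps the upper, size-aligned part of that range when the offset is already aligned. A worked example of the shift with made-up values (2 MiB huge pages, 64-bit build), not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long size = 2UL << 20;			/* assume 2 MiB THP size */
	unsigned long off = 0x1ff000;			/* file offset of the mapping */
	unsigned long ret = 0x7f0000000000UL;		/* address found in padded range */
	unsigned long off_sub = (off - ret) & (size - 1);

	/* ret + off_sub is congruent to off modulo the huge page size */
	printf("off_sub = %#lx\n", off_sub);		/* 0x1ff000 */
	printf("addr    = %#lx\n", ret + off_sub);	/* 0x7f00001ff000 */
	printf("aligned = %d\n",
	       ((ret + off_sub) & (size - 1)) == (off & (size - 1)));	/* 1 */
	return 0;
}
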
diff --git a/mm/init-mm.c b/mm/init-mm.c
index cfd367822cdd..24c809379274 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -44,9 +44,6 @@ struct mm_struct init_mm = {
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
-#ifdef CONFIG_IOMMU_SVA
- .pasid = IOMMU_PASID_INVALID,
-#endif
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 24c13dfb1e94..df6627f62402 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -487,6 +487,7 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
__memset(alloc_meta, 0, sizeof(*alloc_meta));
/*
+ * Prepare the lock for saving auxiliary stack traces.
* Temporarily disable KASAN bug reporting to allow instrumented
* raw_spin_lock_init to access aux_lock, which resides inside
* of a redzone.
@@ -510,8 +511,13 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta)
stack_depot_put(meta->aux_stack[0]);
stack_depot_put(meta->aux_stack[1]);
- /* Zero out alloc meta to mark it as invalid. */
- __memset(meta, 0, sizeof(*meta));
+ /*
+ * Zero out alloc meta to mark it as invalid but keep aux_lock
+ * initialized to avoid having to reinitialize it when another object
+ * is allocated in the same slot.
+ */
+ __memset(&meta->alloc_track, 0, sizeof(meta->alloc_track));
+ __memset(meta->aux_stack, 0, sizeof(meta->aux_stack));
}
static void release_free_meta(const void *object, struct kasan_free_meta *meta)
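
release_alloc_meta() now clears only alloc_track and aux_stack so the embedded aux_lock stays initialized when the slot is reused. A tiny standalone illustration of zeroing selected members while leaving another untouched; struct meta below is a made-up stand-in, not the real KASAN layout:

#include <stdio.h>
#include <string.h>

struct meta {
	unsigned long alloc_track[2];
	long aux_lock;			/* stands in for the raw spinlock we keep */
	unsigned long aux_stack[2];
};

int main(void)
{
	struct meta m = { { 1, 2 }, 42, { 3, 4 } };

	memset(&m.alloc_track, 0, sizeof(m.alloc_track));
	memset(&m.aux_stack, 0, sizeof(m.aux_stack));
	printf("aux_lock still %ld, stacks %lu/%lu\n",
	       m.aux_lock, m.aux_stack[0], m.aux_stack[1]);	/* 42, 0/0 */
	return 0;
}
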
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3defe6713ef1..2b219acb528e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,6 +17,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
+#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>
diff --git a/mm/memblock.c b/mm/memblock.c
index 8c194d8afeec..4dcb2ee35eca 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1885,7 +1885,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
int mid = memblock_search(type, PFN_PHYS(pfn));
if (mid == -1)
- return -1;
+ return NUMA_NO_NODE;
*start_pfn = PFN_DOWN(type->regions[mid].base);
*end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
@@ -2176,6 +2176,9 @@ static void __init memmap_init_reserved_pages(void)
start = region->base;
end = start + region->size;
+ if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES)
+ nid = early_pfn_to_nid(PFN_DOWN(start));
+
reserve_bootmem_region(start, end, nid);
}
}
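
The added check falls back to a PFN-based node lookup when a reserved region has no valid nid. A small sketch of that fallback, assuming a hypothetical fake_pfn_to_nid() in place of early_pfn_to_nid() and a toy MAX_NUMNODES:

#include <stdio.h>

#define NUMA_NO_NODE	(-1)
#define MAX_NUMNODES	4		/* assumed small config for illustration */

/* hypothetical lookup: node 0 below pfn 1000, node 1 otherwise */
static int fake_pfn_to_nid(unsigned long pfn)
{
	return pfn < 1000 ? 0 : 1;
}

static int resolve_nid(int nid, unsigned long start_pfn)
{
	if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES)
		return fake_pfn_to_nid(start_pfn);
	return nid;
}

int main(void)
{
	printf("%d\n", resolve_nid(NUMA_NO_NODE, 500));	/* 0: resolved from PFN */
	printf("%d\n", resolve_nid(2, 500));		/* 2: already valid */
	printf("%d\n", resolve_nid(99, 2000));		/* 1: bogus nid, resolved */
	return 0;
}
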
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e4c8735e7c85..46d8d02114cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2623,8 +2623,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
}
/*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Reclaims memory over the high limit. Called directly from
+ * try_charge() (context permitting), as well as from the userland
+ * return path where reclaim is always able to block.
*/
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
@@ -2644,6 +2645,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
retry_reclaim:
/*
+ * Bail if the task is already exiting. Unlike memory.max,
+ * memory.high enforcement isn't as strict, and there is no
+ * OOM killer involved, which means the excess could already
+ * be much bigger (and still growing) than it could for
+ * memory.max; the dying task could get stuck in fruitless
+ * reclaim for a long time, which isn't desirable.
+ */
+ if (task_is_dying())
+ goto out;
+
+ /*
* The allocating task should reclaim at least the batch size, but for
* subsequent retries we only want to do what's necessary to prevent oom
* or breaching resource isolation.
@@ -2693,6 +2705,9 @@ retry_reclaim:
}
/*
+ * Reclaim didn't manage to push usage below the limit; slow
+ * this allocating task down.
+ *
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
@@ -2887,11 +2902,17 @@ done_restock:
}
} while ((memcg = parent_mem_cgroup(memcg)));
+ /*
+ * Reclaim is set up above to be called from the userland
+ * return path. But also attempt synchronous reclaim to avoid
+ * excessive overrun while the task is still inside the
+ * kernel. If this is successful, the return path will see it
+ * when it rechecks the overage and simply bail out.
+ */
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
- gfpflags_allow_blocking(gfp_mask)) {
+ gfpflags_allow_blocking(gfp_mask))
mem_cgroup_handle_over_high(gfp_mask);
- }
return 0;
}
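
With this change try_charge() attempts over-high reclaim synchronously whenever the context can block, rather than always deferring it to the return-to-userspace path. A compact sketch of just that gate, with a simplified signature and an assumed MEMCG_CHARGE_BATCH value:

#include <stdbool.h>
#include <stdio.h>

#define MEMCG_CHARGE_BATCH 64U		/* assumed value for illustration */

static bool reclaim_over_high_now(unsigned int nr_pages_over_high,
				  bool pf_memalloc, bool gfp_allows_blocking)
{
	return nr_pages_over_high > MEMCG_CHARGE_BATCH &&
	       !pf_memalloc && gfp_allows_blocking;
}

int main(void)
{
	/* big overrun from a sleepable context: reclaim before returning */
	printf("%d\n", reclaim_over_high_now(512, false, true));	/* 1 */
	/* non-blocking allocation: leave it to the return-to-user path */
	printf("%d\n", reclaim_over_high_now(512, false, false));	/* 0 */
	return 0;
}
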
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f9b61f4a668..636280d04008 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -982,7 +982,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
int count = page_count(p) - 1;
if (extra_pins)
- count -= 1;
+ count -= folio_nr_pages(page_folio(p));
if (count > 0) {
pr_err("%#lx: %s still referenced by %d users\n",
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 8d5291add2bc..5462d9e3c84c 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -109,7 +109,7 @@ static struct demotion_nodes *node_demotion __read_mostly;
static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
static bool default_dram_perf_error;
-static struct node_hmem_attrs default_dram_perf;
+static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;
@@ -601,15 +601,15 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
-static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
+static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
- prefix, attrs->read_latency, attrs->write_latency,
- attrs->read_bandwidth, attrs->write_bandwidth);
+ prefix, coord->read_latency, coord->write_latency,
+ coord->read_bandwidth, coord->write_bandwidth);
}
-int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
const char *source)
{
int rc = 0;
@@ -666,7 +666,7 @@ out:
return rc;
}
-int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
if (default_dram_perf_error)
return -EIO;
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..89bcae0b224d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1464,7 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
delay_rmap = 0;
if (!folio_test_anon(folio)) {
if (pte_dirty(ptent)) {
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (tlb_delay_rmap(tlb)) {
delay_rmap = 1;
force_flush = 1;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b3c0ff52bb72..21890994c1d3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -101,9 +101,11 @@ static int set_memmap_mode(const char *val, const struct kernel_param *kp)
static int get_memmap_mode(char *buffer, const struct kernel_param *kp)
{
- if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE)
- return sprintf(buffer, "force\n");
- return param_get_bool(buffer, kp);
+ int mode = *((int *)kp->arg);
+
+ if (mode == MEMMAP_ON_MEMORY_FORCE)
+ return sprintf(buffer, "force\n");
+ return sprintf(buffer, "%c\n", mode ? 'Y' : 'N');
}
static const struct kernel_param_ops memmap_mode_ops = {
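
With the change above, the mode reads back as "force", "Y", or "N" depending on the stored integer. A standalone sketch; the enum names are assumed to mirror memory_hotplug.c and the buffer handling is simplified:

#include <stdio.h>

enum {
	MEMMAP_ON_MEMORY_DISABLE = 0,	/* assumed to match the kernel enum */
	MEMMAP_ON_MEMORY_ENABLE,
	MEMMAP_ON_MEMORY_FORCE,
};

static int get_mode(char *buffer, int mode)
{
	if (mode == MEMMAP_ON_MEMORY_FORCE)
		return sprintf(buffer, "force\n");
	return sprintf(buffer, "%c\n", mode ? 'Y' : 'N');
}

int main(void)
{
	char buf[8];
	int mode;

	for (mode = MEMMAP_ON_MEMORY_DISABLE; mode <= MEMMAP_ON_MEMORY_FORCE; mode++) {
		get_mode(buf, mode);
		printf("%s", buf);	/* N, Y, force */
	}
	return 0;
}
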
diff --git a/mm/migrate.c b/mm/migrate.c
index bde8273cf15b..cc9f2bcd73b4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -962,6 +962,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
if (!mapping)
rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping_unmovable(mapping))
+ rc = -EOPNOTSUPP;
else if (mapping->a_ops->migrate_folio)
/*
* Most folios have a mapping and most filesystems
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 89dc29f1e6c6..2c19f5515e36 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/cma.h>
+#include <linux/crash_dump.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"
@@ -381,6 +382,11 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
}
+ if (is_kdump_kernel()) {
+ pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
+ goto out;
+ }
+
for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
diff --git a/mm/mmap.c b/mm/mmap.c
index b78e83d351d2..d89770eaab6b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1825,15 +1825,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap() will clear pgoff, so match alignment.
*/
- pgoff = 0;
get_area = shmem_get_unmapped_area;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
/* Ensures that larger anonymous mappings are THP aligned. */
get_area = thp_get_unmapped_area;
}
+ /* Always treat pgoff as zero for anonymous memory. */
+ if (!file)
+ pgoff = 0;
+
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..02147b61712b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1638,7 +1638,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
*/
dtc->wb_thresh = __wb_calc_thresh(dtc);
dtc->wb_bg_thresh = dtc->thresh ?
- div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
+ div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
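
div_u64() takes a 32-bit divisor, so a dtc->thresh that no longer fits in 32 bits would be truncated before the divide; div64_u64() keeps the full 64-bit value. A quick userspace comparison using made-up magnitudes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t wb_thresh = 1ULL << 33;
	uint64_t bg_thresh = 1ULL << 20;
	uint64_t thresh = (1ULL << 32) + (1ULL << 20);	/* does not fit in u32 */

	uint64_t full = wb_thresh * bg_thresh / thresh;			/* div64_u64-like */
	uint64_t trunc = wb_thresh * bg_thresh / (uint32_t)thresh;	/* div_u64-like */

	printf("64-bit divisor:   %llu\n", (unsigned long long)full);	/* ~2.1e6 */
	printf("truncated to u32: %llu\n", (unsigned long long)trunc);	/* 2^33: far too large */
	return 0;
}
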
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a01baf0454f8..150d4f23b010 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -916,6 +916,9 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
+#ifdef CONFIG_PAGE_POOL
+ ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
+#endif
(page->flags & check_flags)))
return false;
@@ -942,6 +945,10 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
+#ifdef CONFIG_PAGE_POOL
+ if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ bad_reason = "page_pool leak";
+#endif
return bad_reason;
}
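
The new test flags a page freed back to the buddy allocator while its pp_magic still carries the page_pool signature, i.e. a page that never went back through the pool. A toy version of the mask-and-compare; PP_SIGNATURE below is a placeholder, not the kernel's exact value (which includes a poison offset):

#include <stdio.h>

#define PP_SIGNATURE 0x40UL	/* placeholder value for illustration */

static int looks_like_pp_page(unsigned long pp_magic)
{
	/* the low two bits can carry unrelated state, so ignore them */
	return (pp_magic & ~0x3UL) == PP_SIGNATURE;
}

int main(void)
{
	printf("%d\n", looks_like_pp_page(PP_SIGNATURE));		/* 1: leaked pp page */
	printf("%d\n", looks_like_pp_page(PP_SIGNATURE | 0x2));	/* 1: low bits ignored */
	printf("%d\n", looks_like_pp_page(0));				/* 0: ordinary page */
	return 0;
}
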
diff --git a/mm/percpu.c b/mm/percpu.c
index 7b97d31df767..4e11fc1e6def 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3333,13 +3333,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
if (rc < 0)
panic("failed to map percpu area, err=%d\n", rc);
- /*
- * FIXME: Archs with virtual cache should flush local
- * cache for the linear mapping here - something
- * equivalent to flush_cache_vmap() on the local cpu.
- * flush_cache_vmap() can't be used as most supporting
- * data structures are not set up yet.
- */
+ flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
diff --git a/mm/readahead.c b/mm/readahead.c
index 23620c57c122..2648ec4f0494 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -469,7 +469,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (!folio)
return -ENOMEM;
- mark = round_up(mark, 1UL << order);
+ mark = round_down(mark, 1UL << order);
if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
@@ -575,7 +575,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- expected = round_up(ra->start + ra->size - ra->async_size,
+ expected = round_down(ra->start + ra->size - ra->async_size,
1UL << order);
if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
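
Both readahead hunks round down to the folio boundary rather than up, so the comparison hits the folio that actually contains the intended index. A worked example with an order-2 (4-page) folio and made-up indices:

#include <stdio.h>

#define ROUND_UP(x, a)		((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a)	(((x) / (a)) * (a))

int main(void)
{
	unsigned long mark = 5;		/* index where PG_readahead should be set */
	unsigned long nr = 1UL << 2;	/* order-2 folio spans indices 4..7 */

	/* the folio allocated at index 4 is the one that covers index 5 */
	printf("round_up:   %lu\n", ROUND_UP(mark, nr));	/* 8: skips that folio */
	printf("round_down: %lu\n", ROUND_DOWN(mark, nr));	/* 4: matches its index */
	return 0;
}
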
diff --git a/mm/shmem.c b/mm/shmem.c
index 928aa2304932..d7c84ff62186 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -79,6 +79,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
+#include <linux/rcupdate_wait.h>
#include <linux/uaccess.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3eec686484ef..556ff7347d5f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
+#include <linux/plist.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 216ab4c8621f..75fcf1f783bc 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -472,6 +473,15 @@ retry:
goto out;
}
mmap_read_lock(dst_mm);
+ /*
+ * If memory mappings are changing because of a non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ if (mmap_changing && atomic_read(mmap_changing)) {
+ err = -EAGAIN;
+ break;
+ }
dst_vma = NULL;
goto retry;
@@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -622,8 +633,8 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
+ len, mmap_changing, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -1393,6 +1404,12 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
err = -ENOENT;
break;
}
+ /* Avoid moving zeropages for now */
+ if (is_huge_zero_pmd(*src_pmd)) {
+ spin_unlock(ptl);
+ err = -EBUSY;
+ break;
+ }
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||