Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c        19
-rw-r--r--  mm/gup.c                29
-rw-r--r--  mm/huge_memory.c         7
-rw-r--r--  mm/kasan/quarantine.c    7
-rw-r--r--  mm/kfence/core.c        10
-rw-r--r--  mm/memory-failure.c     15
-rw-r--r--  mm/mremap.c              2
-rw-r--r--  mm/nommu.c               2
-rw-r--r--  mm/page_io.c             4
-rw-r--r--  mm/readahead.c          16
-rw-r--r--  mm/swapfile.c           32
-rw-r--r--  mm/usercopy.c           91
-rw-r--r--  mm/util.c               32
13 files changed, 142 insertions, 124 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7176af65b103..ff60bd7d74e0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
@@ -390,7 +391,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
struct backing_dev_info *bdi = wb->bdi;
mutex_lock(&wb->bdi->cgwb_release_mutex);
@@ -401,7 +401,7 @@ static void cgwb_release_workfn(struct work_struct *work)
mutex_unlock(&wb->bdi->cgwb_release_mutex);
/* triggers blkg destruction if no online users left */
- blkcg_unpin_online(blkcg);
+ blkcg_unpin_online(wb->blkcg_css);
fprop_local_destroy_percpu(&wb->memcg_completions);
@@ -446,7 +446,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
{
struct mem_cgroup *memcg;
struct cgroup_subsys_state *blkcg_css;
- struct blkcg *blkcg;
struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
struct bdi_writeback *wb;
unsigned long flags;
@@ -454,9 +453,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
- blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = &memcg->cgwb_list;
- blkcg_cgwb_list = &blkcg->cgwb_list;
+ blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
/* look up again under lock and discard on blkcg mismatch */
spin_lock_irqsave(&cgwb_lock, flags);
@@ -511,7 +509,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
- blkcg_pin_online(blkcg);
+ blkcg_pin_online(blkcg_css);
css_get(memcg_css);
css_get(blkcg_css);
}
@@ -724,18 +722,19 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
/**
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
- * @blkcg: blkcg being offlined
+ * @css: blkcg being offlined
*
* Also prevents creation of any new wb's associated with @blkcg.
*/
-void wb_blkcg_offline(struct blkcg *blkcg)
+void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
struct bdi_writeback *wb, *next;
+ struct list_head *list = blkcg_get_cgwb_list(css);
spin_lock_irq(&cgwb_lock);
- list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ list_for_each_entry_safe(wb, next, list, blkcg_node)
cgwb_kill(wb);
- blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
}
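
The conversion above relies on blkcg_get_cgwb_list(), which lives in the block layer rather than in mm/ and therefore does not appear in this diff. Presumably it is a thin accessor along the following lines, so that mm/backing-dev.c no longer needs the struct blkcg definition (a sketch, not part of this patch):

/* sketch: accessor assumed to live in block/blk-cgroup.c */
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
	return &css_to_blkcg(css)->cgwb_list;
}

blkcg_pin_online() and blkcg_unpin_online() are likewise expected to take the css and resolve the blkcg internally.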
diff --git a/mm/gup.c b/mm/gup.c
index f598a037eb04..501bc150792c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1648,6 +1648,35 @@ out:
}
EXPORT_SYMBOL(fault_in_writeable);
+/**
+ * fault_in_subpage_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Fault in a user address range for writing while checking for permissions at
+ * sub-page granularity (e.g. arm64 MTE). This function should be used when
+ * the caller cannot guarantee forward progress of a copy_to_user() loop.
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
+{
+ size_t faulted_in;
+
+ /*
+ * Attempt faulting in at page granularity first for page table
+ * permission checking. The arch-specific probe_subpage_writeable()
+ * functions may not check for this.
+ */
+ faulted_in = size - fault_in_writeable(uaddr, size);
+ if (faulted_in)
+ faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
+
+ return size - faulted_in;
+}
+EXPORT_SYMBOL(fault_in_subpage_writeable);
+
/*
* fault_in_safe_writeable - fault in an address range for writing
* @uaddr: start of address range
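
As the kernel-doc notes, the helper is meant for callers that cannot otherwise guarantee forward progress of a copy_to_user() loop, e.g. when arm64 MTE tags can fault inside a page that the page tables say is writable. A hypothetical caller sketch of that retry pattern (not part of this patch; needs <linux/uaccess.h> and <linux/pagemap.h>):

/* hypothetical sketch of the retry pattern the helper enables */
static int copy_out(char __user *uaddr, const char *kbuf, size_t len)
{
	while (len) {
		size_t left = copy_to_user(uaddr, kbuf, len);

		if (!left)
			return 0;
		/*
		 * Skip what was copied, then fault the remainder in at
		 * sub-page granularity before retrying.
		 */
		uaddr += len - left;
		kbuf += len - left;
		len = left;
		if (fault_in_subpage_writeable(uaddr, len) == len)
			return -EFAULT;	/* no forward progress possible */
	}
	return 0;
}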
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c468fee595ff..910a138e9859 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2495,11 +2495,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct address_space *mapping = NULL;
int extra_pins, ret;
pgoff_t end;
+ bool is_hzp;
- VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
VM_BUG_ON_PAGE(!PageLocked(head), head);
VM_BUG_ON_PAGE(!PageCompound(head), head);
+ is_hzp = is_huge_zero_page(head);
+ VM_WARN_ON_ONCE_PAGE(is_hzp, head);
+ if (is_hzp)
+ return -EBUSY;
+
if (PageWriteback(head))
return -EBUSY;
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 08291ed33e93..0a9def8ce5e8 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -315,6 +315,13 @@ static void per_cpu_remove_cache(void *arg)
struct qlist_head *q;
q = this_cpu_ptr(&cpu_quarantine);
+ /*
+ * Ensure the ordering between the writing to q->offline and
+ * per_cpu_remove_cache. Prevent cpu_quarantine from being corrupted
+ * by interrupt.
+ */
+ if (READ_ONCE(q->offline))
+ return;
qlist_move_cache(q, &to_free, cache);
qlist_free_all(&to_free, cache);
}
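
For context, the offline flag tested above is set by the CPU hot-unplug path (kasan_cpu_offline(), not shown in this diff), presumably with a plain store before the quarantine is drained, roughly:

/* assumed shape of the writer side the new READ_ONCE() pairs with */
static int kasan_cpu_offline(unsigned int cpu)
{
	struct qlist_head *q = this_cpu_ptr(&cpu_quarantine);

	WRITE_ONCE(q->offline, true);	/* mark offline first ... */
	barrier();			/* ... then drain the quarantine */
	qlist_free_all(q, NULL);
	return 0;
}

With the check in place, a cache-shrink IPI that lands on an offlining CPU bails out instead of repopulating a per-CPU quarantine list that is about to be freed.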
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 9b2b5f56f4ae..11a954763be9 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -621,6 +621,16 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
+ for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
+ struct slab *slab = virt_to_slab(p);
+
+ if (!slab)
+ continue;
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = 0;
+#endif
+ __folio_clear_slab(slab_folio(slab));
+ }
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 27760c19bad7..d4a4adcca01f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1274,7 +1274,7 @@ try_again:
}
out:
if (ret == -EIO)
- dump_page(p, "hwpoison: unhandlable page");
+ pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
return ret;
}
@@ -1861,19 +1861,6 @@ try_again:
if (PageTransHuge(hpage)) {
/*
- * Bail out before SetPageHasHWPoisoned() if hpage is
- * huge_zero_page, although PG_has_hwpoisoned is not
- * checked in set_huge_zero_page().
- *
- * TODO: Handle memory failure of huge_zero_page thoroughly.
- */
- if (is_huge_zero_page(hpage)) {
- action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- res = -EBUSY;
- goto unlock_mutex;
- }
-
- /*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
* And the flag can't be set in get_hwpoison_page() since
diff --git a/mm/mremap.c b/mm/mremap.c
index 303d3290b938..0b93fac76851 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -947,7 +947,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return -EINTR;
vma = vma_lookup(mm, addr);
if (!vma) {
- ret = EFAULT;
+ ret = -EFAULT;
goto out;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 55a9e48a7a02..9d7afc2d959e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -226,6 +226,8 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc);
+
/*
* vzalloc - allocate virtually contiguous memory with zero fill
*
diff --git a/mm/page_io.c b/mm/page_io.c
index 89fbf3cae30f..3fbdab6a940e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -360,7 +360,6 @@ int swap_readpage(struct page *page, bool synchronous)
* attempt to access it in the page fault retry time check.
*/
if (synchronous) {
- bio->bi_opf |= REQ_POLLED;
get_task_struct(current);
bio->bi_private = current;
}
@@ -372,8 +371,7 @@ int swap_readpage(struct page *page, bool synchronous)
if (!READ_ONCE(bio->bi_private))
break;
- if (!bio_poll(bio, NULL, 0))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
diff --git a/mm/readahead.c b/mm/readahead.c
index 8e3775829513..26bf74a6b2fe 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -113,6 +113,7 @@
* ->readpage() which may be less efficient.
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
@@ -474,7 +475,8 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (!folio)
return -ENOMEM;
- if (mark - index < (1UL << order))
+ mark = round_up(mark, 1UL << order);
+ if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
if (err)
@@ -555,8 +557,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- unsigned long index = readahead_index(ractl);
- pgoff_t prev_index;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t expected, prev_index;
+ unsigned int order = folio ? folio_order(folio) : 0;
/*
* If the request exceeds the readahead window, allow the read to
@@ -575,8 +578,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((index == (ra->start + ra->size - ra->async_size) ||
- index == (ra->start + ra->size))) {
+ expected = round_up(ra->start + ra->size - ra->async_size,
+ 1UL << order);
+ if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -662,7 +666,7 @@ readit:
}
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
+ page_cache_ra_order(ractl, ra, order);
}
void page_cache_sync_ra(struct readahead_control *ractl,
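
To see what the rounding buys, take some hypothetical numbers (not from the patch): ra->start = 0, ra->size = 32, ra->async_size = 24, and an order-4 (16-page) folio triggering the async callback.

/* hypothetical values illustrating the expected-index rounding */
expected = round_up(0 + 32 - 24, 1UL << 4);	/* round_up(8, 16) == 16 */

The old code only accepted index == 8, but with large folios the PG_readahead mark can only sit on a folio boundary (which is also why ra_alloc_folio() now rounds mark up), so the async trigger actually arrives at index 16. Without the rounding, the sequential-access heuristic would miss and readahead would stop ramping up.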
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63c61f8b2611..981a6e85c88e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -6,6 +6,7 @@
* Swap reorganised 29.12.95, Stephen Tweedie
*/
+#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
@@ -179,7 +180,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
return err;
cond_resched();
@@ -190,7 +191,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
break;
@@ -254,7 +255,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
+ nr_blocks, GFP_NOIO))
break;
se = next_se(se);
@@ -2466,7 +2467,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ if (!p->bdev || !bdev_nonrot(p->bdev))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -2761,7 +2762,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
+ if (bdev_is_zoned(p->bdev))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
@@ -2957,20 +2958,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
return nr_extents;
}
-/*
- * Helper to sys_swapon determining if a given swap
- * backing device queue supports DISCARD operations.
- */
-static bool swap_discardable(struct swap_info_struct *si)
-{
- struct request_queue *q = bdev_get_queue(si->bdev);
-
- if (!blk_queue_discard(q))
- return false;
-
- return true;
-}
-
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
@@ -3065,13 +3052,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
+ if (p->bdev && bdev_stable_writes(p->bdev))
p->flags |= SWP_STABLE_WRITES;
if (p->bdev && p->bdev->bd_disk->fops->rw_page)
p->flags |= SWP_SYNCHRONOUS_IO;
- if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ if (p->bdev && bdev_nonrot(p->bdev)) {
int cpu;
unsigned long ci, nr_cluster;
@@ -3132,7 +3119,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sizeof(long),
GFP_KERNEL);
- if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ if ((swap_flags & SWAP_FLAG_DISCARD) &&
+ p->bdev && bdev_max_discard_sectors(p->bdev)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
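
The swapfile changes are an interface cleanup: queue-level checks are replaced by block_device-level helpers, presumably thin wrappers in <linux/blkdev.h> along these lines (a sketch, not part of this diff):

/* assumed shape of the helpers used above */
static inline bool bdev_nonrot(struct block_device *bdev)
{
	return blk_queue_nonrot(bdev_get_queue(bdev));
}

static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev)
{
	return bdev_get_queue(bdev)->limits.max_discard_sectors;
}

Note that the discard test also changes meaning slightly: instead of checking a queue feature flag via the removed swap_discardable() helper, swapon now treats a non-zero discard limit as "discard supported".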
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 2c235d5c2364..baeacc735b83 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,6 +17,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
@@ -157,91 +158,47 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
usercopy_abort("null address", NULL, to_user, ptr, n);
}
-/* Checks for allocs that are marked in some way as spanning multiple pages. */
-static inline void check_page_span(const void *ptr, unsigned long n,
- struct page *page, bool to_user)
+static inline void check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
{
-#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
- const void *end = ptr + n - 1;
- struct page *endpage;
- bool is_reserved, is_cma;
+ struct folio *folio;
- /*
- * Sometimes the kernel data regions are not marked Reserved (see
- * check below). And sometimes [_sdata,_edata) does not cover
- * rodata and/or bss, so check each range explicitly.
- */
+ if (is_kmap_addr(ptr)) {
+ unsigned long page_end = (unsigned long)ptr | (PAGE_SIZE - 1);
- /* Allow reads of kernel rodata region (if not marked as Reserved). */
- if (ptr >= (const void *)__start_rodata &&
- end <= (const void *)__end_rodata) {
- if (!to_user)
- usercopy_abort("rodata", NULL, to_user, 0, n);
+ if ((unsigned long)ptr + n - 1 > page_end)
+ usercopy_abort("kmap", NULL, to_user,
+ offset_in_page(ptr), n);
return;
}
- /* Allow kernel data region (if not marked as Reserved). */
- if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
- return;
+ if (is_vmalloc_addr(ptr)) {
+ struct vm_struct *area = find_vm_area(ptr);
+ unsigned long offset;
- /* Allow kernel bss region (if not marked as Reserved). */
- if (ptr >= (const void *)__bss_start &&
- end <= (const void *)__bss_stop)
- return;
-
- /* Is the object wholly within one base page? */
- if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
- ((unsigned long)end & (unsigned long)PAGE_MASK)))
- return;
+ if (!area) {
+ usercopy_abort("vmalloc", "no area", to_user, 0, n);
+ return;
+ }
- /* Allow if fully inside the same compound (__GFP_COMP) page. */
- endpage = virt_to_head_page(end);
- if (likely(endpage == page))
+ offset = ptr - area->addr;
+ if (offset + n > get_vm_area_size(area))
+ usercopy_abort("vmalloc", NULL, to_user, offset, n);
return;
-
- /*
- * Reject if range is entirely either Reserved (i.e. special or
- * device memory), or CMA. Otherwise, reject since the object spans
- * several independently allocated pages.
- */
- is_reserved = PageReserved(page);
- is_cma = is_migrate_cma_page(page);
- if (!is_reserved && !is_cma)
- usercopy_abort("spans multiple pages", NULL, to_user, 0, n);
-
- for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
- page = virt_to_head_page(ptr);
- if (is_reserved && !PageReserved(page))
- usercopy_abort("spans Reserved and non-Reserved pages",
- NULL, to_user, 0, n);
- if (is_cma && !is_migrate_cma_page(page))
- usercopy_abort("spans CMA and non-CMA pages", NULL,
- to_user, 0, n);
}
-#endif
-}
-
-static inline void check_heap_object(const void *ptr, unsigned long n,
- bool to_user)
-{
- struct folio *folio;
if (!virt_addr_valid(ptr))
return;
- /*
- * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
- * highmem page or fallback to virt_to_page(). The following
- * is effectively a highmem-aware virt_to_slab().
- */
- folio = page_folio(kmap_to_page((void *)ptr));
+ folio = virt_to_folio(ptr);
if (folio_test_slab(folio)) {
/* Check slab allocator for flags and size. */
__check_heap_object(ptr, n, folio_slab(folio), to_user);
- } else {
- /* Verify object does not incorrectly span multiple pages. */
- check_page_span(ptr, n, folio_page(folio, 0), to_user);
+ } else if (folio_test_large(folio)) {
+ unsigned long offset = ptr - folio_address(folio);
+ if (offset + n > folio_size(folio))
+ usercopy_abort("page alloc", NULL, to_user, offset, n);
}
}
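
The rewritten check_heap_object() is purely bounds-based: kmap addresses must stay within their page, vmalloc addresses within their vm_struct area, and slab or large-folio addresses within the object or folio. With hypothetical numbers, an 8 KiB copy starting 1 KiB into a vmalloc(4096) allocation trips the vmalloc branch:

/* hypothetical numbers for the vmalloc bounds check above */
offset = ptr - area->addr;			/* 1024 */
if (offset + n > get_vm_area_size(area))	/* 1024 + 8192 > 4096 */
	usercopy_abort("vmalloc", NULL, to_user, offset, n);

The old CONFIG_HARDENED_USERCOPY_PAGESPAN heuristics (rodata, bss, Reserved and CMA special cases) are dropped entirely rather than converted.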
diff --git a/mm/util.c b/mm/util.c
index 3492a9e81aa3..ac63e5ca8b21 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -343,6 +343,38 @@ unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
+
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{