Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   1
-rw-r--r--  mm/backing-dev.c      | 186
-rw-r--r--  mm/filemap.c          |  33
-rw-r--r--  mm/gup.c              | 148
-rw-r--r--  mm/huge_memory.c      |  99
-rw-r--r--  mm/hugetlb.c          |  10
-rw-r--r--  mm/internal.h         |   7
-rw-r--r--  mm/kasan/kasan.h      |   5
-rw-r--r--  mm/kasan/report.c     |  36
-rw-r--r--  mm/kmemleak.c         |   2
-rw-r--r--  mm/memory.c           |   2
-rw-r--r--  mm/mempolicy.c        |  20
-rw-r--r--  mm/migrate.c          |   9
-rw-r--r--  mm/page_alloc.c       |  57
-rw-r--r--  mm/page_vma_mapped.c  |  15
-rw-r--r--  mm/percpu.c           |  40
-rw-r--r--  mm/rmap.c             |   4
-rw-r--r--  mm/swap.c             |  37
-rw-r--r--  mm/swap_cgroup.c      |   2
-rw-r--r--  mm/swap_slots.c       |   2
-rw-r--r--  mm/usercopy.c         |  19
-rw-r--r--  mm/vmalloc.c          |   2
-rw-r--r--  mm/vmstat.c           |  18
-rw-r--r--  mm/workingset.c       |   2
-rw-r--r--  mm/z3fold.c           |   9
-rw-r--r--  mm/zsmalloc.c         |   2
26 files changed, 478 insertions, 289 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9b8fccb969dc..beb7a455915d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -312,7 +312,6 @@ config NEED_BOUNCE_POOL
config NR_QUICK
int
depends on QUICKLIST
- default "2" if AVR32
default "1"
config VIRT_TO_BUS
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c2..f028a9a472fd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
}
postcore_initcall(bdi_class_init);
+static int bdi_init(struct backing_dev_info *bdi);
+
static int __init default_bdi_init(void)
{
int err;
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- return -ENOMEM;
+ if (!wb->congested) {
+ err = -ENOMEM;
+ goto out_put_bdi;
+ }
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
return err;
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
+ /*
+ * Wait for wb shutdown to finish if someone else is just
+ * running wb_shutdown(). Otherwise we could proceed to wb /
+ * bdi destruction before wb_shutdown() is finished.
+ */
+ wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
+ set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
+ cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ /*
+ * Make sure bit gets cleared after shutdown is finished. Matches with
+ * the barrier provided by test_and_clear_bit() above.
+ */
+ smp_wmb();
+ clear_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected. cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
return NULL;
atomic_set(&new_congested->refcnt, 0);
- new_congested->bdi = bdi;
+ new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
}
/* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->bdi) {
+ if (congested->__bdi) {
rb_erase(&congested->rb_node,
- &congested->bdi->cgwb_congested_tree);
- congested->bdi = NULL;
+ &congested->__bdi->cgwb_congested_tree);
+ congested->__bdi = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock_irq(&cgwb_lock);
- list_del_rcu(&wb->bdi_node);
- spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
kfree_rcu(wb, rcu);
-
- if (atomic_dec_and_test(&bdi->usage_cnt))
- wake_up_all(&cgwb_release_wait);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
percpu_ref_kill(&wb->refcnt);
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
- atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
- atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return ret;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
+ struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- spin_unlock_irq(&cgwb_lock);
- /*
- * All cgwb's must be shutdown and released before returning. Drain
- * the usage counter to wait for all cgwb's ever created on @bdi.
- */
- atomic_dec(&bdi->usage_cnt);
- wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
- /*
- * Grab back our reference so that we hold it when @bdi gets
- * re-registered.
- */
- atomic_inc(&bdi->usage_cnt);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
}
/**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
rb_entry(rbn, struct bdi_writeback_congested, rb_node);
rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
+ congested->__bdi = NULL; /* mark @congested unlinked */
}
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi)
ret = cgwb_bdi_init(bdi);
- list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
return ret;
}
-EXPORT_SYMBOL(bdi_init);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
{
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
+EXPORT_SYMBOL(bdi_alloc_node);
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
- va_list args;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
+ dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
if (IS_ERR(dev))
return PTR_ERR(dev);
+ cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_register_va);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ return ret;
}
-EXPORT_SYMBOL(bdi_register_dev);
+EXPORT_SYMBOL(bdi_register);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
{
int rc;
- rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
- MINOR(owner->devt));
+ rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
if (rc)
return rc;
/* Leaking owner reference... */
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
- cgwb_bdi_destroy(bdi);
+ cgwb_bdi_unregister(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-static void bdi_exit(struct backing_dev_info *bdi)
-{
- WARN_ON_ONCE(bdi->dev);
- wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
-}
-
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- bdi_exit(bdi);
+ if (test_bit(WB_registered, &bdi->wb.state))
+ bdi_unregister(bdi);
+ WARN_ON_ONCE(bdi->dev);
+ wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi)
{
kref_put(&bdi->refcnt, release_bdi);
}
-
-void bdi_destroy(struct backing_dev_info *bdi)
-{
- bdi_unregister(bdi);
- bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
- int err;
-
- bdi->name = name;
- bdi->capabilities = 0;
- err = bdi_init(bdi);
- if (err)
- return err;
-
- err = bdi_register(bdi, NULL, "%.28s-%ld", name,
- atomic_long_inc_return(&bdi_seq));
- if (err) {
- bdi_destroy(bdi);
- return err;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
+EXPORT_SYMBOL(bdi_put);
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
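The backing-dev.c changes above replace bdi_register_dev(), bdi_setup_and_register() and bdi_destroy() with a smaller, reference-counted API. As a rough illustrative sketch only (the helper name and the dev_t value are hypothetical, not part of this patch), a driver using the reworked interface might look like:

	static int example_setup_bdi(dev_t my_devt, struct backing_dev_info **out)
	{
		struct backing_dev_info *bdi;
		int err;

		bdi = bdi_alloc_node(GFP_KERNEL, NUMA_NO_NODE);
		if (!bdi)
			return -ENOMEM;

		/* bdi_register() no longer takes a parent device argument. */
		err = bdi_register(bdi, "%u:%u", MAJOR(my_devt), MINOR(my_devt));
		if (err) {
			bdi_put(bdi);
			return err;
		}

		*out = bdi;
		return 0;
	}

On teardown, dropping the final reference with bdi_put() is enough: release_bdi() now calls bdi_unregister() itself if the bdi is still registered.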
diff --git a/mm/filemap.c b/mm/filemap.c
index 1694623a6289..dc59c5f35b37 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait);
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
- * Note that `lend' is inclusive (describes the last byte to be written) so
+ * Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
*/
int filemap_write_and_wait_range(struct address_space *mapping,
@@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry);
*
* PCG flags modify how the page is returned.
*
- * FGP_ACCESSED: the page will be marked accessed
- * FGP_LOCK: Page is return locked
- * FGP_CREAT: If page is not present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU
- * list. The page is returned locked and with an increased
- * refcount. Otherwise, %NULL is returned.
+ * @fgp_flags can be:
+ *
+ * - FGP_ACCESSED: the page will be marked accessed
+ * - FGP_LOCK: Page is return locked
+ * - FGP_CREAT: If page is not present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, NULL is returned.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
@@ -2033,7 +2035,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- struct iov_iter data = *iter;
loff_t size;
size = i_size_read(inode);
@@ -2044,11 +2045,12 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
file_accessed(file);
- retval = mapping->a_ops->direct_IO(iocb, &data);
+ retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
- iov_iter_advance(iter, retval);
+ count -= retval;
}
+ iov_iter_revert(iter, iov_iter_count(iter) - count);
/*
* Btrfs can have a short DIO read if we encounter
@@ -2059,7 +2061,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
- if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
+ if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
goto out;
}
@@ -2704,7 +2706,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t written;
size_t write_len;
pgoff_t end;
- struct iov_iter data;
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
@@ -2733,8 +2734,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
}
- data = *from;
- written = mapping->a_ops->direct_IO(iocb, &data);
+ written = mapping->a_ops->direct_IO(iocb, from);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2751,13 +2751,14 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (written > 0) {
pos += written;
- iov_iter_advance(from, written);
+ write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
iocb->ki_pos = pos;
}
+ iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
@@ -3001,7 +3002,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
- * (presumably at page->private). If the release was successful, return `1'.
+ * (presumably at page->private). If the release was successful, return '1'.
* Otherwise return zero.
*
* This may also be called if PG_fscache is set on a page, indicating that the
diff --git a/mm/gup.c b/mm/gup.c
index 04aa405350dc..527ec2c6cca3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr)
*/
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifndef gup_get_pte
+/*
+ * We assume that the PTE can be read atomically. If this is not the case for
+ * your architecture, please provide the helper.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+#endif
+
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
+
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
+ int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
- int ret = 0;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
- /*
- * In the line below we are assuming that the pte can be read
- * atomically. If this is not the case for your architecture,
- * please wrap this in a helper function!
- *
- * for an example see gup_get_pte in arch/x86/mm/gup.c
- */
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = gup_get_pte(ptep);
struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
* path using the pte_protnone check.
*/
- if (!pte_present(pte) || pte_special(pte) ||
- pte_protnone(pte) || (write && !pte_write(pte)))
+ if (pte_protnone(pte))
goto pte_unmap;
- if (!arch_pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, write))
+ goto pte_unmap;
+
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ goto pte_unmap;
+ }
+ } else if (pte_special(pte))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
+
+ put_dev_pagemap(pgmap);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static int __gup_device_huge(unsigned long pfn, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ struct dev_pagemap *pgmap = NULL;
+
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+#else
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (write && !pmd_write(orig))
+ if (!pmd_access_permitted(orig, write))
return 0;
+ if (pmd_devmap(orig))
+ return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
struct page *head, *page;
int refs;
- if (write && !pud_write(orig))
+ if (!pud_access_permitted(orig, write))
return 0;
+ if (pud_devmap(orig))
+ return __gup_device_huge_pud(orig, addr, end, pages, nr);
+
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
int refs;
struct page *head, *page;
- if (write && !pgd_write(orig))
+ if (!pgd_access_permitted(orig, write))
return 0;
+ BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ return end >= start;
+}
+#endif
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- int nr, ret;
+ int nr = 0, ret = 0;
start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
- ret = nr;
+
+ if (gup_fast_permitted(start, nr_pages, write)) {
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+ }
if (nr < nr_pages) {
/* Try to get the remaining pages with get_user_pages */
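The gup.c changes above let an architecture override gup_get_pte() and gup_fast_permitted(); the #ifndef fallbacks are used only when the arch does not supply its own. A minimal sketch of an arch-side gup_fast_permitted() override follows; the TASK_SIZE_MAX upper bound is an assumption for illustration, not taken from this patch:

	#define gup_fast_permitted gup_fast_permitted
	static inline bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
	{
		unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
		unsigned long end = start + len;

		/* Reject wrap-around and (assumed) addresses above the user limit. */
		return end >= start && end <= TASK_SIZE_MAX;
	}

Defining the macro before the generic code is compiled makes the #ifndef block above skip the generic check, so get_user_pages_fast() falls back to the slow path for any range the architecture rejects.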
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1ebc93e179f3..f3c4f9d22821 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -240,18 +240,18 @@ static ssize_t defrag_store(struct kobject *kobj,
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("defer+madvise", buf,
min(sizeof("defer+madvise")-1, count))) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
@@ -1568,8 +1568,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
deactivate_page(page);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
orig_pmd = pmd_mkclean(orig_pmd);
@@ -1724,37 +1723,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- int ret = 0;
+ pmd_t entry;
+ bool preserve_write;
+ int ret;
ptl = __pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- pmd_t entry;
- bool preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
+ if (!ptl)
+ return 0;
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd)) {
- spin_unlock(ptl);
- return ret;
- }
+ preserve_write = prot_numa && pmd_write(*pmd);
+ ret = 1;
- if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
- pmd_write(entry));
- }
- spin_unlock(ptl);
- }
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd))
+ goto unlock;
+
+ if (prot_numa && pmd_protnone(*pmd))
+ goto unlock;
+
+ /*
+ * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+ * which is also under down_read(mmap_sem):
+ *
+ * CPU0: CPU1:
+ * change_huge_pmd(prot_numa=1)
+ * pmdp_huge_get_and_clear_notify()
+ * madvise_dontneed()
+ * zap_pmd_range()
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
+ * // skip the pmd
+ * set_pmd_at();
+ * // pmd is re-established
+ *
+ * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
+ * which may break userspace.
+ *
+ * pmdp_invalidate() is required to make sure we don't miss
+ * dirty/young flags set by hardware.
+ */
+ entry = *pmd;
+ pmdp_invalidate(vma, addr, pmd);
+ /*
+ * Recover dirty/young flags. It relies on pmdp_invalidate to not
+ * corrupt them.
+ */
+ if (pmd_dirty(*pmd))
+ entry = pmd_mkdirty(entry);
+ if (pmd_young(*pmd))
+ entry = pmd_mkyoung(entry);
+
+ entry = pmd_modify(entry, newprot);
+ if (preserve_write)
+ entry = pmd_mk_savedwrite(entry);
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+ spin_unlock(ptl);
return ret;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3d0aab9ee80d..e5828875f7bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4403,7 +4403,9 @@ int hugetlb_reserve_pages(struct inode *inode,
return 0;
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_abort(resv_map, from, to);
+ /* Don't call region_abort if region_chg failed */
+ if (chg >= 0)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
@@ -4651,6 +4653,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
struct page *page = NULL;
spinlock_t *ptl;
+ pte_t pte;
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -4660,12 +4663,13 @@ retry:
*/
if (!pmd_huge(*pmd))
goto out;
- if (pmd_present(*pmd)) {
+ pte = huge_ptep_get((pte_t *)pmd);
+ if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
- if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+ if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
__migration_entry_wait(mm, (pte_t *)pmd, ptl);
goto retry;
diff --git a/mm/internal.h b/mm/internal.h
index ccfc2a2969f4..266efaeaa370 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -481,6 +481,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
enum ttu_flags;
struct tlbflush_unmap_batch;
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1c260e6b3b3c..dd2dea8eb077 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -96,11 +96,6 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool kasan_report_enabled(void)
-{
- return !current->kasan_depth;
-}
-
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_double_free(struct kmem_cache *cache, void *object,
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index f479365530b6..ab42a0803f16 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,7 +13,9 @@
*
*/
+#include <linux/bitops.h>
#include <linux/ftrace.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -293,6 +295,40 @@ static void kasan_report_error(struct kasan_access_info *info)
kasan_end_report(&flags);
}
+static unsigned long kasan_flags;
+
+#define KASAN_BIT_REPORTED 0
+#define KASAN_BIT_MULTI_SHOT 1
+
+bool kasan_save_enable_multi_shot(void)
+{
+ return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
+
+void kasan_restore_multi_shot(bool enabled)
+{
+ if (!enabled)
+ clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
+
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+static inline bool kasan_report_enabled(void)
+{
+ if (current->kasan_depth)
+ return false;
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip)
{
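Beyond the kasan_multi_shot boot parameter, the save/restore pair added above lets code that deliberately triggers several bad accesses keep every report enabled and then restore the previous behaviour. A minimal usage sketch (the wrapper function is hypothetical):

	static void example_kasan_selftest(void)
	{
		bool multishot = kasan_save_enable_multi_shot();

		/* ...trigger accesses that are expected to produce KASAN reports... */

		kasan_restore_multi_shot(multishot);
	}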
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 26c874e90b12..20036d4f9f13 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1416,7 +1416,7 @@ static void kmemleak_scan(void)
/* data/bss scanning */
scan_large_block(_sdata, _edata);
scan_large_block(__bss_start, __bss_stop);
- scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init);
+ scan_large_block(__start_ro_after_init, __end_ro_after_init);
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
diff --git a/mm/memory.c b/mm/memory.c
index 235ba51b2fbf..6ff5d729ded0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4298,7 +4298,7 @@ void __might_fault(const char *file, int line)
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (uaccess_kernel())
return;
if (pagefault_disabled())
return;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 75b2745bac41..37d0b334bfe9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1529,7 +1529,6 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode)
{
- long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
@@ -1538,14 +1537,13 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(bm, nmask, nr_bits);
+ if (compat_get_bitmap(bm, nmask, nr_bits))
+ return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, bm, alloc_size);
+ if (copy_to_user(nm, bm, alloc_size))
+ return -EFAULT;
}
- if (err)
- return -EFAULT;
-
return sys_set_mempolicy(mode, nm, nr_bits+1);
}
@@ -1553,7 +1551,6 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
compat_ulong_t, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode, compat_ulong_t, flags)
{
- long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
nodemask_t bm;
@@ -1562,14 +1559,13 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
+ if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
+ return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
+ if (copy_to_user(nm, nodes_addr(bm), alloc_size))
+ return -EFAULT;
}
- if (err)
- return -EFAULT;
-
return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 9a0897a14d37..738f1d5f8350 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -184,9 +184,9 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- putback_lru_page(page);
dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
+ putback_lru_page(page);
}
}
}
@@ -209,8 +209,11 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(PageTail(page), page);
while (page_vma_mapped_walk(&pvmw)) {
- new = page - pvmw.page->index +
- linear_page_index(vma, pvmw.address);
+ if (PageKsm(page))
+ new = page;
+ else
+ new = page - pvmw.page->index +
+ linear_page_index(vma, pvmw.address);
get_page(new);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cbde310abed..bd01501efab9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1090,10 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- unsigned long nr_scanned, flags;
+ unsigned long nr_scanned;
bool isolated_pageblocks;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
@@ -1142,7 +1142,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
trace_mm_page_pcpu_drain(page, 0, mt);
} while (--count && --batch_free && !list_empty(list));
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
@@ -1150,9 +1150,8 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype)
{
- unsigned long nr_scanned, flags;
- spin_lock_irqsave(&zone->lock, flags);
- __count_vm_events(PGFREE, 1 << order);
+ unsigned long nr_scanned;
+ spin_lock(&zone->lock);
nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1162,7 +1161,7 @@ static void free_one_page(struct zone *zone,
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype);
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1240,6 +1239,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
static void __free_pages_ok(struct page *page, unsigned int order)
{
+ unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
@@ -1247,7 +1247,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
+ local_irq_save(flags);
+ __count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
+ local_irq_restore(flags);
}
static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2219,9 +2222,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
int migratetype, bool cold)
{
int i, alloced = 0;
- unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
@@ -2257,7 +2259,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
return alloced;
}
@@ -2373,6 +2375,13 @@ void drain_all_pages(struct zone *zone)
*/
static cpumask_t cpus_with_pcps;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON_ONCE(!mm_percpu_wq))
+ return;
+
/* Workqueues cannot recurse */
if (current->flags & PF_WQ_WORKER)
return;
@@ -2422,7 +2431,7 @@ void drain_all_pages(struct zone *zone)
for_each_cpu(cpu, &cpus_with_pcps) {
struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
INIT_WORK(work, drain_local_pages_wq);
- schedule_work_on(cpu, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(per_cpu_ptr(&pcpu_drain, cpu));
@@ -2478,20 +2487,17 @@ void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
+ unsigned long flags;
unsigned long pfn = page_to_pfn(page);
int migratetype;
- if (in_interrupt()) {
- __free_pages_ok(page, 0);
- return;
- }
-
if (!free_pcp_prepare(page))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
- preempt_disable();
+ local_irq_save(flags);
+ __count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
@@ -2508,7 +2514,6 @@ void free_hot_cold_page(struct page *page, bool cold)
migratetype = MIGRATE_MOVABLE;
}
- __count_vm_event(PGFREE);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (!cold)
list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2522,7 +2527,7 @@ void free_hot_cold_page(struct page *page, bool cold)
}
out:
- preempt_enable();
+ local_irq_restore(flags);
}
/*
@@ -2647,8 +2652,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
{
struct page *page;
- VM_BUG_ON(in_interrupt());
-
do {
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
@@ -2679,8 +2682,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct list_head *list;
bool cold = ((gfp_flags & __GFP_COLD) != 0);
struct page *page;
+ unsigned long flags;
- preempt_disable();
+ local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
@@ -2688,7 +2692,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
}
- preempt_enable();
+ local_irq_restore(flags);
return page;
}
@@ -2704,7 +2708,7 @@ struct page *rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
- if (likely(order == 0) && !in_interrupt()) {
+ if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype);
goto out;
@@ -4243,7 +4247,8 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages() counts the number of counts pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
- * managed_pages - high_pages
+ *
+ * nr_free_zone_pages = managed_pages - high_pages
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4519,13 +4524,13 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
- K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
node_page_state(pgdat, NR_PAGES_SCANNED),
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c4c9def8ffea..de9c40d7304a 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -111,12 +111,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- /* Only for THP, seek to next pte entry makes sense */
- if (pvmw->pte) {
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
+ if (pvmw->pte)
goto next_pte;
- }
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
@@ -165,9 +161,14 @@ restart:
while (1) {
if (check_pte(pvmw))
return true;
-next_pte: do {
+next_pte:
+ /* Seek to next pte only makes sense for THP */
+ if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+ return not_found(pvmw);
+ do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >=
+ if (pvmw->address >= pvmw->vma->vm_end ||
+ pvmw->address >=
__vma_address(pvmw->page, pvmw->vma) +
hpage_nr_pages(pvmw->page) * PAGE_SIZE)
return not_found(pvmw);
diff --git a/mm/percpu.c b/mm/percpu.c
index 60a6488e9e6d..e0aa8ae7bde7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area. Module
- * static percpu areas are not considered. For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr)
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
+ void *va = (void *)addr;
- if ((void *)addr >= start && (void *)addr < start + static_size)
+ if (va >= start && va < start + static_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
return true;
- }
+ }
+ }
#endif
/* on UP, can't distinguish from other static vars, always false */
return false;
}
/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+ return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
diff --git a/mm/rmap.c b/mm/rmap.c
index 49ed681ccc7b..f6838015810f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1159,7 +1159,7 @@ void page_add_file_rmap(struct page *page, bool compound)
goto out;
}
__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
}
@@ -1199,7 +1199,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
diff --git a/mm/swap.c b/mm/swap.c
index c4910f14f957..d8d9ee9e311a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
+ if (is_zone_device_page(page)) {
+ put_dev_pagemap(page->pgmap);
+
+ /*
+ * The page belongs to the device that created pgmap. Do
+ * not return it to page allocator.
+ */
+ return;
+ }
+
if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
@@ -670,30 +680,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
- lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
- if (WARN(!lru_add_drain_wq,
- "Failed to create workqueue lru_add_drain_wq"))
- return -ENOMEM;
-
- return 0;
-}
-early_initcall(lru_init);
-
void lru_add_drain_all(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
int cpu;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON(!mm_percpu_wq))
+ return;
+
mutex_lock(&lock);
get_online_cpus();
cpumask_clear(&has_work);
@@ -707,7 +706,7 @@ void lru_add_drain_all(void)
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, lru_add_drain_wq, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
cpumask_set_cpu(cpu, &has_work);
}
}
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 310ac0b8f974..ac6318a064d3 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -201,6 +201,8 @@ void swap_cgroup_swapoff(int type)
struct page *page = map[i];
if (page)
__free_page(page);
+ if (!(i % SWAP_CLUSTER_MAX))
+ cond_resched();
}
vfree(map);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 7ebb23836f68..b1ccb58ad397 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -267,8 +267,6 @@ int free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
- WARN_ON_ONCE(!swap_slot_cache_initialized);
-
cache = &get_cpu_var(swp_slots);
if (use_swap_slot_cache && cache->slots_ret) {
spin_lock_irq(&cache->free_lock);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index d155e12563b1..a9852b24715d 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -19,15 +19,9 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
+#include <linux/thread_info.h>
#include <asm/sections.h>
-enum {
- BAD_STACK = -1,
- NOT_STACK = 0,
- GOOD_FRAME,
- GOOD_STACK,
-};
-
/*
* Checks if a given pointer and length is contained by the current
* stack frame (if possible).
@@ -206,17 +200,6 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
{
struct page *page;
- /*
- * Some architectures (arm64) return true for virt_addr_valid() on
- * vmalloced addresses. Work around this by checking for vmalloc
- * first.
- *
- * We also need to check for module addresses explicitly since we
- * may copy static data from modules to userspace
- */
- if (is_vmalloc_or_module_addr(ptr))
- return NULL;
-
if (!virt_addr_valid(ptr))
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0b057628a7ba..b52aeed3f58e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1579,7 +1579,7 @@ void vfree_atomic(const void *addr)
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
- * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
+ * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b1947f0cbee2..5a4f5c5a31e8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1552,7 +1552,6 @@ static const struct file_operations proc_vmstat_file_operations = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
@@ -1623,7 +1622,7 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1702,7 +1701,7 @@ static void vmstat_shepherd(struct work_struct *w)
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
if (!delayed_work_pending(dw) && need_update(cpu))
- queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
}
put_online_cpus();
@@ -1718,7 +1717,6 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1764,11 +1762,15 @@ static int vmstat_cpu_dead(unsigned int cpu)
#endif
-static int __init setup_vmstat(void)
+struct workqueue_struct *mm_percpu_wq;
+
+void __init init_mm_internals(void)
{
-#ifdef CONFIG_SMP
- int ret;
+ int ret __maybe_unused;
+
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
+#ifdef CONFIG_SMP
ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
NULL, vmstat_cpu_dead);
if (ret < 0)
@@ -1792,9 +1794,7 @@ static int __init setup_vmstat(void)
proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
- return 0;
}
-module_init(setup_vmstat)
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
diff --git a/mm/workingset.c b/mm/workingset.c
index ac839fca0e76..eda05c71fa49 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -532,7 +532,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
+ ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
if (ret)
goto err;
ret = register_shrinker(&workingset_shadow_shrinker);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index f9492bccfd79..54f63c4a809a 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -185,6 +185,12 @@ static inline void z3fold_page_lock(struct z3fold_header *zhdr)
spin_lock(&zhdr->page_lock);
}
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+ return spin_trylock(&zhdr->page_lock);
+}
+
/* Unlock a z3fold page */
static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
{
@@ -385,7 +391,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
spin_lock(&pool->lock);
zhdr = list_first_entry_or_null(&pool->unbuddied[i],
struct z3fold_header, buddy);
- if (!zhdr) {
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
continue;
}
@@ -394,7 +400,6 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
- z3fold_page_lock(zhdr);
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
chunks >= zhdr->start_middle)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b7ee9c34dbd6..d41edd28298b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -276,7 +276,7 @@ struct zs_pool {
struct zspage {
struct {
unsigned int fullness:FULLNESS_BITS;
- unsigned int class:CLASS_BITS;
+ unsigned int class:CLASS_BITS + 1;
unsigned int isolated:ISOLATED_BITS;
unsigned int magic:MAGIC_VAL_BITS;
};