aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile1
-rw-r--r--mm/debug_vm_pgtable.c4
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/filemap.c61
-rw-r--r--mm/gup.c145
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/internal.h25
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/io-mapping.c29
-rw-r--r--mm/ioremap.c225
-rw-r--r--mm/kasan/common.c45
-rw-r--r--mm/kasan/generic.c12
-rw-r--r--mm/kasan/kasan.h24
-rw-r--r--mm/kasan/report_generic.c2
-rw-r--r--mm/kasan/shadow.c10
-rw-r--r--mm/kasan/sw_tags.c12
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/memcontrol.c672
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c191
-rw-r--r--mm/mempolicy.c76
-rw-r--r--mm/mempool.c4
-rw-r--r--mm/memremap.c2
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mm_init.c4
-rw-r--r--mm/mmap.c6
-rw-r--r--mm/mremap.c6
-rw-r--r--mm/msync.c6
-rw-r--r--mm/page-writeback.c9
-rw-r--r--mm/page_alloc.c370
-rw-r--r--mm/page_counter.c8
-rw-r--r--mm/page_owner.c68
-rw-r--r--mm/page_poison.c6
-rw-r--r--mm/percpu-vm.c7
-rw-r--r--mm/slab.c43
-rw-r--r--mm/slab.h17
-rw-r--r--mm/slab_common.c8
-rw-r--r--mm/slub.c87
-rw-r--r--mm/sparse.c1
-rw-r--r--mm/swap_state.c13
-rw-r--r--mm/util.c10
-rw-r--r--mm/vmalloc.c650
43 files changed, 1707 insertions, 1168 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d0808a23e54b..3636da27c385 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -9,7 +9,6 @@ config SELECT_MEMORY_MODEL
choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
- default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
help
@@ -871,4 +870,7 @@ config MAPPING_DIRTY_HELPERS
config KMAP_LOCAL
bool
+# struct io_mapping based helper. Selected by drivers that need them
+config IO_MAPPING
+ bool
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 72227b24a616..c0135e385984 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_IO_MAPPING) += io-mapping.o
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a9bd6ce1ba02..05efe98a9ac2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -247,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!arch_ioremap_pmd_supported())
+ if (!arch_vmap_pmd_supported(prot))
return;
pr_debug("Validating PMD huge\n");
@@ -385,7 +385,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!arch_ioremap_pud_supported())
+ if (!arch_vmap_pud_supported(prot))
return;
pr_debug("Validating PUD huge\n");
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f3791532fef2..16483f86360e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -157,7 +157,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
if (!retval)
return retval;
- strlcpy(retval->name, name, sizeof(retval->name));
+ strscpy(retval->name, name, sizeof(retval->name));
retval->dev = dev;
diff --git a/mm/filemap.c b/mm/filemap.c
index 151090fdcf29..5be57ba01d33 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -636,6 +636,49 @@ static bool mapping_needs_writeback(struct address_space *mapping)
}
/**
+ * filemap_range_needs_writeback - check if range potentially needs writeback
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback. Used by O_DIRECT
+ * read/write with IOCB_NOWAIT, to see if the caller needs to do
+ * filemap_write_and_wait_range() before proceeding.
+ *
+ * Return: %true if the caller should do filemap_write_and_wait_range() before
+ * doing O_DIRECT to a page in this range, %false otherwise.
+ */
+bool filemap_range_needs_writeback(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
+ struct page *page;
+
+ if (!mapping_needs_writeback(mapping))
+ return false;
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+ return false;
+ if (end_byte < start_byte)
+ return false;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, max) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
+ continue;
+ if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+ break;
+ }
+ rcu_read_unlock();
+ return page != NULL;
+}
+EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
+
+/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
* @lstart: offset in bytes where the range starts
@@ -1724,7 +1767,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* @mapping: the address_space to search
* @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
+ * Looks up the page cache slot at @mapping & @index. If there is a
* page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
@@ -2305,8 +2348,6 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
return error;
if (PageUptodate(page))
return 0;
- if (!page->mapping) /* page truncated */
- return AOP_TRUNCATED_PAGE;
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
@@ -2638,8 +2679,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
+ if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
@@ -2937,7 +2978,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
- struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
@@ -3024,14 +3064,8 @@ page_not_uptodate:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- ClearPageError(page);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- error = -EIO;
- }
+ error = filemap_read_page(file, mapping, page);
if (fpin)
goto out_retry;
put_page(page);
@@ -3039,7 +3073,6 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
- shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
diff --git a/mm/gup.c b/mm/gup.c
index ef7d2da9f03f..71e546e279fc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -213,6 +213,58 @@ void unpin_user_page(struct page *page)
}
EXPORT_SYMBOL(unpin_user_page);
+static inline void compound_range_next(unsigned long i, unsigned long npages,
+ struct page **list, struct page **head,
+ unsigned int *ntails)
+{
+ struct page *next, *page;
+ unsigned int nr = 1;
+
+ if (i >= npages)
+ return;
+
+ next = *list + i;
+ page = compound_head(next);
+ if (PageCompound(page) && compound_order(page) >= 1)
+ nr = min_t(unsigned int,
+ page + compound_nr(page) - next, npages - i);
+
+ *head = page;
+ *ntails = nr;
+}
+
+#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
+ for (__i = 0, \
+ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
+ __i < __npages; __i += __ntails, \
+ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
+
+static inline void compound_next(unsigned long i, unsigned long npages,
+ struct page **list, struct page **head,
+ unsigned int *ntails)
+{
+ struct page *page;
+ unsigned int nr;
+
+ if (i >= npages)
+ return;
+
+ page = compound_head(list[i]);
+ for (nr = i + 1; nr < npages; nr++) {
+ if (compound_head(list[nr]) != page)
+ break;
+ }
+
+ *head = page;
+ *ntails = nr - i;
+}
+
+#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
+ for (__i = 0, \
+ compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
+ __i < __npages; __i += __ntails, \
+ compound_next(__i, __npages, __list, &(__head), &(__ntails)))
+
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
@@ -239,20 +291,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
unsigned long index;
-
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
+ struct page *head;
+ unsigned int ntails;
if (!make_dirty) {
unpin_user_pages(pages, npages);
return;
}
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
+ for_each_compound_head(index, pages, npages, head, ntails) {
/*
* Checking PageDirty at this point may race with
* clear_page_dirty_for_io(), but that's OK. Two key
@@ -273,14 +320,50 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
* written back, so it gets written back again in the
* next writeback cycle. This is harmless.
*/
- if (!PageDirty(page))
- set_page_dirty_lock(page);
- unpin_user_page(page);
+ if (!PageDirty(head))
+ set_page_dirty_lock(head);
+ put_compound_head(head, ntails, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
+ * unpin_user_page_range_dirty_lock() - release and optionally dirty
+ * gup-pinned page range
+ *
+ * @page: the starting page of a range maybe marked dirty, and definitely released.
+ * @npages: number of consecutive pages to release.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page range" refers to a range of pages that has had one of the
+ * pin_user_pages() variants called on that page.
+ *
+ * For the page ranges defined by [page .. page+npages], make that range (or
+ * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
+ * page range was previously listed as clean.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), unpin_user_page().
+ *
+ */
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty)
+{
+ unsigned long index;
+ struct page *head;
+ unsigned int ntails;
+
+ for_each_compound_range(index, &page, npages, head, ntails) {
+ if (make_dirty && !PageDirty(head))
+ set_page_dirty_lock(head);
+ put_compound_head(head, ntails, FOLL_PIN);
+ }
+}
+EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
+
+/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
@@ -292,6 +375,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
+ struct page *head;
+ unsigned int ntails;
/*
* If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -300,13 +385,9 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
*/
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
- for (index = 0; index < npages; index++)
- unpin_user_page(pages[index]);
+
+ for_each_compound_head(index, pages, npages, head, ntails)
+ put_compound_head(head, ntails, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_pages);
@@ -435,18 +516,6 @@ retry:
}
}
- if (flags & FOLL_SPLIT && PageTransCompound(page)) {
- get_page(page);
- pte_unmap_unlock(ptep, ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return ERR_PTR(ret);
- goto retry;
- }
-
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
@@ -591,7 +660,7 @@ retry_locked:
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
+ if (flags & FOLL_SPLIT_PMD) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -600,19 +669,7 @@ retry_locked:
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else if (flags & FOLL_SPLIT) {
- if (unlikely(!try_get_page(page))) {
- spin_unlock(ptl);
- return ERR_PTR(-ENOMEM);
- }
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- } else { /* flags & FOLL_SPLIT_PMD */
+ } else {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a86a58ef132d..6c72433bec1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1616,7 +1616,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ page = __alloc_pages(gfp_mask, order, nid, nmask);
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
diff --git a/mm/internal.h b/mm/internal.h
index bbe900f9f095..ef5f336f59bd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -145,10 +145,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* family of functions.
*
* nodemask, migratetype and highest_zoneidx are initialized only once in
- * __alloc_pages_nodemask() and then never change.
+ * __alloc_pages() and then never change.
*
* zonelist, preferred_zone and highest_zoneidx are set first in
- * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * __alloc_pages() for the fast path, and might be later changed
* in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
@@ -446,7 +446,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-
+static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+}
#endif /* !CONFIG_MMU */
/*
@@ -637,4 +639,21 @@ struct migration_target_control {
gfp_t gfp_mask;
};
+/*
+ * mm/vmalloc.c
+ */
+#ifdef CONFIG_MMU
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift);
+#else
+static inline
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ return -EINVAL;
+}
+#endif
+
+void vunmap_range_noflush(unsigned long start, unsigned long end);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 11c75fb07584..32e390c42c53 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
unsigned long, shared.rb_subtree_last,
- vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+ vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
new file mode 100644
index 000000000000..01b362799930
--- /dev/null
+++ b/mm/io-mapping.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/io-mapping.h>
+
+/**
+ * io_mapping_map_user - remap an I/O mapping to userspace
+ * @iomap: the source io_mapping
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, unsigned long size)
+{
+ vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+
+ if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
+ return -EINVAL;
+
+ /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+ return remap_pfn_range_notrack(vma, addr, pfn, size,
+ __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+}
+EXPORT_SYMBOL_GPL(io_mapping_map_user);
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fa1ab41d152..d1dcc7e744ac 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,237 +16,22 @@
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
+static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT;
static int __init set_nohugeiomap(char *str)
{
- ioremap_huge_disabled = 1;
+ iomap_max_page_shift = P4D_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
- if (!ioremap_huge_disabled) {
- if (arch_ioremap_p4d_supported())
- ioremap_p4d_capable = 1;
- if (arch_ioremap_pud_supported())
- ioremap_pud_capable = 1;
- if (arch_ioremap_pmd_supported())
- ioremap_pmd_capable = 1;
- }
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
- return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
- return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
- return ioremap_pmd_capable;
-}
-
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const bool iomap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pte_t *pte;
- u64 pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel_track(pmd, addr, mask);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- *mask |= PGTBL_PTE_MODIFIED;
- return 0;
-}
-
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pmd_enabled())
- return 0;
-
- if ((end - addr) != PMD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PMD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PMD_SIZE))
- return 0;
-
- if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
- return 0;
-
- return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
-
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PMD_MODIFIED;
- continue;
- }
-
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pud_enabled())
- return 0;
-
- if ((end - addr) != PUD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PUD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PUD_SIZE))
- return 0;
-
- if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
- return 0;
-
- return pud_set_huge(pud, phys_addr, prot);
-}
-
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc_track(&init_mm, p4d, addr, mask);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
-
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PUD_MODIFIED;
- continue;
- }
-
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_p4d_enabled())
- return 0;
-
- if ((end - addr) != P4D_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, P4D_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, P4D_SIZE))
- return 0;
-
- if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
- return 0;
-
- return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
- if (!p4d)
- return -ENOMEM;
- do {
- next = p4d_addr_end(addr, end);
-
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_P4D_MODIFIED;
- continue;
- }
-
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
- pgd_t *pgd;
- unsigned long start;
- unsigned long next;
- int err;
- pgtbl_mod_mask mask = 0;
-
- might_sleep();
- BUG_ON(addr >= end);
-
- start = addr;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
- &mask);
- if (err)
- break;
- } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
- flush_cache_vmap(start, end);
-
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
-
- return err;
+ return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
}
#ifdef CONFIG_GENERIC_IOREMAP
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 7b53291dafa1..6bb87f2acd4e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -60,7 +60,7 @@ void kasan_disable_current(void)
void __kasan_unpoison_range(const void *address, size_t size)
{
- kasan_unpoison(address, size);
+ kasan_unpoison(address, size, false);
}
#ifdef CONFIG_KASAN_STACK
@@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task)
{
void *base = task_stack_page(task);
- kasan_unpoison(base, THREAD_SIZE);
+ kasan_unpoison(base, THREAD_SIZE, false);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- kasan_unpoison(base, watermark - base);
+ kasan_unpoison(base, watermark - base, false);
}
#endif /* CONFIG_KASAN_STACK */
@@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void)
return 0;
}
-void __kasan_alloc_pages(struct page *page, unsigned int order)
+void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
@@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order)
tag = kasan_random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison(page_address(page), PAGE_SIZE << order);
+ kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
-void __kasan_free_pages(struct page *page, unsigned int order)
+void __kasan_free_pages(struct page *page, unsigned int order, bool init)
{
if (likely(!PageHighMem(page)))
kasan_poison(page_address(page), PAGE_SIZE << order,
- KASAN_FREE_PAGE);
+ KASAN_FREE_PAGE, init);
}
/*
@@ -251,18 +251,18 @@ void __kasan_poison_slab(struct page *page)
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
kasan_poison(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_unpoison(object, cache->object_size);
+ kasan_unpoison(object, cache->object_size, false);
}
void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
}
/*
@@ -322,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
return (void *)object;
}
-static inline bool ____kasan_slab_free(struct kmem_cache *cache,
- void *object, unsigned long ip, bool quarantine)
+static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool quarantine, bool init)
{
u8 tag;
void *tagged_object;
@@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
}
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_FREE);
+ KASAN_KMALLOC_FREE, init);
if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
@@ -362,9 +362,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
return kasan_quarantine_put(cache, object);
}
-bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool init)
{
- return ____kasan_slab_free(cache, object, ip, true);
+ return ____kasan_slab_free(cache, object, ip, true, init);
}
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
@@ -407,9 +408,9 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
if (unlikely(!PageSlab(page))) {
if (____kasan_kfree_large(ptr, ip))
return;
- kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
+ kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
} else {
- ____kasan_slab_free(page->slab_cache, ptr, ip, false);
+ ____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
}
}
@@ -428,7 +429,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object,
}
void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
- void *object, gfp_t flags)
+ void *object, gfp_t flags, bool init)
{
u8 tag;
void *tagged_object;
@@ -453,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
* Unpoison the whole object.
* For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
*/
- kasan_unpoison(tagged_object, cache->object_size);
+ kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
if (kasan_stack_collection_enabled())
@@ -496,7 +497,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
redzone_end = round_up((unsigned long)(object + cache->object_size),
KASAN_GRANULE_SIZE);
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
/*
* Save alloc info (if possible) for kmalloc() allocations.
@@ -546,7 +547,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
KASAN_GRANULE_SIZE);
redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
+ KASAN_PAGE_REDZONE, false);
return (void *)ptr;
}
@@ -563,7 +564,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
* Part of it might already have been unpoisoned, but it's unknown
* how big that part is.
*/
- kasan_unpoison(object, size);
+ kasan_unpoison(object, size, false);
page = virt_to_head_page(object);
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 2e55e0f82f39..53cbf28859b5 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -208,11 +208,11 @@ static void register_global(struct kasan_global *global)
{
size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- kasan_unpoison(global->beg, global->size);
+ kasan_unpoison(global->beg, global->size, false);
kasan_poison(global->beg + aligned_size,
global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
+ KASAN_GLOBAL_REDZONE, false);
}
void __asan_register_globals(struct kasan_global *globals, size_t size)
@@ -292,11 +292,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
kasan_unpoison((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
+ size - rounded_down_size, false);
kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
+ KASAN_ALLOCA_LEFT, false);
kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
+ KASAN_ALLOCA_RIGHT, false);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -306,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- kasan_unpoison(stack_top, stack_bottom - stack_top);
+ kasan_unpoison(stack_top, stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c1581e8a9b8e..3820ca54743b 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -163,7 +163,7 @@ struct kasan_alloc_meta {
struct kasan_track alloc_track;
#ifdef CONFIG_KASAN_GENERIC
/*
- * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The auxiliary stack is stored into struct kasan_alloc_meta.
* The free stack is stored into struct kasan_free_meta.
*/
depot_stack_handle_t aux_stack[2];
@@ -314,7 +314,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define arch_get_mem_tag(addr) (0xFF)
#endif
#ifndef arch_set_mem_tag_range
-#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr))
+#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr))
#endif
#define hw_enable_tagging_sync() arch_enable_tagging_sync()
@@ -324,7 +324,8 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
-#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+#define hw_set_mem_tag_range(addr, size, tag, init) \
+ arch_set_mem_tag_range((addr), (size), (tag), (init))
#else /* CONFIG_KASAN_HW_TAGS */
@@ -358,7 +359,7 @@ static inline u8 kasan_random_tag(void) { return 0; }
#ifdef CONFIG_KASAN_HW_TAGS
-static inline void kasan_poison(const void *addr, size_t size, u8 value)
+static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
addr = kasan_reset_tag(addr);
@@ -371,10 +372,10 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value)
if (WARN_ON(size & KASAN_GRANULE_MASK))
return;
- hw_set_mem_tag_range((void *)addr, size, value);
+ hw_set_mem_tag_range((void *)addr, size, value, init);
}
-static inline void kasan_unpoison(const void *addr, size_t size)
+static inline void kasan_unpoison(const void *addr, size_t size, bool init)
{
u8 tag = get_tag(addr);
@@ -388,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- hw_set_mem_tag_range((void *)addr, size, tag);
+ hw_set_mem_tag_range((void *)addr, size, tag, init);
}
static inline bool kasan_byte_accessible(const void *addr)
@@ -396,8 +397,7 @@ static inline bool kasan_byte_accessible(const void *addr)
u8 ptr_tag = get_tag(addr);
u8 mem_tag = hw_get_mem_tag((void *)addr);
- return (mem_tag != KASAN_TAG_INVALID) &&
- (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
+ return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag;
}
#else /* CONFIG_KASAN_HW_TAGS */
@@ -407,22 +407,24 @@ static inline bool kasan_byte_accessible(const void *addr)
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size, must be aligned to KASAN_GRANULE_SIZE
* @value - value that's written to metadata for the range
+ * @init - whether to initialize the memory range (only for hardware tag-based)
*
* The size gets aligned to KASAN_GRANULE_SIZE before marking the range.
*/
-void kasan_poison(const void *addr, size_t size, u8 value);
+void kasan_poison(const void *addr, size_t size, u8 value, bool init);
/**
* kasan_unpoison - mark the memory range as accessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size, can be unaligned
+ * @init - whether to initialize the memory range (only for hardware tag-based)
*
* For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
* marking the range.
* For the generic mode, the last granule of the memory range gets partially
* unpoisoned based on the @size.
*/
-void kasan_unpoison(const void *addr, size_t size);
+void kasan_unpoison(const void *addr, size_t size, bool init);
bool kasan_byte_accessible(const void *addr);
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index de732bc341c5..139615ef326b 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -148,7 +148,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr,
}
/* Copy token (+ 1 byte for '\0'). */
- strlcpy(token, *frame_descr, tok_len + 1);
+ strscpy(token, *frame_descr, tok_len + 1);
}
/* Advance frame_descr past separator. */
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 63f43443f5d7..727ad4629173 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len)
return __memcpy(dest, src, len);
}
-void kasan_poison(const void *addr, size_t size, u8 value)
+void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
void *shadow_start, *shadow_end;
@@ -106,7 +106,7 @@ void kasan_poison_last_granule(const void *addr, size_t size)
}
#endif
-void kasan_unpoison(const void *addr, size_t size)
+void kasan_unpoison(const void *addr, size_t size, bool init)
{
u8 tag = get_tag(addr);
@@ -129,7 +129,7 @@ void kasan_unpoison(const void *addr, size_t size)
return;
/* Unpoison all granules that cover the object. */
- kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag);
+ kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false);
/* Partially poison the last granule for the generic mode. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -344,7 +344,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- kasan_poison(start, size, KASAN_VMALLOC_INVALID);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
}
void kasan_unpoison_vmalloc(const void *start, unsigned long size)
@@ -352,7 +352,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size)
if (!is_vmalloc_or_module_addr(start))
return;
- kasan_unpoison(start, size);
+ kasan_unpoison(start, size, false);
}
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 94c2d33be333..9df8e7f69e87 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
bool kasan_byte_accessible(const void *addr)
{
u8 tag = get_tag(addr);
- u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
+ void *untagged_addr = kasan_reset_tag(addr);
+ u8 shadow_byte;
- return (shadow_byte != KASAN_TAG_INVALID) &&
- (tag == KASAN_TAG_KERNEL || tag == shadow_byte);
+ if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))
+ return false;
+
+ shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr));
+ return tag == KASAN_TAG_KERNEL || tag == shadow_byte;
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
@@ -159,7 +163,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- kasan_poison((void *)addr, size, tag);
+ kasan_poison((void *)addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fe6e3ae8e8c6..92a2d4885808 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1203,7 +1203,7 @@ static void update_refs(struct kmemleak_object *object)
}
/*
- * Memory scanning is a long process and it needs to be interruptable. This
+ * Memory scanning is a long process and it needs to be interruptible. This
* function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e064ac0d850a..c100265dc393 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,10 +255,8 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;
-static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages);
-static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages);
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -295,7 +293,7 @@ static void obj_cgroup_release(struct percpu_ref *ref)
spin_lock_irqsave(&css_set_lock, flags);
memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
- __memcg_kmem_uncharge(memcg, nr_pages);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
list_del(&objcg->list);
mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
@@ -414,13 +412,14 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
int size, int old_size)
{
struct memcg_shrinker_map *new, *old;
+ struct mem_cgroup_per_node *pn;
int nid;
lockdep_assert_held(&memcg_shrinker_map_mutex);
for_each_node(nid) {
- old = rcu_dereference_protected(
- mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+ pn = memcg->nodeinfo[nid];
+ old = rcu_dereference_protected(pn->shrinker_map, true);
/* Not yet online memcg */
if (!old)
return 0;
@@ -433,7 +432,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
memset(new->map, (int)0xff, old_size);
memset((void *)new->map + old_size, 0, size - old_size);
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+ rcu_assign_pointer(pn->shrinker_map, new);
call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
}
@@ -450,7 +449,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
return;
for_each_node(nid) {
- pn = mem_cgroup_nodeinfo(memcg, nid);
+ pn = memcg->nodeinfo[nid];
map = rcu_dereference_protected(pn->shrinker_map, true);
kvfree(map);
rcu_assign_pointer(pn->shrinker_map, NULL);
@@ -713,7 +712,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
+ mz = memcg->nodeinfo[nid];
mctz = soft_limit_tree_node(nid);
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
@@ -764,28 +763,37 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x, threshold = MEMCG_CHARGE_BATCH;
-
if (mem_cgroup_disabled())
return;
- if (memcg_stat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+}
- x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > threshold)) {
- struct mem_cgroup *mi;
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->vmstats.state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->stat[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmstats[idx]);
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
+#ifdef CONFIG_SMP
+ if (x < 0)
x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+#endif
+ return x;
}
static struct mem_cgroup_per_node *
@@ -796,7 +804,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
parent = parent_mem_cgroup(pn->memcg);
if (!parent)
return NULL;
- return mem_cgroup_nodeinfo(parent, nid);
+ return parent->nodeinfo[nid];
}
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -855,18 +863,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
- struct mem_cgroup *memcg = page_memcg(head);
+ struct mem_cgroup *memcg;
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
+ rcu_read_lock();
+ memcg = page_memcg(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
+ rcu_read_unlock();
__mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);
@@ -903,30 +915,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
- unsigned long x;
-
if (mem_cgroup_disabled())
return;
- x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
- if (unlikely(x > MEMCG_CHARGE_BATCH)) {
- struct mem_cgroup *mi;
-
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->events[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmevents[idx]);
- x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+ __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return atomic_long_read(&memcg->vmevents[event]);
+ return READ_ONCE(memcg->vmstats.events[event]);
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
@@ -935,7 +933,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
int cpu;
for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
return x;
}
@@ -1055,20 +1053,6 @@ static __always_inline struct mem_cgroup *active_memcg(void)
return current->active_memcg;
}
-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = active_memcg();
- /* remote memcg must hold a ref. */
- if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- rcu_read_unlock();
-
- return memcg;
-}
-
static __always_inline bool memcg_kmem_bypass(void)
{
/* Allow remote memcg charging from any context. */
@@ -1083,20 +1067,6 @@ static __always_inline bool memcg_kmem_bypass(void)
}
/**
- * If active memcg is set, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
- if (memcg_kmem_bypass())
- return NULL;
-
- if (unlikely(active_memcg()))
- return get_active_memcg();
-
- return get_mem_cgroup_from_mm(current->mm);
-}
-
-/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
@@ -1136,7 +1106,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (reclaim) {
struct mem_cgroup_per_node *mz;
- mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+ mz = root->nodeinfo[reclaim->pgdat->node_id];
iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
@@ -1238,7 +1208,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(from, nid);
+ mz = from->nodeinfo[nid];
iter = &mz->iter;
cmpxchg(&iter->position, dead_memcg, NULL);
}
@@ -1571,6 +1541,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*
* Current memory state:
*/
+ cgroup_rstat_flush(memcg->css.cgroup);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
@@ -2118,11 +2089,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
* This function protects unlocked LRU pages from being moved to
* another cgroup.
*
- * It ensures lifetime of the returned memcg. Caller is responsible
- * for the lifetime of the page; __unlock_page_memcg() is available
- * when @page might get freed inside the locked section.
+ * It ensures lifetime of the locked memcg. Caller is responsible
+ * for the lifetime of the page.
*/
-struct mem_cgroup *lock_page_memcg(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
@@ -2132,21 +2102,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page itself from being freed. E.g. writeback
- * doesn't hold a page reference and relies on PG_writeback to
- * keep off truncation, migration and so forth.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page_memcg(head);
if (unlikely(!memcg))
- return NULL;
+ return;
#ifdef CONFIG_PROVE_LOCKING
local_irq_save(flags);
@@ -2155,7 +2119,7 @@ again:
#endif
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page_memcg(head)) {
@@ -2164,24 +2128,17 @@ again:
}
/*
- * When charge migration first begins, we can have locked and
- * unlocked page stat updates happening concurrently. Track
- * the task who has the lock for unlock_page_memcg().
+ * When charge migration first begins, we can have multiple
+ * critical sections holding the fast-path RCU lock and one
+ * holding the slowpath move_lock. Track the task who has the
+ * move_lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
-
- return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
-/**
- * __unlock_page_memcg - unlock and unpin a memcg
- * @memcg: the memcg
- *
- * Unlock and unpin a memcg returned by lock_page_memcg().
- */
-void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __unlock_page_memcg(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -2381,50 +2338,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static int memcg_hotplug_cpu_dead(unsigned int cpu)
+static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
{
- struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg, *mi;
-
- stock = &per_cpu(memcg_stock, cpu);
- drain_stock(stock);
+ int nid;
- for_each_mem_cgroup(memcg) {
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ unsigned long stat[NR_VM_NODE_STAT_ITEMS];
+ struct batched_lruvec_stat *lstatc;
int i;
- for (i = 0; i < MEMCG_NR_STAT; i++) {
- int nid;
- long x;
-
- x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmstats[i]);
-
- if (i >= NR_VM_NODE_STAT_ITEMS)
- continue;
+ lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ stat[i] = lstatc->count[i];
+ lstatc->count[i] = 0;
+ }
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn;
+ do {
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
+ }
+}
- pn = mem_cgroup_nodeinfo(memcg, nid);
- x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
- if (x)
- do {
- atomic_long_add(x, &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
- }
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
+{
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg;
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- long x;
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
- x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmevents[i]);
- }
- }
+ for_each_mem_cgroup(memcg)
+ memcg_flush_lruvec_page_state(memcg, cpu);
return 0;
}
@@ -2793,9 +2739,6 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (gfp_mask & __GFP_NOFAIL)
- goto force;
-
if (fatal_signal_pending(current))
goto force;
@@ -2905,6 +2848,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
page->memcg_data = (unsigned long)memcg;
}
+static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(objcg);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ return memcg;
+}
+
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page)
@@ -3056,23 +3013,45 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
-/**
- * __memcg_kmem_charge: charge a number of kernel pages to a memcg
- * @memcg: memory cgroup to charge
+/*
+ * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
+ * @objcg: object cgroup to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
+}
+
+/*
+ * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
+ * @objcg: object cgroup to charge
* @gfp: reclaim mode
* @nr_pages: number of pages to charge
*
* Returns 0 on success, an error code on failure.
*/
-static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
+static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
+ unsigned int nr_pages)
{
struct page_counter *counter;
+ struct mem_cgroup *memcg;
int ret;
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
- return ret;
+ goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
@@ -3084,25 +3063,15 @@ static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
*/
if (gfp & __GFP_NOFAIL) {
page_counter_charge(&memcg->kmem, nr_pages);
- return 0;
+ goto out;
}
cancel_charge(memcg, nr_pages);
- return -ENOMEM;
+ ret = -ENOMEM;
}
- return 0;
-}
-
-/**
- * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
+out:
+ css_put(&memcg->css);
- refill_stock(memcg, nr_pages);
+ return ret;
}
/**
@@ -3115,18 +3084,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
*/
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
int ret = 0;
- memcg = get_mem_cgroup_from_current();
- if (memcg && !mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ objcg = get_obj_cgroup_from_current();
+ if (objcg) {
+ ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
- page->memcg_data = (unsigned long)memcg |
+ page->memcg_data = (unsigned long)objcg |
MEMCG_DATA_KMEM;
return 0;
}
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
return ret;
}
@@ -3138,16 +3107,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page_memcg(page);
+ struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;
- if (!memcg)
+ if (!PageMemcgKmem(page))
return;
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge(memcg, nr_pages);
+ objcg = __page_objcg(page);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
page->memcg_data = 0;
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3180,11 +3149,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- if (nr_pages) {
- rcu_read_lock();
- __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
- rcu_read_unlock();
- }
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(old, nr_pages);
/*
* The leftover is flushed to the centralized per-memcg value.
@@ -3242,7 +3208,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
- struct mem_cgroup *memcg;
unsigned int nr_pages, nr_bytes;
int ret;
@@ -3259,24 +3224,16 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
* refill_obj_stock(), called from this function or
* independently later.
*/
- rcu_read_lock();
-retry:
- memcg = obj_cgroup_memcg(objcg);
- if (unlikely(!css_tryget(&memcg->css)))
- goto retry;
- rcu_read_unlock();
-
nr_pages = size >> PAGE_SHIFT;
nr_bytes = size & (PAGE_SIZE - 1);
if (nr_bytes)
nr_pages += 1;
- ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
if (!ret && nr_bytes)
refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
- css_put(&memcg->css);
return ret;
}
@@ -3300,7 +3257,11 @@ void split_page_memcg(struct page *head, unsigned int nr)
for (i = 1; i < nr; i++)
head[i].memcg_data = head->memcg_data;
- css_get_many(&memcg->css, nr - 1);
+
+ if (PageMemcgKmem(head))
+ obj_cgroup_get_many(__page_objcg(head), nr - 1);
+ else
+ css_get_many(&memcg->css, nr - 1);
}
#ifdef CONFIG_MEMCG_SWAP
@@ -3549,6 +3510,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
+ cgroup_rstat_flush(memcg->css.cgroup);
val = memcg_page_state(memcg, NR_FILE_PAGES) +
memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
@@ -3613,57 +3575,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
-{
- unsigned long stat[MEMCG_NR_STAT] = {0};
- struct mem_cgroup *mi;
- int node, cpu, i;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < MEMCG_NR_STAT; i++)
- atomic_long_add(stat[i], &mi->vmstats[i]);
-
- for_each_node(node) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- struct mem_cgroup_per_node *pi;
-
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] += per_cpu(
- pn->lruvec_stat_cpu->count[i], cpu);
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pi->lruvec_stat[i]);
- }
-}
-
-static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
-{
- unsigned long events[NR_VM_EVENT_ITEMS];
- struct mem_cgroup *mi;
- int cpu, i;
-
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += per_cpu(memcg->vmstats_percpu->events[i],
- cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- atomic_long_add(events[i], &mi->vmevents[i]);
-}
-
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3980,6 +3891,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
@@ -4050,6 +3963,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4108,7 +4023,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
- mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ mz = memcg->nodeinfo[pgdat->node_id];
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
@@ -4137,7 +4052,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > 100)
return -EINVAL;
- if (css->parent)
+ if (!mem_cgroup_is_root(memcg))
memcg->swappiness = val;
else
vm_swappiness = val;
@@ -4487,7 +4402,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!css->parent || !((val == 0) || (val == 1)))
+ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
memcg->oom_kill_disable = val;
@@ -4526,22 +4441,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return &memcg->cgwb_domain;
}
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page().
- */
-static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = atomic_long_read(&memcg->vmstats[idx]);
- int cpu;
-
- for_each_online_cpu(cpu)
- x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
- if (x < 0)
- x = 0;
- return x;
-}
-
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -4567,13 +4466,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+ cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
- *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
- memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
- *pheadroom = PAGE_COUNTER_MAX;
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_page_state(memcg, NR_ACTIVE_FILE);
+ *pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
READ_ONCE(memcg->memory.high));
@@ -5205,19 +5105,20 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
- free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ int cpu;
+
memcg_wb_domain_exit(memcg);
/*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
+ * Flush percpu lruvec stats to guarantee the value
+ * correctness on parent's and all ancestor levels.
*/
- memcg_flush_percpu_vmstats(memcg);
- memcg_flush_percpu_vmevents(memcg);
+ for_each_online_cpu(cpu)
+ memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5244,11 +5145,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
- memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
- GFP_KERNEL_ACCOUNT);
- if (!memcg->vmstats_local)
- goto fail;
-
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
@@ -5448,6 +5344,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct memcg_vmstats_percpu *statc;
+ long delta, v;
+ int i;
+
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ /*
+ * Collect the aggregated propagation counts of groups
+ * below us. We're in a per-cpu loop here and this is
+ * a global counter, so the first cycle will get them.
+ */
+ delta = memcg->vmstats.state_pending[i];
+ if (delta)
+ memcg->vmstats.state_pending[i] = 0;
+
+ /* Add CPU changes on this level since the last flush */
+ v = READ_ONCE(statc->state[i]);
+ if (v != statc->state_prev[i]) {
+ delta += v - statc->state_prev[i];
+ statc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ /* Aggregate counts on this level and propagate upwards */
+ memcg->vmstats.state[i] += delta;
+ if (parent)
+ parent->vmstats.state_pending[i] += delta;
+ }
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ delta = memcg->vmstats.events_pending[i];
+ if (delta)
+ memcg->vmstats.events_pending[i] = 0;
+
+ v = READ_ONCE(statc->events[i]);
+ if (v != statc->events_prev[i]) {
+ delta += v - statc->events_prev[i];
+ statc->events_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ memcg->vmstats.events[i] += delta;
+ if (parent)
+ parent->vmstats.events_pending[i] += delta;
+ }
+}
+
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
static int mem_cgroup_do_precharge(unsigned long count)
@@ -6501,6 +6453,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
+ .css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
@@ -6683,6 +6636,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
+static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+ gfp_t gfp)
+{
+ unsigned int nr_pages = thp_nr_pages(page);
+ int ret;
+
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret)
+ goto out;
+
+ css_get(&memcg->css);
+ commit_charge(page, memcg);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ memcg_check_events(memcg, page);
+ local_irq_enable();
+out:
+ return ret;
+}
+
/**
* mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
@@ -6692,55 +6666,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
+ * Do not use this for pages allocated for swapin.
+ *
* Returns 0 on success. Otherwise, an error code is returned.
*/
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
- unsigned int nr_pages = thp_nr_pages(page);
- struct mem_cgroup *memcg = NULL;
- int ret = 0;
+ struct mem_cgroup *memcg;
+ int ret;
if (mem_cgroup_disabled())
- goto out;
+ return 0;
- if (PageSwapCache(page)) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id;
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+ css_put(&memcg->css);
- /*
- * Every swap fault against a single page tries to charge the
- * page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page and memcg binding is
- * protected by the page lock, which serializes swap cache
- * removal, which in turn serializes uncharging.
- */
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (page_memcg(compound_head(page)))
- goto out;
+ return ret;
+}
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+/**
+ * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the page is allocated
+ *
+ * This function charges a page allocated for swapin. Please call this before
+ * adding the page to the swapcache.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+ gfp_t gfp, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+ int ret;
- if (!memcg)
+ if (mem_cgroup_disabled())
+ return 0;
+
+ id = lookup_swap_cgroup_id(entry);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (!memcg || !css_tryget_online(&memcg->css))
memcg = get_mem_cgroup_from_mm(mm);
+ rcu_read_unlock();
- ret = try_charge(memcg, gfp_mask, nr_pages);
- if (ret)
- goto out_put;
+ ret = __mem_cgroup_charge(page, memcg, gfp);
- css_get(&memcg->css);
- commit_charge(page, memcg);
-
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
- memcg_check_events(memcg, page);
- local_irq_enable();
+ css_put(&memcg->css);
+ return ret;
+}
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which swap slot is being uncharged
+ * is order 0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
/*
* Cgroup1's unified memory+swap counter has been charged with the
* new swapcache page, finish the transfer by uncharging the swap
@@ -6753,25 +6743,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
- if (do_memsw_account() && PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
+ if (!mem_cgroup_disabled() && do_memsw_account()) {
/*
* The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry, nr_pages);
+ mem_cgroup_uncharge_swap(entry, 1);
}
-
-out_put:
- css_put(&memcg->css);
-out:
- return ret;
}
struct uncharge_gather {
struct mem_cgroup *memcg;
- unsigned long nr_pages;
+ unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
struct page *dummy_page;
@@ -6786,10 +6770,10 @@ static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
- if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+ if (ug->nr_memory) {
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
@@ -6797,7 +6781,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -6808,40 +6792,60 @@ static void uncharge_batch(const struct uncharge_gather *ug)
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
unsigned long nr_pages;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page_memcg(page))
- return;
-
/*
* Nobody should be changing or seriously looking at
- * page_memcg(page) at this point, we have fully
+ * page memcg or objcg at this point, we have fully
* exclusive access to the page.
*/
+ if (PageMemcgKmem(page)) {
+ objcg = __page_objcg(page);
+ /*
+ * This get matches the put at the end of the function and
+ * kmem pages do not hold memcg references anymore.
+ */
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ } else {
+ memcg = __page_memcg(page);
+ }
+
+ if (!memcg)
+ return;
- if (ug->memcg != page_memcg(page)) {
+ if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page_memcg(page);
+ ug->memcg = memcg;
+ ug->dummy_page = page;
/* pairs with css_put in uncharge_batch */
- css_get(&ug->memcg->css);
+ css_get(&memcg->css);
}
nr_pages = compound_nr(page);
- ug->nr_pages += nr_pages;
- if (PageMemcgKmem(page))
+ if (PageMemcgKmem(page)) {
+ ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
- else
+
+ page->memcg_data = 0;
+ obj_cgroup_put(objcg);
+ } else {
+ /* LRU pages aren't accounted at the root level */
+ if (!mem_cgroup_is_root(memcg))
+ ug->nr_memory += nr_pages;
ug->pgpgout++;
- ug->dummy_page = page;
- page->memcg_data = 0;
- css_put(&ug->memcg->css);
+ page->memcg_data = 0;
+ }
+
+ css_put(&memcg->css);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 24210c9bd843..bd3945446d47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1368,7 +1368,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* communicated in siginfo, see kill_proc()
*/
start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ unmap_mapping_range(page->mapping, start, size, 0);
}
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..cbdc2cd9cedb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2260,26 +2260,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
+ * must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
- unsigned long remap_pfn = pfn;
int err;
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2309,10 +2300,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
- if (err)
- return -EINVAL;
-
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
@@ -2324,12 +2311,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
- break;
+ return err;
} while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int err;
+
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
if (err)
- untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+ return -EINVAL;
+ err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+ if (err)
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size));
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -2446,13 +2457,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
}
do {
next = pmd_addr_end(addr, end);
- if (create || !pmd_none_or_clear_bad(pmd)) {
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pmd_none(*pmd) && !create)
+ continue;
+ if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+ return -EINVAL;
+ if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+ if (!create)
+ continue;
+ pmd_clear_bad(pmd);
}
+ err = apply_to_pte_range(mm, pmd, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
+
return err;
}
@@ -2474,13 +2493,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
}
do {
next = pud_addr_end(addr, end);
- if (create || !pud_none_or_clear_bad(pud)) {
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pud_none(*pud) && !create)
+ continue;
+ if (WARN_ON_ONCE(pud_leaf(*pud)))
+ return -EINVAL;
+ if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+ if (!create)
+ continue;
+ pud_clear_bad(pud);
}
+ err = apply_to_pmd_range(mm, pud, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
+
return err;
}
@@ -2502,13 +2529,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
}
do {
next = p4d_addr_end(addr, end);
- if (create || !p4d_none_or_clear_bad(p4d)) {
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (p4d_none(*p4d) && !create)
+ continue;
+ if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+ return -EINVAL;
+ if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+ if (!create)
+ continue;
+ p4d_clear_bad(p4d);
}
+ err = apply_to_pud_range(mm, p4d, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (p4d++, addr = next, addr != end);
+
return err;
}
@@ -2528,9 +2563,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (!create && pgd_none_or_clear_bad(pgd))
+ if (pgd_none(*pgd) && !create)
continue;
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return -EINVAL;
+ if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+ if (!create)
+ continue;
+ pgd_clear_bad(pgd);
+ }
+ err = apply_to_p4d_range(mm, pgd, addr, next,
+ fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
@@ -3309,28 +3352,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
- int err;
-
__SetPageLocked(page);
__SetPageSwapBacked(page);
- set_page_private(page, entry.val);
- /* Tell memcg to use swap ownership records */
- SetPageSwapCache(page);
- err = mem_cgroup_charge(page, vma->vm_mm,
- GFP_KERNEL);
- ClearPageSwapCache(page);
- if (err) {
+ if (mem_cgroup_swapin_charge_page(page,
+ vma->vm_mm, GFP_KERNEL, entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
+ mem_cgroup_swapin_uncharge_swap(entry);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(page, shadow);
lru_cache_add(page);
+
+ /* To provide entry to swap_readpage() */
+ set_page_private(page, entry.val);
swap_readpage(page, true);
+ set_page_private(page, 0);
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
@@ -4100,7 +4141,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
- bool migrated = false;
pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -4117,29 +4157,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
goto out;
}
- /*
- * Make it present again, Depending on how arch implementes non
- * accessible ptes, some can allow access by kernel mode.
- */
- old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ /* Get the normal PTE */
+ old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
- pte = pte_mkyoung(pte);
- if (was_writable)
- pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
- if (!page) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (!page)
+ goto out_map;
/* TODO: handle PTE-mapped THP */
- if (PageCompound(page)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (PageCompound(page))
+ goto out_map;
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4149,7 +4177,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
- if (!pte_write(pte))
+ if (!was_writable)
flags |= TNF_NO_GROUP;
/*
@@ -4163,24 +4191,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
- goto out;
+ goto out_map;
}
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
- if (migrated) {
+ if (migrate_misplaced_page(page, vma, target_nid)) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
+out_map:
+ /*
+ * Make it present again, depending on how arch implements
+ * non-accessible ptes, some can allow access by kernel mode.
+ */
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (was_writable)
+ pte = pte_mkwrite(pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ab51132547b8..cd0295567a04 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2140,7 +2140,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
{
struct page *page;
- page = __alloc_pages(gfp, order, nid);
+ page = __alloc_pages(gfp, order, nid, NULL);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
@@ -2153,30 +2153,22 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
}
/**
- * alloc_pages_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the GFP allocation.
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: For hugepages try only the preferred node if possible.
*
- * @gfp:
- * %GFP_USER user allocation.
- * %GFP_KERNEL kernel allocations,
- * %GFP_HIGHMEM highmem/user allocations,
- * %GFP_FS allocation should not call back into a file system.
- * %GFP_ATOMIC don't sleep.
+ * Allocate a page for a specific address in @vma, using the appropriate
+ * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
+ * of the mm_struct of the VMA to prevent it from going away. Should be
+ * used for all allocations for pages that will be mapped into user space.
*
- * @order:Order of the GFP allocation.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual Address of the allocation. Must be inside the VMA.
- * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
- *
- * This function allocates a page from the kernel page pool and applies
- * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must read-lock the mmap_lock of the
- * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into user space. Returns
- * NULL when no page can be allocated.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *
-alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
@@ -2237,7 +2229,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
- page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
+ page = __alloc_pages(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;
@@ -2245,21 +2237,20 @@ out:
EXPORT_SYMBOL(alloc_pages_vma);
/**
- * alloc_pages_current - Allocate pages.
+ * alloc_pages - Allocate pages.
+ * @gfp: GFP flags.
+ * @order: Power of two of number of pages to allocate.
*
- * @gfp:
- * %GFP_USER user allocation,
- * %GFP_KERNEL kernel allocation,
- * %GFP_HIGHMEM highmem allocation,
- * %GFP_FS don't call back into a file system.
- * %GFP_ATOMIC don't sleep.
- * @order: Power of two of allocation size in pages. 0 is a single page.
+ * Allocate 1 << @order contiguous pages. The physical address of the
+ * first page is naturally aligned (eg an order-3 allocation will be aligned
+ * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
+ * process is honoured when in process context.
*
- * Allocate a page from the kernel page pool. When not in
- * interrupt context and apply the current process NUMA policy.
- * Returns NULL when no page can be allocated.
+ * Context: Can be called from any context, providing the appropriate GFP
+ * flags are used.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
@@ -2274,13 +2265,13 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
- page = __alloc_pages_nodemask(gfp, order,
+ page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages);
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
@@ -2457,14 +2448,11 @@ static void sp_free(struct sp_node *n)
* @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
- * node id.
- *
- * Returns:
- * -1 - not misplaced, page is in the right node
- * node - node id where the page should be
- *
- * Policy determination "mimics" alloc_page_vma().
+ * node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
+ *
+ * Return: -1 if the page is in a node that is valid for this policy, or a
+ * suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/mempool.c b/mm/mempool.c
index 79959fac27d7..fe19d290a301 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
- kasan_free_pages(element, (unsigned long)pool->pool_data);
+ kasan_free_pages(element, (unsigned long)pool->pool_data, false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +114,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
- kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+ kasan_alloc_pages(element, (unsigned long)pool->pool_data, false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
diff --git a/mm/memremap.c b/mm/memremap.c
index 7aa7d6e80ee5..15a074ffb8d7 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
#include <linux/device.h>
#include <linux/io.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..47df0df8f21a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1617,7 +1617,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+ new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
if (new_page && PageTransHuge(new_page))
prep_transhuge_page(new_page);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8e02e865cc65..9ddaf0e1b0ab 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -19,10 +19,6 @@
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
-#ifndef SECTIONS_SHIFT
-#define SECTIONS_SHIFT 0
-#endif
-
/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d96a21acb2f..347ef9b83bb5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3409,14 +3409,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma)
return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}
-static int special_mapping_mremap(struct vm_area_struct *new_vma,
- unsigned long flags)
+static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
struct vm_special_mapping *sm = new_vma->vm_private_data;
- if (flags & MREMAP_DONTUNMAP)
- return -EINVAL;
-
if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
return -EFAULT;
diff --git a/mm/mremap.c b/mm/mremap.c
index ec8f840399ed..d22629ff8f3c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (moved_len < old_len) {
err = -ENOMEM;
} else if (vma->vm_ops && vma->vm_ops->mremap) {
- err = vma->vm_ops->mremap(new_vma, flags);
+ err = vma->vm_ops->mremap(new_vma);
}
if (unlikely(err)) {
@@ -653,8 +653,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}
- if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
- vma->vm_flags & VM_SHARED))
+ if ((flags & MREMAP_DONTUNMAP) &&
+ (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);
if (is_vm_hugetlb_page(vma))
diff --git a/mm/msync.c b/mm/msync.c
index 69c6d2029531..137d1c104f3e 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
- * just ignore them, but return -ENOMEM at the end.
+ * just ignore them, but return -ENOMEM at the end. Besides, if the
+ * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
+ * anyway and there is nothing left to do, so return immediately.
*/
mmap_read_lock(mm);
vma = find_vma(mm, start);
@@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
+ if (flags == MS_ASYNC)
+ goto out_unlock;
start = vma->vm_start;
if (start >= end)
goto out_unlock;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9e35b636a393..5e761fb62800 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
int ret;
- memcg = lock_page_memcg(page);
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- dec_lruvec_state(lruvec, NR_WRITEBACK);
+ dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
- __unlock_page_memcg(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f19bf948db..6b208b1843bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,7 +72,6 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
-
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -108,6 +107,17 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/*
+ * Don't poison memory with KASAN (only for the tag-based modes).
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
+ * Poisoning all that memory lengthens boot time, especially on systems with
+ * large amount of RAM. This flag is used to skip that poisoning.
+ * This is only done for the tag-based KASAN modes, as those are able to
+ * detect memory corruptions with the memory tags assigned by default.
+ * All memory allocated normally after boot gets poisoned as usual.
+ */
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
@@ -384,10 +394,15 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+static inline void kasan_free_nondeferred_pages(struct page *page, int order,
+ bool init, fpi_t fpi_flags)
{
- if (!static_branch_unlikely(&deferred_pages))
- kasan_free_pages(page, order);
+ if (static_branch_unlikely(&deferred_pages))
+ return;
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON))
+ return;
+ kasan_free_pages(page, order, init);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -438,7 +453,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+static inline void kasan_free_nondeferred_pages(struct page *page, int order,
+ bool init, fpi_t fpi_flags)
+{
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON))
+ return;
+ kasan_free_pages(page, order, init);
+}
static inline bool early_page_uninitialised(unsigned long pfn)
{
@@ -764,32 +786,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
*/
void init_mem_debugging_and_hardening(void)
{
+ bool page_poisoning_requested = false;
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled())) {
+ static_branch_enable(&_page_poisoning_enabled);
+ page_poisoning_requested = true;
+ }
+#endif
+
if (_init_on_alloc_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
-#ifdef CONFIG_PAGE_POISONING
- /*
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- if (page_poisoning_enabled() ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled()))
- static_branch_enable(&_page_poisoning_enabled);
-#endif
-
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -1103,7 +1129,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page_memcg(page) |
+ page->memcg_data |
#endif
(page->flags & check_flags)))
return false;
@@ -1128,7 +1154,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page_memcg(page)))
+ if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@@ -1216,9 +1242,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free)
+ unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
+ bool init;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1276,16 +1303,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- if (want_init_on_free())
- kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order);
/*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_free_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ *
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- kasan_free_nondeferred_pages(page, order);
+ init = want_init_on_free();
+ if (init && !kasan_has_integrated_init())
+ kernel_init_free_pages(page, 1 << order);
+ kasan_free_nondeferred_pages(page, order, init, fpi_flags);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1307,7 +1339,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
*/
static bool free_pcp_prepare(struct page *page)
{
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, 0, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1327,9 +1359,9 @@ static bool bulkfree_pcp_prepare(struct page *page)
static bool free_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, 0, true, FPI_NONE);
else
- return free_pages_prepare(page, 0, false);
+ return free_pages_prepare(page, 0, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1537,7 +1569,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
int migratetype;
unsigned long pfn = page_to_pfn(page);
- if (!free_pages_prepare(page, order, true))
+ if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1574,7 +1606,7 @@ void __free_pages_core(struct page *page, unsigned int order)
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
- __free_pages_ok(page, order, FPI_TO_TAIL);
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2292,17 +2324,32 @@ static bool check_new_pages(struct page *page, unsigned int order)
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
+ bool init;
+
set_page_private(page, 0);
set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
- kasan_alloc_pages(page, order);
+
+ /*
+ * Page unpoisoning must happen before memory initialization.
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+ * allocations and the page unpoisoning code will complain.
+ */
kernel_unpoison_pages(page, 1 << order);
- set_page_owner(page, order, gfp_flags);
- if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_alloc_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+ kasan_alloc_pages(page, order, init);
+ if (init && !kasan_has_integrated_init())
kernel_init_free_pages(page, 1 << order);
+
+ set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -2386,19 +2433,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
+ unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int *num_movable)
{
struct page *page;
+ unsigned long pfn;
unsigned int order;
int pages_moved = 0;
- for (page = start_page; page <= end_page;) {
- if (!pfn_valid_within(page_to_pfn(page))) {
- page++;
+ for (pfn = start_pfn; pfn <= end_pfn;) {
+ if (!pfn_valid_within(pfn)) {
+ pfn++;
continue;
}
+ page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2408,8 +2457,7 @@ static int move_freepages(struct zone *zone,
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
-
- page++;
+ pfn++;
continue;
}
@@ -2419,7 +2467,7 @@ static int move_freepages(struct zone *zone,
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
- page += 1 << order;
+ pfn += 1 << order;
pages_moved += 1 << order;
}
@@ -2429,25 +2477,22 @@ static int move_freepages(struct zone *zone,
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
- unsigned long start_pfn, end_pfn;
- struct page *start_page, *end_page;
+ unsigned long start_pfn, end_pfn, pfn;
if (num_movable)
*num_movable = 0;
- start_pfn = page_to_pfn(page);
- start_pfn = start_pfn & ~(pageblock_nr_pages-1);
- start_page = pfn_to_page(start_pfn);
- end_page = start_page + pageblock_nr_pages - 1;
+ pfn = page_to_pfn(page);
+ start_pfn = pfn & ~(pageblock_nr_pages - 1);
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
- start_page = page;
+ start_pfn = pfn;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype,
+ return move_freepages(zone, start_pfn, end_pfn, migratetype,
num_movable);
}
@@ -2908,7 +2953,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
- int i, alloced = 0;
+ int i, allocated = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -2931,7 +2976,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
- alloced++;
+ allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
@@ -2940,12 +2985,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
- * on i. Do not confuse with 'alloced' which is the number of
+ * on i. Do not confuse with 'allocated' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
- return alloced;
+ return allocated;
}
#ifdef CONFIG_NUMA
@@ -3415,7 +3460,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
}
/* Remove page from the per-cpu list, caller must protect the list */
-static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+static inline
+struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
@@ -4921,7 +4967,7 @@ got_pg:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
- struct alloc_context *ac, gfp_t *alloc_mask,
+ struct alloc_context *ac, gfp_t *alloc_gfp,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
@@ -4930,7 +4976,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
- *alloc_mask |= __GFP_HARDWALL;
+ *alloc_gfp |= __GFP_HARDWALL;
/*
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node ok.
@@ -4966,15 +5012,160 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list or array
+ * @page_list: Optional list to store the allocated pages
+ * @page_array: Optional array to store the pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly. Pages are added to page_list if page_list
+ * is not NULL, otherwise it is assumed that the page_array is valid.
+ *
+ * For lists, nr_pages is the number of pages that should be allocated.
+ *
+ * For arrays, only NULL elements are populated with pages and nr_pages
+ * is the maximum number of pages that will be stored in the array.
+ *
+ * Returns the number of pages on the list or array.
+ */
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+ nodemask_t *nodemask, int nr_pages,
+ struct list_head *page_list,
+ struct page **page_array)
+{
+ struct page *page;
+ unsigned long flags;
+ struct zone *zone;
+ struct zoneref *z;
+ struct per_cpu_pages *pcp;
+ struct list_head *pcp_list;
+ struct alloc_context ac;
+ gfp_t alloc_gfp;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ int nr_populated = 0;
+
+ if (unlikely(nr_pages <= 0))
+ return 0;
+
+ /*
+ * Skip populated array elements to determine if any pages need
+ * to be allocated before disabling IRQs.
+ */
+ while (page_array && page_array[nr_populated] && nr_populated < nr_pages)
+ nr_populated++;
+
+ /* Use the single page allocator for one page. */
+ if (nr_pages - nr_populated == 1)
+ goto failed;
+
+ /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+ gfp &= gfp_allowed_mask;
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ return 0;
+ gfp = alloc_gfp;
+
+ /* Find an allowed local zone that meets the low watermark. */
+ for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ unsigned long mark;
+
+ if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp)) {
+ continue;
+ }
+
+ if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+ zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+ goto failed;
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+ if (zone_watermark_fast(zone, 0, mark,
+ zonelist_zone_idx(ac.preferred_zoneref),
+ alloc_flags, gfp)) {
+ break;
+ }
+ }
+
+ /*
+ * If there are no allowed local zones that meets the watermarks then
+ * try to allocate a single page and reclaim if necessary.
+ */
+ if (unlikely(!zone))
+ goto failed;
+
+ /* Attempt the batch allocation */
+ local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ pcp_list = &pcp->lists[ac.migratetype];
+
+ while (nr_populated < nr_pages) {
+
+ /* Skip existing pages */
+ if (page_array && page_array[nr_populated]) {
+ nr_populated++;
+ continue;
+ }
+
+ page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+ pcp, pcp_list);
+ if (unlikely(!page)) {
+ /* Try and get at least one page */
+ if (!nr_populated)
+ goto failed_irq;
+ break;
+ }
+
+ /*
+ * Ideally this would be batched but the best way to do
+ * that cheaply is to first convert zone_statistics to
+ * be inaccurate per-cpu counter like vm_events to avoid
+ * a RMW cycle then do the accounting with IRQs enabled.
+ */
+ __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
+ zone_statistics(ac.preferred_zoneref->zone, zone);
+
+ prep_new_page(page, 0, gfp, 0);
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ local_irq_restore(flags);
+
+ return nr_populated;
+
+failed_irq:
+ local_irq_restore(flags);
+
+failed:
+ page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+ if (page) {
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ return nr_populated;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
@@ -4982,23 +5173,24 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
return NULL;
}
- gfp_mask &= gfp_allowed_mask;
- alloc_mask = gfp_mask;
- if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
+ gfp &= gfp_allowed_mask;
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
+ &alloc_gfp, &alloc_flags))
return NULL;
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
- alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
/* First allocation attempt */
- page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;
@@ -5008,7 +5200,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
- alloc_mask = current_gfp_context(gfp_mask);
+ alloc_gfp = current_gfp_context(gfp);
ac.spread_dirty_pages = false;
/*
@@ -5017,20 +5209,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
*/
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+ page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
out:
- if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
+ if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
+ unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
return page;
}
-EXPORT_SYMBOL(__alloc_pages_nodemask);
+EXPORT_SYMBOL(__alloc_pages);
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
@@ -7689,7 +7881,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-void __init mem_init_print_info(const char *str)
+void __init mem_init_print_info(void)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size;
@@ -7728,17 +7920,17 @@ void __init mem_init_print_info(const char *str)
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
- "%s%s)\n",
+ ")\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
- totalcma_pages << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10)
#ifdef CONFIG_HIGHMEM
- totalhigh_pages() << (PAGE_SHIFT - 10),
+ , totalhigh_pages() << (PAGE_SHIFT - 10)
#endif
- str ? ", " : "", str ? str : "");
+ );
}
/**
@@ -8222,6 +8414,7 @@ void *__init alloc_large_system_hash(const char *tablename,
void *table = NULL;
gfp_t gfp_flags;
bool virt;
+ bool huge;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -8289,6 +8482,7 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
+ huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -8305,7 +8499,7 @@ void *__init alloc_large_system_hash(const char *tablename,
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- virt ? "vmalloc" : "linear");
+ virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
@@ -8450,6 +8644,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
pageblock_nr_pages));
}
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
+static void alloc_contig_dump_pages(struct list_head *page_list)
+{
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
+
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
+ struct page *page;
+
+ dump_stack();
+ list_for_each_entry(page, page_list, lru)
+ dump_page(page, "migration failure");
+ }
+}
+#else
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
+{
+}
+#endif
+
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
@@ -8493,6 +8708,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
}
if (ret < 0) {
+ alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret;
}
@@ -8602,8 +8818,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* isolated thus they won't get removed from buddy.
*/
- lru_add_drain_all();
-
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
@@ -8629,8 +8843,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0)) {
- pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
- __func__, outer_start, end);
ret = -EBUSY;
goto done;
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c6860f51b6c6..7d83641eb86b 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -52,9 +52,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_protected_usage(counter, new);
/* More uncharges than charges? */
- WARN_ON_ONCE(new < 0);
+ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
+ new, nr_pages)) {
+ new = 0;
+ atomic_long_set(&counter->usage, new);
+ }
+ propagate_protected_usage(counter, new);
}
/**
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d15c7c4994f5..9661d5320a07 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -27,6 +27,7 @@ struct page_owner {
depot_stack_handle_t handle;
depot_stack_handle_t free_handle;
u64 ts_nsec;
+ u64 free_ts_nsec;
pid_t pid;
};
@@ -41,13 +42,7 @@ static void init_early_allocated_pages(void);
static int __init early_page_owner_param(char *buf)
{
- if (!buf)
- return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- page_owner_enabled = true;
-
- return 0;
+ return kstrtobool(buf, &page_owner_enabled);
}
early_param("page_owner", early_page_owner_param);
@@ -103,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
return (void *)page_ext + page_owner_ops.offset;
}
-static inline bool check_recursive_alloc(unsigned long *entries,
- unsigned int nr_entries,
- unsigned long ip)
-{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (entries[i] == ip)
- return true;
- }
- return false;
-}
-
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
depot_stack_handle_t handle;
unsigned int nr_entries;
- nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
-
/*
- * We need to check recursion here because our request to
- * stackdepot could trigger memory allocation to save new
- * entry. New memory allocation would reach here and call
- * stack_depot_save_entries() again if we don't catch it. There is
- * still not enough memory in stackdepot so it would try to
- * allocate memory again and loop forever.
+ * Avoid recursion.
+ *
+ * Sometimes page metadata allocation tracking requires more
+ * memory to be allocated:
+ * - when new stack trace is saved to stack depot
+ * - when backtrace itself is calculated (ia64)
*/
- if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
+ if (current->in_page_owner)
return dummy_handle;
+ current->in_page_owner = 1;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
+ current->in_page_owner = 0;
return handle;
}
@@ -146,25 +129,27 @@ void __reset_page_owner(struct page *page, unsigned int order)
{
int i;
struct page_ext *page_ext;
- depot_stack_handle_t handle = 0;
+ depot_stack_handle_t handle;
struct page_owner *page_owner;
-
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ u64 free_ts_nsec = local_clock();
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
+
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
for (i = 0; i < (1 << order); i++) {
__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
page_owner = get_page_owner(page_ext);
page_owner->free_handle = handle;
+ page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
}
-static inline void __set_page_owner_handle(struct page *page,
- struct page_ext *page_ext, depot_stack_handle_t handle,
- unsigned int order, gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
{
struct page_owner *page_owner;
int i;
@@ -194,7 +179,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -243,6 +228,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->handle = old_page_owner->handle;
new_page_owner->pid = old_page_owner->pid;
new_page_owner->ts_nsec = old_page_owner->ts_nsec;
+ new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
/*
* We don't clear the bit on the oldpage as it's going to be freed
@@ -356,10 +342,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n",
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
- page_owner->ts_nsec);
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
if (ret >= count)
goto err;
@@ -435,9 +421,9 @@ void __dump_page_owner(struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n",
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
- page_owner->pid, page_owner->ts_nsec);
+ page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle) {
@@ -612,7 +598,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/* Found early allocated page */
- __set_page_owner_handle(page, page_ext, early_handle,
+ __set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 655dc5895604..98438985e1ed 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/mmdebug.h>
#include <linux/highmem.h>
#include <linux/page_ext.h>
#include <linux/poison.h>
@@ -45,7 +46,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
return error && !(error & (error - 1));
}
-static void check_poison_mem(unsigned char *mem, size_t bytes)
+static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes)
{
static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
unsigned char *start;
@@ -70,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
dump_stack();
+ dump_page(page, "pagealloc: corrupted page details");
}
static void unpoison_page(struct page *page)
@@ -83,7 +85,7 @@ static void unpoison_page(struct page *page)
* that is freed to buddy. Thus no extra check is done to
* see if a page was poisoned.
*/
- check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE);
+ check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE);
kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..8d3844bc0c7c 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -8,6 +8,7 @@
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
+#include "internal.h"
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
@@ -133,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
- unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+ vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}
/**
@@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
- return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- PAGE_KERNEL, pages);
+ return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
+ PAGE_KERNEL, pages, PAGE_SHIFT);
}
/**
diff --git a/mm/slab.c b/mm/slab.c
index 4e212cda8693..df45c437b394 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
void *ptr;
int slab_node = numa_mem_id();
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
- memset(ptr, 0, cachep->object_size);
+ init = slab_want_init_on_alloc(flags, cachep);
out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
return ptr;
}
@@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
unsigned long save_flags;
void *objp;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
- memset(objp, 0, cachep->object_size);
+ init = slab_want_init_on_alloc(flags, cachep);
out:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
return objp;
}
@@ -3427,17 +3425,24 @@ free_done:
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
+ bool init;
+
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
__kfence_free(objp);
return;
}
- if (unlikely(slab_want_init_on_free(cachep)))
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ init = slab_want_init_on_free(cachep);
+ if (init && !kasan_has_integrated_init())
memset(objp, 0, cachep->object_size);
-
- /* Put the object into the quarantine, don't touch it for now. */
- if (kasan_slab_free(cachep, objp))
+ /* KASAN might put objp into memory quarantine, delaying its reuse. */
+ if (kasan_slab_free(cachep, objp, init))
return;
/* Use KCSAN to help debug racy use-after-free. */
@@ -3542,18 +3547,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
- /* Clear memory outside IRQ disabled section */
- if (unlikely(slab_want_init_on_alloc(flags, s)))
- for (i = 0; i < size; i++)
- memset(p[i], 0, s->object_size);
-
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled section.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, objcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
diff --git a/mm/slab.h b/mm/slab.h
index c30ed35b3d5d..18c1927cd196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -506,15 +506,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
}
static inline void slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
- gfp_t flags, size_t size, void **p)
+ struct obj_cgroup *objcg, gfp_t flags,
+ size_t size, void **p, bool init)
{
size_t i;
flags &= gfp_allowed_mask;
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_alloc and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
+ */
for (i = 0; i < size; i++) {
- p[i] = kasan_slab_alloc(s, p[i], flags);
- /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ p[i] = kasan_slab_alloc(s, p[i], flags, init);
+ if (p[i] && init && !kasan_has_integrated_init())
+ memset(p[i], 0, s->object_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4c6107e39f9a..f8833d3e5d47 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -71,11 +71,19 @@ static int __init setup_slab_nomerge(char *str)
return 1;
}
+static int __init setup_slab_merge(char *str)
+{
+ slab_nomerge = false;
+ return 1;
+}
+
#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
+__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
#endif
__setup("slab_nomerge", setup_slab_nomerge);
+__setup("slab_merge", setup_slab_merge);
/*
* Determine the size of a slab object
diff --git a/mm/slub.c b/mm/slub.c
index 722f95e1ea0b..68123b21e65f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3,7 +3,7 @@
* SLUB: A slab allocator that limits cache line use instead of queuing
* objects in per cpu and per node lists.
*
- * The allocator synchronizes using per slab locks or atomic operatios
+ * The allocator synchronizes using per slab locks or atomic operations
* and only uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
@@ -160,7 +160,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#undef SLUB_DEBUG_CMPXCHG
/*
- * Mininum number of partial slabs. These will be left on the partial
+ * Minimum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
*/
#define MIN_PARTIAL 5
@@ -833,7 +833,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at mininum
+ * C. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -1533,7 +1533,8 @@ static __always_inline void kfree_hook(void *x)
kasan_kfree_large(x);
}
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
+ void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
@@ -1559,8 +1560,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
__kcsan_check_access(x, s->object_size,
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
- /* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x);
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset's must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * The initialization memset's clear the object and the metadata,
+ * but don't touch the SLAB redzone.
+ */
+ if (init) {
+ int rsize;
+
+ if (!kasan_has_integrated_init())
+ memset(kasan_reset_tag(x), 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
+ memset((char *)kasan_reset_tag(x) + s->inuse, 0,
+ s->size - s->inuse - rsize);
+ }
+ /* KASAN might put x into memory quarantine, delaying its reuse. */
+ return kasan_slab_free(s, x, init);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1570,10 +1588,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
- int rsize;
if (is_kfence_address(next)) {
- slab_free_hook(s, next);
+ slab_free_hook(s, next, false);
return true;
}
@@ -1585,20 +1602,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
object = next;
next = get_freepointer(s, object);
- if (slab_want_init_on_free(s)) {
- /*
- * Clear the object and the metadata, but don't touch
- * the redzone.
- */
- memset(kasan_reset_tag(object), 0, s->object_size);
- rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
- : 0;
- memset((char *)kasan_reset_tag(object) + s->inuse, 0,
- s->size - s->inuse - rsize);
-
- }
/* If object's reuse doesn't have to be delayed */
- if (!slab_free_hook(s, object)) {
+ if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
/* Move object to the new freelist */
set_freepointer(s, object, *head);
*head = object;
@@ -2823,6 +2828,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page;
unsigned long tid;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
@@ -2900,12 +2906,10 @@ redo:
}
maybe_wipe_obj_freeptr(s, object);
-
- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(kasan_reset_tag(object), 0, s->object_size);
+ init = slab_want_init_on_alloc(gfpflags, s);
out:
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
return object;
}
@@ -3237,7 +3241,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
}
if (is_kfence_address(object)) {
- slab_free_hook(df->s, object);
+ slab_free_hook(df->s, object, false);
__kfence_free(object);
p[size] = NULL; /* mark object processed */
return size;
@@ -3357,20 +3361,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c->tid = next_tid(c->tid);
local_irq_enable();
- /* Clear memory outside IRQ disabled fastpath loop */
- if (unlikely(slab_want_init_on_alloc(flags, s))) {
- int j;
-
- for (j = 0; j < i; j++)
- memset(kasan_reset_tag(p[j]), 0, s->object_size);
- }
-
- /* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, objcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3422,7 +3422,7 @@ static unsigned int slub_min_objects;
*
* Higher order allocations also allow the placement of more objects in a
* slab and thereby reduce object handling overhead. If the user has
- * requested a higher mininum order then we start with that one instead of
+ * requested a higher minimum order then we start with that one instead of
* the smallest order which will fit the object.
*/
static inline unsigned int slab_order(unsigned int size,
@@ -3580,7 +3580,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL);
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse = 1;
page->frozen = 0;
@@ -3828,6 +3828,15 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * If no slub_debug was enabled globally, the static key is not yet
+ * enabled by setup_slub_debug(). Enable it if the cache is being
+ * created with any of the debugging flags passed explicitly.
+ */
+ if (flags & SLAB_DEBUG_FLAGS)
+ static_branch_enable(&slub_debug_enabled);
+#endif
s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
diff --git a/mm/sparse.c b/mm/sparse.c
index 7bd23f9d6cef..33406ea2ecc4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -547,6 +547,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
+ sparse_buffer_fini();
goto failed;
}
check_usemap_section_nr(nid, usage);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3cdee7b11da9..fb7efa08fe57 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageLocked(page);
__SetPageSwapBacked(page);
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
- put_swap_page(page, entry);
+ if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
goto fail_unlock;
- }
- if (mem_cgroup_charge(page, NULL, gfp_mask)) {
- delete_from_swap_cache(page);
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- }
+
+ mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
workingset_refault(page, shadow);
@@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return page;
fail_unlock:
+ put_swap_page(page, entry);
unlock_page(page);
put_page(page);
return NULL;
diff --git a/mm/util.c b/mm/util.c
index c37e24d5fa43..083c5c417cfc 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page)
}
EXPORT_SYMBOL(page_mapping);
-/*
- * For file cache pages, return the address_space, otherwise return NULL
- */
-struct address_space *page_mapping_file(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return NULL;
- return page_mapping(page);
-}
-
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d5f2a84e488a..d33894d7b27a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,7 +34,7 @@
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
-
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -42,6 +42,19 @@
#include "internal.h"
#include "pgalloc-track.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x;
@@ -68,6 +81,218 @@ static void free_work(struct work_struct *w)
}
/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+ flush_cache_vmap(addr, end);
+
+ return err;
+}
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
@@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
} while (p4d++, addr = next, addr != end);
}
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @start: start of the VM area to unmap
- * @size: size of the VM area to unmap
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
*
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
- * should have been allocated using get_vm_area() and its friends.
+ * The caller is responsible for calling flush_cache_vmap() before calling
+ * this function, and flush_tlb_kernel_range after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
*
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible
- * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
- * function and flush_tlb_kernel_range() after.
+ * This is an internal function only. Do not use outside mm/.
*/
-void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+void vunmap_range_noflush(unsigned long start, unsigned long end)
{
- unsigned long end = start + size;
unsigned long next;
pgd_t *pgd;
unsigned long addr = start;
@@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
arch_sync_kernel_mappings(start, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+ flush_cache_vunmap(addr, end);
+ vunmap_range_noflush(addr, end);
+ flush_tlb_kernel_range(addr, end);
+}
+
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
- unsigned long end = addr + size;
- unsigned long next;
pgd_t *pgd;
+ unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
@@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
return 0;
}
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
- struct page **pages)
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int ret;
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+ page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
+
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ __pa(page_address(pages[i])), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+/**
+ * vmap_pages_range - map pages to a kernel virtual address
+ * @addr: start of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
+ * @prot: page protection flags to use
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int err;
- ret = map_kernel_range_noflush(start, size, prot, pages);
- flush_cache_vmap(start, start + size);
- return ret;
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ flush_cache_vmap(addr, end);
+ return err;
}
int is_vmalloc_or_module_addr(const void *x)
@@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
@@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd))
return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
- pud = pud_offset(p4d, addr);
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
- /*
- * Don't dereference bad PUD or PMD (below) entries. This will also
- * identify huge mappings, which we may encounter on architectures
- * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
- * identified as vmalloc addresses by is_vmalloc_addr(), but are
- * not [unambiguously] associated with a struct page, so there is
- * no correct value to return for them.
- */
- WARN_ON_ONCE(pud_bad(*pud));
- if (pud_none(*pud) || pud_bad(*pud))
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
+ return NULL;
+
pmd = pmd_offset(pud, addr);
- WARN_ON_ONCE(pmd_bad(*pmd));
- if (pmd_none(*pmd) || pmd_bad(*pmd))
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pte_present(pte))
page = pte_page(pte);
pte_unmap(ptep);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
@@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va)
spin_unlock(&free_vmap_area_lock);
}
+static inline void
+preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
+{
+ struct vmap_area *va = NULL;
+
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. It guarantees that
+ * a CPU that does an allocation is preloaded.
+ *
+ * We do it in non-atomic context, thus it allows us to use more
+ * permissive allocation masks to be more stable under low memory
+ * condition and high memory pressure.
+ */
+ if (!this_cpu_read(ne_fit_preload_node))
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(lock);
+
+ if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+ kmem_cache_free(vmap_area_cachep, va);
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
- struct vmap_area *va, *pva;
+ struct vmap_area *va;
unsigned long addr;
int purged = 0;
int ret;
@@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
- /*
- * Preload this CPU with one extra vmap_area object. It is used
- * when fit type of free area is NE_FIT_TYPE. Please note, it
- * does not guarantee that an allocation occurs on a CPU that
- * is preloaded, instead we minimize the case when it is not.
- * It can happen because of cpu migration, because there is a
- * race until the below spinlock is taken.
- *
- * The preload is done in non-atomic context, thus it allows us
- * to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure. In rare case,
- * if not preloaded, GFP_NOWAIT is used.
- *
- * Set "pva" to NULL here, because of "retry" path.
- */
- pva = NULL;
-
- if (!this_cpu_read(ne_fit_preload_node))
- /*
- * Even if it fails we do not really care about that.
- * Just proceed as it is. If needed "overflow" path
- * will refill the cache we allocate from.
- */
- pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
-
- spin_lock(&free_vmap_area_lock);
-
- if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
- kmem_cache_free(vmap_area_cachep, pva);
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
- addr = __alloc_vmap_area(size, align, vstart, vend);
- spin_unlock(&free_vmap_area_lock);
-
if (unlikely(addr == vend))
goto overflow;
@@ -1231,7 +1503,6 @@ retry:
va->va_end = addr + size;
va->vm = NULL;
-
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+ vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size)
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- unmap_kernel_range_noflush(addr, size);
+ vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
@@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
spin_lock(&vb->lock);
- if (vb->dirty) {
+ if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
kasan_unpoison_vmalloc(mem, size);
- if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+ pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return vm->page_order;
+#else
+ return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ vm->page_order = order;
+#else
+ BUG_ON(order != 0);
+#endif
+}
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -2023,23 +2314,6 @@ void __init vmalloc_init(void)
vmap_initialized = true;
}
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
- unsigned long end = addr + size;
-
- flush_cache_vunmap(addr, end);
- unmap_kernel_range_noflush(addr, size);
- flush_tlb_kernel_range(addr, end);
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2199,6 +2473,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
@@ -2208,6 +2483,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long start = ULONG_MAX, end = 0;
+ unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
@@ -2232,11 +2508,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
+ unsigned long page_size;
+
+ page_size = PAGE_SIZE << page_order;
start = min(addr, start);
- end = max(addr + PAGE_SIZE, end);
+ end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@@ -2277,13 +2556,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
+ unsigned int page_order = vm_area_page_order(area);
int i;
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, 0);
+ __free_pages(page, page_order);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2402,6 +2682,7 @@ void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
+ unsigned long addr;
unsigned long size; /* In bytes */
might_sleep();
@@ -2414,8 +2695,9 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
- pages) < 0) {
+ addr = (unsigned long)area->addr;
+ if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+ pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2474,15 +2756,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
unsigned long array_size;
- unsigned int i;
+ unsigned int nr_small_pages = size >> PAGE_SHIFT;
+ unsigned int page_order;
struct page **pages;
+ unsigned int i;
- array_size = (unsigned long)nr_pages * sizeof(struct page *);
+ array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -2497,42 +2783,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (!pages) {
free_vm_area(area);
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "page array size %lu allocation failed",
+ nr_small_pages * PAGE_SIZE, array_size);
return NULL;
}
area->pages = pages;
- area->nr_pages = nr_pages;
+ area->nr_pages = nr_small_pages;
+ set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;
+ page_order = vm_area_page_order(area);
- if (node == NUMA_NO_NODE)
- page = alloc_page(gfp_mask);
- else
- page = alloc_pages_node(node, gfp_mask, 0);
+ /*
+ * Careful, we allocate and map page_order pages, but tracking is done
+ * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
+ * the physical/mapped size.
+ */
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ struct page *page;
+ int p;
+ /* Compound pages required for remap_vmalloc_page */
+ page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "page order %u allocation failed",
+ area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}
- area->pages[i] = page;
+
+ for (p = 0; p < (1U << page_order); p++)
+ area->pages[i + p] = page + p;
+
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
- prot, pages) < 0)
+ if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "failed to map pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
+ }
return area->addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
__vfree(area->addr);
return NULL;
}
@@ -2563,19 +2867,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages())
- goto fail;
+ if (WARN_ON_ONCE(!size))
+ return NULL;
+
+ if ((size >> PAGE_SHIFT) > totalram_pages()) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "exceeds total pages", real_size);
+ return NULL;
+ }
- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+ if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
+ arch_vmap_pmd_supported(prot)) {
+ unsigned long size_per_node;
+
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations,
+ * others like modules don't yet expect huge pages in
+ * their allocations due to apply_to_page_range not
+ * supporting them.
+ */
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (size_per_node >= PMD_SIZE) {
+ shift = PMD_SHIFT;
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
+ }
+ }
+
+again:
+ size = PAGE_ALIGN(size);
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
- if (!area)
+ if (!area) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "vm_struct allocation failed", real_size);
goto fail;
+ }
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
- return NULL;
+ goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2589,8 +2928,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure: %lu bytes", real_size);
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ align = real_align;
+ size = real_size;
+ goto again;
+ }
+
return NULL;
}
@@ -2894,7 +3238,10 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
+ va = __find_vmap_area((unsigned long)addr);
+ if (!va)
+ goto finished;
+ list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
@@ -3072,7 +3419,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
* remap_vmalloc_range - map vmalloc pages to userspace