From f07745325cbd93afc5d8bcf7539a063d33134075 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: xen: define gnttab_set_map_op/unmap_op Impact: hypercall definitions These functions populate the gnttab data structures used by the granttab map and unmap ops and are used in the backend drivers. Originally xen-unstable.hg 9625:c3bb51c443a7 [ Include Stefano's fix for phys_addr_t ] Signed-off-by: Ian Campbell Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/grant_table.h | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 9a731706a016..1821aa103d82 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -37,10 +37,16 @@ #ifndef __ASM_GNTTAB_H__ #define __ASM_GNTTAB_H__ -#include +#include + +#include #include + +#include #include +#include + /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ #define NR_GRANT_FRAMES 4 @@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); +static inline void +gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, + uint32_t flags, grant_ref_t ref, domid_t domid) +{ + if (flags & GNTMAP_contains_pte) + map->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + map->host_addr = __pa(addr); + else + map->host_addr = addr; + + map->flags = flags; + map->ref = ref; + map->dom = domid; +} + +static inline void +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, + uint32_t flags, grant_handle_t handle) +{ + if (flags & GNTMAP_contains_pte) + unmap->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + unmap->host_addr = __pa(addr); + else + unmap->host_addr = addr; + + unmap->handle = handle; + unmap->dev_bus_addr = 0; +} + int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, struct grant_entry **__shared); -- cgit From ab31523c2fcac557226bac72cbdf5fafe01f9a26 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 14 Dec 2010 18:40:46 +0000 Subject: xen/gntdev: allow usermode to map granted pages The gntdev driver allows usermode to map granted pages from other domains. This is typically used to implement a Xen backend driver in user mode. Signed-off-by: Gerd Hoffmann Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Kconfig | 7 + drivers/xen/Makefile | 2 + drivers/xen/gntdev.c | 635 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/xen/gntdev.h | 119 ++++++++++ 4 files changed, 763 insertions(+) create mode 100644 drivers/xen/gntdev.c create mode 100644 include/xen/gntdev.h diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 6e6180ccd726..bd3095f0dff2 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -62,6 +62,13 @@ config XEN_SYS_HYPERVISOR virtual environment, /sys/hypervisor will still be present, but will have no xen contents. +config XEN_GNTDEV + tristate "userspace grant access device driver" + depends on XEN + select MMU_NOTIFIER + help + Allows userspace processes to use grants. + config XEN_PLATFORM_PCI tristate "xen platform pci device driver" depends on XEN_PVHVM diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 533a199e7a3f..674fdb5fee49 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o @@ -16,4 +17,5 @@ obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o +xen-gntdev-y := gntdev.o diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c new file mode 100644 index 000000000000..cfe063372aa4 --- /dev/null +++ b/drivers/xen/gntdev.c @@ -0,0 +1,635 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * (c) 2009 Gerd Hoffmann + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Derek G. Murray , " + "Gerd Hoffmann "); +MODULE_DESCRIPTION("User-space granted page access driver"); + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at " + "once by a gntdev instance"); + +struct gntdev_priv { + struct list_head maps; + uint32_t used; + uint32_t limit; + /* lock protects maps from concurrent changes */ + spinlock_t lock; + struct mm_struct *mm; + struct mmu_notifier mn; +}; + +struct grant_map { + struct list_head next; + struct gntdev_priv *priv; + struct vm_area_struct *vma; + int index; + int count; + int flags; + int is_mapped; + struct ioctl_gntdev_grant_ref *grants; + struct gnttab_map_grant_ref *map_ops; + struct gnttab_unmap_grant_ref *unmap_ops; +}; + +/* ------------------------------------------------------------------ */ + +static void gntdev_print_maps(struct gntdev_priv *priv, + char *text, int text_index) +{ +#ifdef DEBUG + struct grant_map *map; + + pr_debug("maps list (priv %p, usage %d/%d)\n", + priv, priv->used, priv->limit); + + list_for_each_entry(map, &priv->maps, next) + pr_debug(" index %2d, count %2d %s\n", + map->index, map->count, + map->index == text_index && text ? text : ""); +#endif +} + +static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +{ + struct grant_map *add; + + add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); + if (NULL == add) + return NULL; + + add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); + add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); + add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops) + goto err; + + add->index = 0; + add->count = count; + add->priv = priv; + + if (add->count + priv->used > priv->limit) + goto err; + + return add; + +err: + kfree(add->grants); + kfree(add->map_ops); + kfree(add->unmap_ops); + kfree(add); + return NULL; +} + +static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (add->index + add->count < map->index) { + list_add_tail(&add->next, &map->next); + goto done; + } + add->index = map->index + map->count; + } + list_add_tail(&add->next, &priv->maps); + +done: + priv->used += add->count; + gntdev_print_maps(priv, "[new]", add->index); +} + +static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, + int index, int count) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (map->index != index) + continue; + if (map->count != count) + continue; + return map; + } + return NULL; +} + +static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, + unsigned long vaddr) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (vaddr < map->vma->vm_start) + continue; + if (vaddr >= map->vma->vm_end) + continue; + return map; + } + return NULL; +} + +static int gntdev_del_map(struct grant_map *map) +{ + int i; + + if (map->vma) + return -EBUSY; + for (i = 0; i < map->count; i++) + if (map->unmap_ops[i].handle) + return -EBUSY; + + map->priv->used -= map->count; + list_del(&map->next); + return 0; +} + +static void gntdev_free_map(struct grant_map *map) +{ + if (!map) + return; + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map); +} + +/* ------------------------------------------------------------------ */ + +static int find_grant_ptes(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + struct grant_map *map = data; + unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + u64 pte_maddr; + + BUG_ON(pgnr >= map->count); + pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT; + pte_maddr += (unsigned long)pte & ~PAGE_MASK; + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags, + map->grants[pgnr].ref, + map->grants[pgnr].domid); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags, + 0 /* handle */); + return 0; +} + +static int map_grant_pages(struct grant_map *map) +{ + int i, err = 0; + + pr_debug("map %d+%d\n", map->index, map->count); + err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + map->map_ops, map->count); + if (err) + return err; + + for (i = 0; i < map->count; i++) { + if (map->map_ops[i].status) + err = -EINVAL; + map->unmap_ops[i].handle = map->map_ops[i].handle; + } + return err; +} + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int i, err = 0; + + pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + map->unmap_ops + offset, pages); + if (err) + return err; + + for (i = 0; i < pages; i++) { + if (map->unmap_ops[offset+i].status) + err = -EINVAL; + map->unmap_ops[offset+i].handle = 0; + } + return err; +} + +/* ------------------------------------------------------------------ */ + +static void gntdev_vma_close(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("close %p\n", vma); + map->is_mapped = 0; + map->vma = NULL; + vma->vm_private_data = NULL; +} + +static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n", + vmf->virtual_address, vmf->pgoff); + vmf->flags = VM_FAULT_ERROR; + return 0; +} + +static struct vm_operations_struct gntdev_vmops = { + .close = gntdev_vma_close, + .fault = gntdev_vma_fault, +}; + +/* ------------------------------------------------------------------ */ + +static void mn_invl_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + unsigned long mstart, mend; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + if (!map->is_mapped) + continue; + if (map->vma->vm_start >= end) + continue; + if (map->vma->vm_end <= start) + continue; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +static void mn_invl_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); +} + +static void mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + int err; + + spin_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } + spin_unlock(&priv->lock); +} + +struct mmu_notifier_ops gntdev_mmu_ops = { + .release = mn_release, + .invalidate_page = mn_invl_page, + .invalidate_range_start = mn_invl_range_start, +}; + +/* ------------------------------------------------------------------ */ + +static int gntdev_open(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv; + int ret = 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + INIT_LIST_HEAD(&priv->maps); + spin_lock_init(&priv->lock); + priv->limit = limit; + + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); + + if (ret) { + kfree(priv); + return ret; + } + + flip->private_data = priv; + pr_debug("priv %p\n", priv); + + return 0; +} + +static int gntdev_release(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv = flip->private_data; + struct grant_map *map; + int err; + + pr_debug("priv %p\n", priv); + + spin_lock(&priv->lock); + while (!list_empty(&priv->maps)) { + map = list_entry(priv->maps.next, struct grant_map, next); + err = gntdev_del_map(map); + if (WARN_ON(err)) + gntdev_free_map(map); + + } + spin_unlock(&priv->lock); + + mmu_notifier_unregister(&priv->mn, priv->mm); + kfree(priv); + return 0; +} + +static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_map_grant_ref __user *u) +{ + struct ioctl_gntdev_map_grant_ref op; + struct grant_map *map; + int err; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, add %d\n", priv, op.count); + if (unlikely(op.count <= 0)) + return -EINVAL; + if (unlikely(op.count > priv->limit)) + return -EINVAL; + + err = -ENOMEM; + map = gntdev_alloc_map(priv, op.count); + if (!map) + return err; + if (copy_from_user(map->grants, &u->refs, + sizeof(map->grants[0]) * op.count) != 0) { + gntdev_free_map(map); + return err; + } + + spin_lock(&priv->lock); + gntdev_add_map(priv, map); + op.index = map->index << PAGE_SHIFT; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) { + spin_lock(&priv->lock); + gntdev_del_map(map); + spin_unlock(&priv->lock); + gntdev_free_map(map); + return err; + } + return 0; +} + +static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, + struct ioctl_gntdev_unmap_grant_ref __user *u) +{ + struct ioctl_gntdev_unmap_grant_ref op; + struct grant_map *map; + int err = -ENOENT; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); + if (map) + err = gntdev_del_map(map); + spin_unlock(&priv->lock); + if (!err) + gntdev_free_map(map); + return err; +} + +static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, + struct ioctl_gntdev_get_offset_for_vaddr __user *u) +{ + struct ioctl_gntdev_get_offset_for_vaddr op; + struct grant_map *map; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); + + spin_lock(&priv->lock); + map = gntdev_find_map_vaddr(priv, op.vaddr); + if (map == NULL || + map->vma->vm_start != op.vaddr) { + spin_unlock(&priv->lock); + return -EINVAL; + } + op.offset = map->index << PAGE_SHIFT; + op.count = map->count; + spin_unlock(&priv->lock); + + if (copy_to_user(u, &op, sizeof(op)) != 0) + return -EFAULT; + return 0; +} + +static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, + struct ioctl_gntdev_set_max_grants __user *u) +{ + struct ioctl_gntdev_set_max_grants op; + + if (copy_from_user(&op, u, sizeof(op)) != 0) + return -EFAULT; + pr_debug("priv %p, limit %d\n", priv, op.count); + if (op.count > limit) + return -E2BIG; + + spin_lock(&priv->lock); + priv->limit = op.count; + spin_unlock(&priv->lock); + return 0; +} + +static long gntdev_ioctl(struct file *flip, + unsigned int cmd, unsigned long arg) +{ + struct gntdev_priv *priv = flip->private_data; + void __user *ptr = (void __user *)arg; + + switch (cmd) { + case IOCTL_GNTDEV_MAP_GRANT_REF: + return gntdev_ioctl_map_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_UNMAP_GRANT_REF: + return gntdev_ioctl_unmap_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: + return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); + + case IOCTL_GNTDEV_SET_MAX_GRANTS: + return gntdev_ioctl_set_max_grants(priv, ptr); + + default: + pr_debug("priv %p, unknown cmd %x\n", priv, cmd); + return -ENOIOCTLCMD; + } + + return 0; +} + +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) +{ + struct gntdev_priv *priv = flip->private_data; + int index = vma->vm_pgoff; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + struct grant_map *map; + int err = -EINVAL; + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + pr_debug("map %d+%d at %lx (pgoff %lx)\n", + index, count, vma->vm_start, vma->vm_pgoff); + + spin_lock(&priv->lock); + map = gntdev_find_map_index(priv, index, count); + if (!map) + goto unlock_out; + if (map->vma) + goto unlock_out; + if (priv->mm != vma->vm_mm) { + printk(KERN_WARNING "Huh? Other mm?\n"); + goto unlock_out; + } + + vma->vm_ops = &gntdev_vmops; + + vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND; + + vma->vm_private_data = map; + map->vma = vma; + + map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + printk(KERN_WARNING "find_grant_ptes() failure.\n"); + goto unlock_out; + } + + err = map_grant_pages(map); + if (err) { + printk(KERN_WARNING "map_grant_pages() failure.\n"); + goto unlock_out; + } + map->is_mapped = 1; + +unlock_out: + spin_unlock(&priv->lock); + return err; +} + +static const struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .unlocked_ioctl = gntdev_ioctl +}; + +static struct miscdevice gntdev_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntdev", + .fops = &gntdev_fops, +}; + +/* ------------------------------------------------------------------ */ + +static int __init gntdev_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntdev_miscdev); + if (err != 0) { + printk(KERN_ERR "Could not register gntdev device\n"); + return err; + } + return 0; +} + +static void __exit gntdev_exit(void) +{ + misc_deregister(&gntdev_miscdev); +} + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* ------------------------------------------------------------------ */ diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h new file mode 100644 index 000000000000..eb23f4188f5a --- /dev/null +++ b/include/xen/gntdev.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ -- cgit From 8d3eaea24609c7cd6fb0e6471f46a52f9e5d0202 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Nov 2010 14:39:12 -0800 Subject: xen/gntdev: add VM_PFNMAP to vma These pages are from other domains, so don't have any local PFN. VM_PFNMAP is the closest concept Linux has to this. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index cfe063372aa4..fa6355a97081 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -564,7 +564,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND; + vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; vma->vm_private_data = map; map->vma = vma; -- cgit From b5eafe924bb054d7c56e6ebd18106352e8a3f916 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 6 Dec 2010 16:29:22 -0800 Subject: xen: move p2m handling to separate file Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Makefile | 3 +- arch/x86/xen/mmu.c | 365 ------------------------------------------------ arch/x86/xen/p2m.c | 376 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 378 insertions(+), 366 deletions(-) create mode 100644 arch/x86/xen/p2m.c diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 779385158915..17c565de3d64 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o platform-pci-unplug.o + grant-table.o suspend.o platform-pci-unplug.o \ + p2m.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 44924e551fde..7575e55cd52e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -/* - * Xen leaves the responsibility for maintaining p2m mappings to the - * guests themselves, but it must also access and update the p2m array - * during suspend/resume when all the pages are reallocated. - * - * The p2m table is logically a flat array, but we implement it as a - * three-level tree to allow the address space to be sparse. - * - * Xen - * | - * p2m_top p2m_top_mfn - * / \ / \ - * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn - * / \ / \ / / - * p2m p2m p2m p2m p2m p2m p2m ... - * - * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. - * - * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the - * maximum representable pseudo-physical address space is: - * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages - * - * P2M_PER_PAGE depends on the architecture, as a mfn is always - * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. - */ - -unsigned long xen_max_p2m_pfn __read_mostly; - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -/* Placeholders for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); - -static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); - -RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); - -static inline unsigned p2m_top_index(unsigned long pfn) -{ - BUG_ON(pfn >= MAX_P2M_PFN); - return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); -} - -static inline unsigned p2m_mid_index(unsigned long pfn) -{ - return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; -} - -static inline unsigned p2m_index(unsigned long pfn) -{ - return pfn % P2M_PER_PAGE; -} - -static void p2m_top_init(unsigned long ***top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing; -} - -static void p2m_top_mfn_init(unsigned long *top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = virt_to_mfn(p2m_mid_missing_mfn); -} - -static void p2m_top_mfn_p_init(unsigned long **top) -{ - unsigned i; - - for (i = 0; i < P2M_TOP_PER_PAGE; i++) - top[i] = p2m_mid_missing_mfn; -} - -static void p2m_mid_init(unsigned long **mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = p2m_missing; -} - -static void p2m_mid_mfn_init(unsigned long *mid) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - mid[i] = virt_to_mfn(p2m_missing); -} - -static void p2m_init(unsigned long *p2m) -{ - unsigned i; - - for (i = 0; i < P2M_MID_PER_PAGE; i++) - p2m[i] = INVALID_P2M_ENTRY; -} - -/* - * Build the parallel p2m_top_mfn and p2m_mid_mfn structures - * - * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() - * to allocate memory. - * - * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. - */ -void xen_build_mfn_list_list(void) -{ - unsigned long pfn; - - /* Pre-initialize p2m_top_mfn to be completely missing */ - if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_missing_mfn); - - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_p_init(p2m_top_mfn_p); - - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_mfn_init(p2m_top_mfn); - } else { - /* Reinitialise, mfn's all change after migration */ - p2m_mid_mfn_init(p2m_mid_missing_mfn); - } - - for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - unsigned long **mid; - unsigned long *mid_mfn_p; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - - /* Don't bother allocating any mfn mid levels if - * they're just missing, just update the stored mfn, - * since all could have changed over a migrate. - */ - if (mid == p2m_mid_missing) { - BUG_ON(mididx); - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); - pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; - continue; - } - - if (mid_mfn_p == p2m_mid_missing_mfn) { - /* - * XXX boot-time only! We should never find - * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. - */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - } - - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); - } -} - -void xen_setup_mfn_list_list(void) -{ - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; -} - -/* Set up p2m_top to point to the domain-builder provided p2m pages */ -void __init xen_build_dynamic_phys_to_machine(void) -{ - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned long pfn; - - xen_max_p2m_pfn = max_pfn; - - p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m_missing); - - p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(p2m_mid_missing); - - p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_top_init(p2m_top); - - /* - * The domain builder gives us a pre-constructed p2m array in - * mfn_list for all the pages initially given to us, so we just - * need to graft that into our tree structure. - */ - for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned mididx = p2m_mid_index(pfn); - - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - } - - p2m_top[topidx][mididx] = &mfn_list[pfn]; - } -} - -unsigned long get_phys_to_machine(unsigned long pfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) - return INVALID_P2M_ENTRY; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - return p2m_top[topidx][mididx][idx]; -} -EXPORT_SYMBOL_GPL(get_phys_to_machine); - -static void *alloc_p2m_page(void) -{ - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); -} - -static void free_p2m_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * Fully allocate the p2m structure for a given pfn. We need to check - * that both the top and mid levels are allocated, and make sure the - * parallel mfn tree is kept in sync. We may race with other cpus, so - * the new pages are installed with cmpxchg; if we lose the race then - * simply free the page we allocated and use the one that's there. - */ -static bool alloc_p2m(unsigned long pfn) -{ - unsigned topidx, mididx; - unsigned long ***top_p, **mid; - unsigned long *top_mfn_p, *mid_mfn; - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - - top_p = &p2m_top[topidx]; - mid = *top_p; - - if (mid == p2m_mid_missing) { - /* Mid level is missing, allocate a new one */ - mid = alloc_p2m_page(); - if (!mid) - return false; - - p2m_mid_init(mid); - - if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) - free_p2m_page(mid); - } - - top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; - - BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); - - if (mid_mfn == p2m_mid_missing_mfn) { - /* Separately check the mid mfn level */ - unsigned long missing_mfn; - unsigned long mid_mfn_mfn; - - mid_mfn = alloc_p2m_page(); - if (!mid_mfn) - return false; - - p2m_mid_mfn_init(mid_mfn); - - missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); - mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) - free_p2m_page(mid_mfn); - else - p2m_top_mfn_p[topidx] = mid_mfn; - } - - if (p2m_top[topidx][mididx] == p2m_missing) { - /* p2m leaf page is missing */ - unsigned long *p2m; - - p2m = alloc_p2m_page(); - if (!p2m) - return false; - - p2m_init(p2m); - - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) - free_p2m_page(p2m); - else - mid_mfn[mididx] = virt_to_mfn(p2m); - } - - return true; -} - -/* Try to install p2m mapping; fail if intermediate bits missing */ -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - unsigned topidx, mididx, idx; - - if (unlikely(pfn >= MAX_P2M_PFN)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return true; - } - - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - idx = p2m_index(pfn); - - if (p2m_top[topidx][mididx] == p2m_missing) - return mfn == INVALID_P2M_ENTRY; - - p2m_top[topidx][mididx][idx] = mfn; - - return true; -} - -bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) - return false; - - if (!__set_phys_to_machine(pfn, mfn)) - return false; - } - - return true; -} - unsigned long arbitrary_virt_to_mfn(void *vaddr) { xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c new file mode 100644 index 000000000000..259ec3bb8b6f --- /dev/null +++ b/arch/x86/xen/p2m.c @@ -0,0 +1,376 @@ +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +unsigned long xen_max_p2m_pfn __read_mostly; + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); + +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ +void xen_build_mfn_list_list(void) +{ + unsigned long pfn; + + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); + + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); + } + + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); + } +} + +void xen_setup_mfn_list_list(void) +{ + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long pfn; + + xen_max_p2m_pfn = max_pfn; + + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. + */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } +} + +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) + return INVALID_P2M_ENTRY; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + return p2m_top[topidx][mididx][idx]; +} +EXPORT_SYMBOL_GPL(get_phys_to_machine); + +static void *alloc_p2m_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} + +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); + } + + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, mididx, idx; + + if (unlikely(pfn >= MAX_P2M_PFN)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; + + return true; +} + +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } + + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!alloc_p2m(pfn)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} -- cgit From 448f2831934381e9d3c4d93e700ba7bbe14612dc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 15 Dec 2010 13:19:33 +0000 Subject: xen: add m2p override mechanism Add a simple hashtable based mechanism to override some portions of the m2p, so that we can find out the pfn corresponding to an mfn of a granted page. In fact entries corresponding to granted pages in the m2p hold the original pfn value of the page in the source domain that granted it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 16 +++++++-- arch/x86/xen/p2m.c | 80 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8760cc60a21c..50f0a0f6bd6a 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern void m2p_add_override(unsigned long mfn, struct page *page); +extern void m2p_remove_override(struct page *page); +extern struct page *m2p_find_override(unsigned long mfn); +extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); + static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely((mfn >> machine_to_phys_order) != 0)) - return ~0; - pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). @@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) */ __get_user(pfn, &machine_to_phys_mapping[mfn]); + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + */ + if (get_phys_to_machine(pfn) != mfn) + pfn = m2p_find_override_pfn(mfn, pfn); + return pfn; } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 259ec3bb8b6f..8db19d50c467 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -27,6 +27,8 @@ #include #include +#include +#include #include #include @@ -37,6 +39,8 @@ #include "xen-ops.h" +static void __init m2p_override_init(void); + unsigned long xen_max_p2m_pfn __read_mostly; #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) @@ -234,6 +238,8 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx][mididx] = &mfn_list[pfn]; } + + m2p_override_init(); } unsigned long get_phys_to_machine(unsigned long pfn) @@ -374,3 +380,77 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + +#define M2P_OVERRIDE_HASH_SHIFT 10 +#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) + +static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); +static DEFINE_SPINLOCK(m2p_override_lock); + +static void __init m2p_override_init(void) +{ + unsigned i; + + m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, + sizeof(unsigned long)); + + for (i = 0; i < M2P_OVERRIDE_HASH; i++) + INIT_LIST_HEAD(&m2p_overrides[i]); +} + +static unsigned long mfn_hash(unsigned long mfn) +{ + return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); +} + +/* Add an MFN override for a particular page */ +void m2p_add_override(unsigned long mfn, struct page *page) +{ + unsigned long flags; + page->private = mfn; + + spin_lock_irqsave(&m2p_override_lock, flags); + list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); + spin_unlock_irqrestore(&m2p_override_lock, flags); +} + +void m2p_remove_override(struct page *page) +{ + unsigned long flags; + spin_lock_irqsave(&m2p_override_lock, flags); + list_del(&page->lru); + spin_unlock_irqrestore(&m2p_override_lock, flags); +} + +struct page *m2p_find_override(unsigned long mfn) +{ + unsigned long flags; + struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; + struct page *p, *ret; + + ret = NULL; + + spin_lock_irqsave(&m2p_override_lock, flags); + + list_for_each_entry(p, bucket, lru) { + if (p->private == mfn) { + ret = p; + break; + } + } + + spin_unlock_irqrestore(&m2p_override_lock, flags); + + return ret; +} + +unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) +{ + struct page *p = m2p_find_override(mfn); + unsigned long ret = pfn; + + if (p) + ret = page_to_pfn(p); + + return ret; +} -- cgit From 9329e7604fe915fd0201633d3c38adae307d56a5 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 8 Dec 2010 11:57:40 +0000 Subject: xen: gntdev: move use of GNTMAP_contains_pte next to the map_op This flag controls the meaning of gnttab_map_grant_ref.host_addr and specifies that the field contains a reference to the pte entry to be used to perform the mapping. Therefore move the use of this flag to the point at which we actually use a reference to the pte instead of something else, splitting up the usage of the flag in this way is confusing and potentially error prone. The other flags are all properties of the mapping itself as opposed to properties of the hypercall arguments and therefore it make sense to continue to pass them round in map->flags. Signed-off-by: Ian Campbell Cc: Stefano Stabellini Cc: Derek G. Murray Cc: Gerd Hoffmann Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index fa6355a97081..888d76307d59 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -211,10 +211,12 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, BUG_ON(pgnr >= map->count); pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT; pte_maddr += (unsigned long)pte & ~PAGE_MASK; - gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags, + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, map->grants[pgnr].ref, map->grants[pgnr].domid); - gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags, + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, + GNTMAP_contains_pte | map->flags, 0 /* handle */); return 0; } @@ -569,7 +571,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_private_data = map; map->vma = vma; - map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte; + map->flags = GNTMAP_host_map | GNTMAP_application_map; if (!(vma->vm_flags & VM_WRITE)) map->flags |= GNTMAP_readonly; -- cgit From ba5d1012292403c8037adf4a54c4ec50dfe846c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 8 Dec 2010 10:54:32 -0800 Subject: xen/gntdev: stop using "token" argument It's the struct page of the L1 pte page. But we can get its mfn by simply doing an arbitrary_virt_to_machine() on it anyway (which is the safe conservative choice; since we no longer allow HIGHPTE pages, we would never expect to be operating on a mapped pte page). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 888d76307d59..a2ea5335e152 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -209,8 +209,8 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, u64 pte_maddr; BUG_ON(pgnr >= map->count); - pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT; - pte_maddr += (unsigned long)pte & ~PAGE_MASK; + pte_maddr = arbitrary_virt_to_machine(pte).maddr; + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, GNTMAP_contains_pte | map->flags, map->grants[pgnr].ref, -- cgit From f0a70c882ea546bbd802643990ceded32c39facc Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Fri, 7 Jan 2011 11:51:47 +0000 Subject: xen/gntdev: Fix circular locking dependency apply_to_page_range will acquire PTE lock while priv->lock is held, and mn_invl_range_start tries to acquire priv->lock with PTE already held. Fix by not holding priv->lock during the entire map operation. This is safe because map->vma is set nonzero while the lock is held, which will cause subsequent maps to fail and will cause the unmap ioctl (and other users of gntdev_del_map) to return -EBUSY until the area is unmapped. It is similarly impossible for gntdev_vma_close to be called while the vma is still being created. Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index a2ea5335e152..aba76d437ea8 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -575,21 +575,26 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE)) map->flags |= GNTMAP_readonly; + spin_unlock(&priv->lock); + err = apply_to_page_range(vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, find_grant_ptes, map); if (err) { printk(KERN_WARNING "find_grant_ptes() failure.\n"); - goto unlock_out; + return err; } err = map_grant_pages(map); if (err) { printk(KERN_WARNING "map_grant_pages() failure.\n"); - goto unlock_out; + return err; } + map->is_mapped = 1; + return 0; + unlock_out: spin_unlock(&priv->lock); return err; -- cgit From 9b705f0e98c489b18ba22a6eab9d694b546c8552 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 10 Dec 2010 14:52:45 +0000 Subject: xen p2m: transparently change the p2m mappings in the m2p override In m2p_add_override store the original mfn into page->index and then change the p2m mapping, setting mfns as FOREIGN_FRAME. In m2p_remove_override restore the original mapping. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8db19d50c467..b3b19d43b951 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -407,8 +407,11 @@ static unsigned long mfn_hash(unsigned long mfn) void m2p_add_override(unsigned long mfn, struct page *page) { unsigned long flags; + unsigned long pfn = page_to_pfn(page); page->private = mfn; + page->index = pfn_to_mfn(pfn); + __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); @@ -417,9 +420,18 @@ void m2p_add_override(unsigned long mfn, struct page *page) void m2p_remove_override(struct page *page) { unsigned long flags; + unsigned long mfn; + unsigned long pfn; + + pfn = page_to_pfn(page); + mfn = get_phys_to_machine(pfn); + if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) + return; + spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); + __set_phys_to_machine(pfn, page->index); } struct page *m2p_find_override(unsigned long mfn) -- cgit From 289b777eac19c811b474593b4d2fd14e46340c23 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 10 Dec 2010 14:54:44 +0000 Subject: xen: introduce gnttab_map_refs and gnttab_unmap_refs gnttab_map_refs maps some grant refs and uses the new m2p override to set a proper m2p mapping for the granted pages. gnttab_unmap_refs unmaps the granted refs and removes th mappings from the m2p override. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/grant-table.c | 36 ++++++++++++++++++++++++++++++++++++ include/xen/grant_table.h | 5 +++++ 2 files changed, 41 insertions(+) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 6c4531816496..1afd5690858c 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -447,6 +447,42 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + pte_t *pte; + unsigned long mfn; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); + + for (i = 0; i < count; i++) { + /* m2p override only supported for GNTMAP_contains_pte mappings */ + if (!(map_ops[i].flags & GNTMAP_contains_pte)) + continue; + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + (map_ops[i].host_addr & ~PAGE_MASK)); + mfn = pte_mfn(*pte); + m2p_add_override(mfn, pages[i]); + } + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_map_refs); + +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); + for (i = 0; i < count; i++) + m2p_remove_override(pages[i]); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 1821aa103d82..b1fab6b5b3ef 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -155,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct page **pages, unsigned int count); +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct page **pages, unsigned int count); + #endif /* __ASM_GNTTAB_H__ */ -- cgit From a12b4eb34bb1ea16046c5b61e7a887e252cc1cce Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 10 Dec 2010 14:56:42 +0000 Subject: xen gntdev: use gnttab_map_refs and gnttab_unmap_refs Use gnttab_map_refs and gnttab_unmap_refs to map and unmap the grant ref, so that we can have a corresponding struct page. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index aba76d437ea8..1e31cdcdae1e 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -71,6 +71,7 @@ struct grant_map { struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; + struct page **pages; }; /* ------------------------------------------------------------------ */ @@ -94,6 +95,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv, static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) { struct grant_map *add; + int i; add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); if (NULL == add) @@ -102,11 +104,19 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); - if (NULL == add->grants || - NULL == add->map_ops || - NULL == add->unmap_ops) + add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops || + NULL == add->pages) goto err; + for (i = 0; i < count; i++) { + add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (add->pages[i] == NULL) + goto err; + } + add->index = 0; add->count = count; add->priv = priv; @@ -117,6 +127,12 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) return add; err: + if (add->pages) + for (i = 0; i < count; i++) { + if (add->pages[i]) + __free_page(add->pages[i]); + } + kfree(add->pages); kfree(add->grants); kfree(add->map_ops); kfree(add->unmap_ops); @@ -191,8 +207,17 @@ static int gntdev_del_map(struct grant_map *map) static void gntdev_free_map(struct grant_map *map) { + int i; + if (!map) return; + + if (map->pages) + for (i = 0; i < map->count; i++) { + if (map->pages[i]) + __free_page(map->pages[i]); + } + kfree(map->pages); kfree(map->grants); kfree(map->map_ops); kfree(map->unmap_ops); @@ -226,8 +251,7 @@ static int map_grant_pages(struct grant_map *map) int i, err = 0; pr_debug("map %d+%d\n", map->index, map->count); - err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, - map->map_ops, map->count); + err = gnttab_map_refs(map->map_ops, map->pages, map->count); if (err) return err; @@ -244,8 +268,7 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages) int i, err = 0; pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); - err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - map->unmap_ops + offset, pages); + err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages); if (err) return err; -- cgit From 87f1d40a706bdebdc8f959b9ac291d0d8fdfcc7e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 13 Dec 2010 14:42:30 +0000 Subject: xen p2m: clear the old pte when adding a page to m2p_override When adding a page to m2p_override we change the p2m of the page so we need to also clear the old pte of the kernel linear mapping because it doesn't correspond anymore. When we remove the page from m2p_override we restore the original p2m of the page and we also restore the old pte of the kernel linear mapping. Before changing the p2m mappings in m2p_add_override and m2p_remove_override, check that the page passed as argument is valid and return an error if it is not. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 4 ++-- arch/x86/xen/p2m.c | 49 +++++++++++++++++++++++++++++++++++++---- drivers/xen/grant-table.c | 16 +++++++++++--- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 50f0a0f6bd6a..f25bdf238a33 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -42,8 +42,8 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern void m2p_add_override(unsigned long mfn, struct page *page); -extern void m2p_remove_override(struct page *page); +extern int m2p_add_override(unsigned long mfn, struct page *page); +extern int m2p_remove_override(struct page *page); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b3b19d43b951..40d51225ff08 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -404,34 +405,74 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -void m2p_add_override(unsigned long mfn, struct page *page) +int m2p_add_override(unsigned long mfn, struct page *page) { unsigned long flags; - unsigned long pfn = page_to_pfn(page); + unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; + + pfn = page_to_pfn(page); + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_add_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } + page->private = mfn; page->index = pfn_to_mfn(pfn); __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + if (!PageHighMem(page)) + /* Just zap old mapping for now */ + pte_clear(&init_mm, address, ptep); + spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); + + return 0; } -void m2p_remove_override(struct page *page) +int m2p_remove_override(struct page *page) { unsigned long flags; unsigned long mfn; unsigned long pfn; + unsigned long address; + unsigned level; + pte_t *ptep = NULL; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) - return; + return -EINVAL; + + if (!PageHighMem(page)) { + address = (unsigned long)__va(pfn << PAGE_SHIFT); + ptep = lookup_address(address, &level); + + if (WARN(ptep == NULL || level != PG_LEVEL_4K, + "m2p_remove_override: pfn %lx not mapped", pfn)) + return -EINVAL; + } spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); __set_phys_to_machine(pfn, page->index); + + if (!PageHighMem(page)) + set_pte_at(&init_mm, address, ptep, + pfn_pte(pfn, PAGE_KERNEL)); + /* No tlb flush necessary because the caller already + * left the pte unmapped. */ + + return 0; } struct page *m2p_find_override(unsigned long mfn) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1afd5690858c..9ef54ebc1194 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -455,6 +455,8 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); + if (ret) + return ret; for (i = 0; i < count; i++) { /* m2p override only supported for GNTMAP_contains_pte mappings */ @@ -463,7 +465,9 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + (map_ops[i].host_addr & ~PAGE_MASK)); mfn = pte_mfn(*pte); - m2p_add_override(mfn, pages[i]); + ret = m2p_add_override(mfn, pages[i]); + if (ret) + return ret; } return ret; @@ -476,8 +480,14 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, int i, ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); - for (i = 0; i < count; i++) - m2p_remove_override(pages[i]); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + ret = m2p_remove_override(pages[i]); + if (ret) + return ret; + } return ret; } -- cgit From e1b478e4ec4477520767d1a920433626263a2a6b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 11 Jan 2011 15:09:16 -0500 Subject: xen/p2m: Fix module linking error. Fixes: ERROR: "m2p_find_override_pfn" [drivers/block/xen-blkfront.ko] undefined! Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 40d51225ff08..8f2251d2a3f8 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -507,3 +507,4 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) return ret; } +EXPORT_SYMBOL_GPL(m2p_find_override_pfn); -- cgit From bf572541ab44240163eaa2d486b06f306a31d45a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 12 Jan 2011 09:03:35 +1100 Subject: md: fix regression with re-adding devices to arrays with no metadata Commit 1a855a0606 (2.6.37-rc4) fixed a problem where devices were re-added when they shouldn't be but caused a regression in a less common case that means sometimes devices cannot be re-added when they should be. In particular, when re-adding a device to an array without metadata we should always access the device, but after the above commit we didn't. This patch sets the In_sync flag in that case so that the re-add succeeds. This patch is suitable for any -stable kernel to which 1a855a0606 was applied. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 175c424f201f..0da25daea9be 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5159,9 +5159,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + } else rdev->raid_disk = -1; } else super_types[mddev->major_version]. -- cgit From 00f28e4037c8d5782fa7a1b2666b0dca21522d69 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 11 Jan 2011 11:50:28 +0000 Subject: xen-platform: use PCI interfaces to request IO and MEM resources. This is the correct interface to use and something has broken the use of the previous incorrect interface (which fails because the request conflicts with the resources assigned for the PCI device itself instead of nesting like the PCI interfaces do). Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk Cc: stable@kernel.org # 2.6.37 only --- drivers/xen/platform-pci.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index c01b5ddce529..afbe041f42c5 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -105,7 +105,7 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { int i, ret; - long ioaddr, iolen; + long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; @@ -114,7 +114,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, return i; ioaddr = pci_resource_start(pdev, 0); - iolen = pci_resource_len(pdev, 0); mmio_addr = pci_resource_start(pdev, 1); mmio_len = pci_resource_len(pdev, 1); @@ -125,19 +124,13 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, goto pci_out; } - if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { - dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", - mmio_addr, mmio_len); - ret = -EBUSY; + ret = pci_request_region(pdev, 1, DRV_NAME); + if (ret < 0) goto pci_out; - } - if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { - dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", - iolen, ioaddr); - ret = -EBUSY; + ret = pci_request_region(pdev, 0, DRV_NAME); + if (ret < 0) goto mem_out; - } platform_mmio = mmio_addr; platform_mmiolen = mmio_len; @@ -169,9 +162,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, return 0; out: - release_region(ioaddr, iolen); + pci_release_region(pdev, 0); mem_out: - release_mem_region(mmio_addr, mmio_len); + pci_release_region(pdev, 1); pci_out: pci_disable_device(pdev); return ret; -- cgit From 7c8c06c5bb2dcc9fd0d2eeac4e9c10bf77ffed1a Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 11 Jan 2011 11:51:14 +0000 Subject: xen: rename platform-pci module to xen-platform-pci. platform-pci is rather generic for a modular distro style kernel. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 533a199e7a3f..f81819b0f916 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -11,9 +11,10 @@ obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o +xen-platform-pci-y := platform-pci.o -- cgit From e528db5b392ab2a624258a667fed0e65af6e411b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 12 Jan 2011 12:58:06 -0500 Subject: xen-platform: Fix compile errors if CONFIG_PCI is not enabled. drivers/xen/platform-pci.c:127: error: implicit declaration of function 'pci_request_region' drivers/xen/platform-pci.c:165: error: implicit declaration of function 'pci_release_region' Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 6e6180ccd726..6f52b3189803 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -64,7 +64,7 @@ config XEN_SYS_HYPERVISOR config XEN_PLATFORM_PCI tristate "xen platform pci device driver" - depends on XEN_PVHVM + depends on XEN_PVHVM && PCI default m help Driver for the Xen PCI Platform device: it is responsible for -- cgit From 36b471e047a77eaf3ec8e693bd9d8291c4dfd864 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Fri, 26 Nov 2010 20:06:12 +0300 Subject: avr32: boards: setup: use IS_ERR() instead of NULL check clk_get() returns ERR_PTR() on error, not NULL. Signed-off-by: Vasiliy Kulikov Acked-by: Hans-Christian Egtvedt --- arch/avr32/boards/atngw100/setup.c | 2 +- arch/avr32/boards/atstk1000/atstk1002.c | 2 +- arch/avr32/boards/favr-32/setup.c | 2 +- arch/avr32/boards/hammerhead/setup.c | 2 +- arch/avr32/boards/merisc/setup.c | 2 +- arch/avr32/boards/mimc200/setup.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c index 8c6a2440e345..659d119ce712 100644 --- a/arch/avr32/boards/atngw100/setup.c +++ b/arch/avr32/boards/atngw100/setup.c @@ -188,7 +188,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c index 2adc261c9e3d..6ce30fb2ec94 100644 --- a/arch/avr32/boards/atstk1000/atstk1002.c +++ b/arch/avr32/boards/atstk1000/atstk1002.c @@ -203,7 +203,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c index 75f19f47fb2f..86fab77a5a00 100644 --- a/arch/avr32/boards/favr-32/setup.c +++ b/arch/avr32/boards/favr-32/setup.c @@ -206,7 +206,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c index dd009875a405..da14fbdd4e8e 100644 --- a/arch/avr32/boards/hammerhead/setup.c +++ b/arch/avr32/boards/hammerhead/setup.c @@ -150,7 +150,7 @@ static void __init set_hw_addr(struct platform_device *pdev) regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c index 623b077594fc..e61bc948f959 100644 --- a/arch/avr32/boards/merisc/setup.c +++ b/arch/avr32/boards/merisc/setup.c @@ -134,7 +134,7 @@ static void __init set_hw_addr(struct platform_device *pdev) regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c index 523d8e183bef..c4da5cba2dbf 100644 --- a/arch/avr32/boards/mimc200/setup.c +++ b/arch/avr32/boards/mimc200/setup.c @@ -162,7 +162,7 @@ static void __init set_hw_addr(struct platform_device *pdev) */ regs = (void __iomem __force *)res->start; pclk = clk_get(&pdev->dev, "pclk"); - if (!pclk) + if (IS_ERR(pclk)) return; clk_enable(pclk); -- cgit From 12be8e71bf17d403b8f7866a3581ef34b77ea16f Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Thu, 9 Dec 2010 08:54:55 +0100 Subject: avr32: disable kprobes for all default configurations This patch will disable kprobes for all the default AVR32 board configurations. This works around a regression in kprobes which seems to be related to AVR32 is now lacking the struct kprobe_ctlblk. Signed-off-by: Hans-Christian Egtvedt --- arch/avr32/configs/atngw100_defconfig | 2 +- arch/avr32/configs/atngw100_evklcd100_defconfig | 2 +- arch/avr32/configs/atngw100_evklcd101_defconfig | 2 +- arch/avr32/configs/atngw100mkii_defconfig | 2 +- arch/avr32/configs/atngw100mkii_evklcd100_defconfig | 2 +- arch/avr32/configs/atngw100mkii_evklcd101_defconfig | 2 +- arch/avr32/configs/atstk1002_defconfig | 2 +- arch/avr32/configs/atstk1003_defconfig | 2 +- arch/avr32/configs/atstk1006_defconfig | 2 +- arch/avr32/configs/favr-32_defconfig | 2 +- arch/avr32/configs/hammerhead_defconfig | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig index 9854013d2728..fe9525e359ad 100644 --- a/arch/avr32/configs/atngw100_defconfig +++ b/arch/avr32/configs/atngw100_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig index 7ceda354597b..8dae90cbe8a3 100644 --- a/arch/avr32/configs/atngw100_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100_evklcd100_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig index 7bc5b2ce68d5..985f4b937923 100644 --- a/arch/avr32/configs/atngw100_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100_evklcd101_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig index 4bd36821d4a2..e5d029595bf2 100644 --- a/arch/avr32/configs/atngw100mkii_defconfig +++ b/arch/avr32/configs/atngw100mkii_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig index f8437ef3237f..93fee4ac3748 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig index 7f58f996d945..bcc90ea1b4dc 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig index aec4c43a75da..a7536b8afb46 100644 --- a/arch/avr32/configs/atstk1002_defconfig +++ b/arch/avr32/configs/atstk1002_defconfig @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig index 50ba3db682ca..e19662af22f0 100644 --- a/arch/avr32/configs/atstk1003_defconfig +++ b/arch/avr32/configs/atstk1003_defconfig @@ -17,7 +17,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig index dbcc1b51e506..ef76b4fb8658 100644 --- a/arch/avr32/configs/atstk1006_defconfig +++ b/arch/avr32/configs/atstk1006_defconfig @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig index 0c813b661a0a..aeadc955db32 100644 --- a/arch/avr32/configs/favr-32_defconfig +++ b/arch/avr32/configs/favr-32_defconfig @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig index dcc01f0eb294..1692beeb7ed3 100644 --- a/arch/avr32/configs/hammerhead_defconfig +++ b/arch/avr32/configs/hammerhead_defconfig @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m -CONFIG_KPROBES=y +# CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -- cgit From 664cb7142ced8b827e92e1851d1ed2cae922f225 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Thu, 9 Dec 2010 00:19:33 +0100 Subject: avr32: use syscall prototypes from asm-generic instead of arch This patch removes the redundant syscalls prototypes in the architecture specific syscalls.h header file. These were identical with the ones in asm-generic/syscalls.h. Signed-off-by: Hans-Christian Egtvedt Reported-by: Peter Huewe Reported-by: Sven Schnelle Cc: stable --- arch/avr32/include/asm/syscalls.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h index ab608b70b24d..730a461c606f 100644 --- a/arch/avr32/include/asm/syscalls.h +++ b/arch/avr32/include/asm/syscalls.h @@ -16,18 +16,9 @@ #include /* kernel/process.c */ -asmlinkage int sys_fork(struct pt_regs *); asmlinkage int sys_clone(unsigned long, unsigned long, unsigned long, unsigned long, struct pt_regs *); -asmlinkage int sys_vfork(struct pt_regs *); -asmlinkage int sys_execve(const char __user *, char __user *__user *, - char __user *__user *, struct pt_regs *); - -/* kernel/signal.c */ -asmlinkage int sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); -asmlinkage int sys_rt_sigreturn(struct pt_regs *); /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); -- cgit From 992a88b62ca327f10d82dbad71a8c061d4229888 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Thu, 9 Dec 2010 09:02:06 +0100 Subject: avr32: make architecture sys_clone prototype match asm-generic prototype This patch will fix the arguments to the architecture sys_clone() function to match the asm-generic/syscalls.h prototype. In the same go remove the architecture specific prototype for the same function. The sys_clone() function is only called from assembly, hence the argument types were not having any affect. Signed-off-by: Hans-Christian Egtvedt --- arch/avr32/include/asm/syscalls.h | 5 ----- arch/avr32/kernel/process.c | 9 ++++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h index 730a461c606f..244f2acab546 100644 --- a/arch/avr32/include/asm/syscalls.h +++ b/arch/avr32/include/asm/syscalls.h @@ -15,11 +15,6 @@ #include #include -/* kernel/process.c */ -asmlinkage int sys_clone(unsigned long, unsigned long, - unsigned long, unsigned long, - struct pt_regs *); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 9c46aaad11ce..ef5a2a08fcca 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -367,14 +367,13 @@ asmlinkage int sys_fork(struct pt_regs *regs) } asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - unsigned long parent_tidptr, - unsigned long child_tidptr, struct pt_regs *regs) + void __user *parent_tidptr, void __user *child_tidptr, + struct pt_regs *regs) { if (!newsp) newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, - (int __user *)parent_tidptr, - (int __user *)child_tidptr); + return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, + child_tidptr); } asmlinkage int sys_vfork(struct pt_regs *regs) -- cgit From 1e2de47cddc70ae973cf468e2f2954b4ae80f4e3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 1 Nov 2010 13:12:27 -0700 Subject: avr32: Convert to clocksource_register_hz This converts the avr32 clocksource to use clocksource_register_hz. This is untested, so any assistance in testing would be appreciated! CC: Hans-Christian Egtvedt CC: Thomas Gleixner Signed-off-by: John Stultz --- arch/avr32/kernel/time.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 668ed2817e51..05ad29112ff4 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -35,7 +35,6 @@ static struct clocksource counter = { .rating = 50, .read = read_cycle_count, .mask = CLOCKSOURCE_MASK(32), - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -123,9 +122,7 @@ void __init time_init(void) /* figure rate for counter */ counter_hz = clk_get_rate(boot_cpu_data.clk); - counter.mult = clocksource_hz2mult(counter_hz, counter.shift); - - ret = clocksource_register(&counter); + ret = clocksource_register_hz(&counter, counter_hz); if (ret) pr_debug("timer: could not register clocksource: %d\n", ret); -- cgit From c975ffadd144e8e99e83e8f80ee94efce4f170b5 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Thu, 13 Jan 2011 12:18:32 +0100 Subject: avr32: update default configuration files for Atmel boards This patch adjusts some values to make the default configuration for Atmel boards more similar, and adds missing values to enable required functions. Also remove defined symbols for functions not in use. Signed-off-by: Hans-Christian Egtvedt --- arch/avr32/configs/atngw100_defconfig | 21 ++-- arch/avr32/configs/atngw100_evklcd100_defconfig | 15 +-- arch/avr32/configs/atngw100_evklcd101_defconfig | 15 +-- arch/avr32/configs/atngw100mkii_defconfig | 20 ++-- .../avr32/configs/atngw100mkii_evklcd100_defconfig | 15 +-- .../avr32/configs/atngw100mkii_evklcd101_defconfig | 15 +-- arch/avr32/configs/atstk1002_defconfig | 23 ++--- arch/avr32/configs/atstk1003_defconfig | 39 ++++---- arch/avr32/configs/atstk1004_defconfig | 109 ++++++++++++++++++--- arch/avr32/configs/atstk1006_defconfig | 21 ++-- 10 files changed, 184 insertions(+), 109 deletions(-) diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig index fe9525e359ad..6f9ca56de1f6 100644 --- a/arch/avr32/configs/atngw100_defconfig +++ b/arch/avr32/configs/atngw100_defconfig @@ -2,10 +2,8 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set @@ -15,7 +13,6 @@ CONFIG_OPROFILE=m # CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -72,8 +70,8 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y -CONFIG_EEPROM_AT24=m CONFIG_NETDEVICES=y CONFIG_TUN=m CONFIG_NET_ETHERNET=y @@ -106,6 +104,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_USB_GADGET=y +CONFIG_USB_GADGET_VBUS_DRAW=350 CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -115,14 +114,12 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -130,21 +127,23 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_UFS_FS=y +CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -155,5 +154,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_PCBC=m diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig index 8dae90cbe8a3..7eece0af34c9 100644 --- a/arch/avr32/configs/atngw100_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100_evklcd100_defconfig @@ -2,10 +2,8 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set @@ -15,7 +13,6 @@ CONFIG_OPROFILE=m # CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -74,8 +72,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -104,6 +104,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -127,6 +128,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -141,11 +143,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -155,7 +160,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -166,4 +170,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig index 985f4b937923..387eb9d6e423 100644 --- a/arch/avr32/configs/atngw100_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100_evklcd101_defconfig @@ -2,10 +2,8 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set @@ -15,7 +13,6 @@ CONFIG_OPROFILE=m # CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -30,6 +27,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -73,8 +71,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -103,6 +103,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -126,6 +127,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -140,11 +142,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -154,7 +159,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -165,4 +169,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig index e5d029595bf2..f0fe237133a9 100644 --- a/arch/avr32/configs/atngw100mkii_defconfig +++ b/arch/avr32/configs/atngw100mkii_defconfig @@ -2,10 +2,8 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set @@ -15,7 +13,6 @@ CONFIG_OPROFILE=m # CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -29,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -74,6 +72,7 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y CONFIG_TUN=m @@ -107,6 +106,7 @@ CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_USB_GADGET=y +CONFIG_USB_GADGET_VBUS_DRAW=350 CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -116,14 +116,12 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -131,21 +129,23 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -CONFIG_UFS_FS=y +CONFIG_UBIFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -156,5 +156,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_PCBC=m diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig index 93fee4ac3748..e4a7c1dc8380 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig @@ -2,10 +2,8 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set @@ -15,7 +13,6 @@ CONFIG_OPROFILE=m # CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -32,6 +29,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -77,8 +75,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -107,6 +107,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -130,6 +131,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -144,11 +146,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -158,7 +163,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -169,4 +173,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig index bcc90ea1b4dc..6f37f70c2c37 100644 --- a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig +++ b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig @@ -2,10 +2,8 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set @@ -15,7 +13,6 @@ CONFIG_OPROFILE=m # CONFIG_KPROBES is not set CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_IOSCHED_DEADLINE is not set CONFIG_NO_HZ=y @@ -31,6 +28,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -76,8 +74,10 @@ CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_TCLIB=y CONFIG_NETDEVICES=y +CONFIG_TUN=m CONFIG_NET_ETHERNET=y CONFIG_MACB=y # CONFIG_NETDEV_1000 is not set @@ -106,6 +106,7 @@ CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y @@ -129,6 +130,7 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y @@ -143,11 +145,14 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=850 +CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y @@ -157,7 +162,6 @@ CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SMB_FS=m CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -168,4 +172,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig index a7536b8afb46..4fb01f5ab42f 100644 --- a/arch/avr32/configs/atstk1002_defconfig +++ b/arch/avr32/configs/atstk1002_defconfig @@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set @@ -26,6 +25,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -35,6 +35,7 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m CONFIG_NET_IPGRE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -58,16 +59,14 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_M25P80=m CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m -CONFIG_EEPROM_AT24=m # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m @@ -120,7 +119,6 @@ CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m # CONFIG_SND_SUPPORT_OLD_API is not set # CONFIG_SND_VERBOSE_PROCFS is not set -# CONFIG_SND_DRIVERS is not set CONFIG_SND_AT73C213=m # CONFIG_HID_SUPPORT is not set CONFIG_USB_GADGET=y @@ -131,16 +129,15 @@ CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -149,20 +146,23 @@ CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -# CONFIG_JFFS2_FS_WRITEBUFFER is not set CONFIG_UBIFS_FS=y -CONFIG_MINIX_FS=m CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y +CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y @@ -170,6 +170,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -# CONFIG_CRYPTO_HW is not set -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig index e19662af22f0..9faaf9b900f2 100644 --- a/arch/avr32/configs/atstk1003_defconfig +++ b/arch/avr32/configs/atstk1003_defconfig @@ -2,18 +2,11 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set -# CONFIG_SLUB_DEBUG is not set # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=m @@ -33,6 +26,7 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -54,18 +48,18 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_M25P80=m +CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m -CONFIG_EEPROM_AT24=m # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m +# CONFIG_SCSI_LOWLEVEL is not set CONFIG_ATA=m # CONFIG_SATA_PMP is not set CONFIG_PATA_AT32=m @@ -77,6 +71,7 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m CONFIG_INPUT=m +CONFIG_INPUT_EVDEV=m # CONFIG_KEYBOARD_ATKBD is not set CONFIG_KEYBOARD_GPIO=m # CONFIG_MOUSE_PS2 is not set @@ -106,7 +101,6 @@ CONFIG_SND_PCM_OSS=m CONFIG_SND_AT73C213=m # CONFIG_HID_SUPPORT is not set CONFIG_USB_GADGET=y -CONFIG_USB_GADGET_DEBUG_FS=y CONFIG_USB_ZERO=m CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m @@ -116,36 +110,39 @@ CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m -CONFIG_LEDS_GPIO=y +CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=y -CONFIG_LEDS_TRIGGER_DEFAULT_ON=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y -CONFIG_DW_DMAC=y -CONFIG_EXT2_FS=m -CONFIG_EXT3_FS=m +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y -CONFIG_CONFIGFS_FS=m +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y +CONFIG_UBIFS_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -CONFIG_CRC_T10DIF=m diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig index 329e10ba3b54..3d2a5d85f970 100644 --- a/arch/avr32/configs/atstk1004_defconfig +++ b/arch/avr32/configs/atstk1004_defconfig @@ -1,19 +1,32 @@ CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_BASE_FULL is not set -# CONFIG_FUTEX is not set -# CONFIG_EPOLL is not set -# CONFIG_SIGNALFD is not set -# CONFIG_TIMERFD is not set -# CONFIG_EVENTFD is not set # CONFIG_COMPAT_BRK is not set -CONFIG_SLOB=y -# CONFIG_BLOCK is not set +CONFIG_PROFILING=y +CONFIG_OPROFILE=m +# CONFIG_KPROBES is not set +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_BOARD_ATSTK1004=y # CONFIG_OWNERSHIP_TRACE is not set +CONFIG_NMI_DEBUGGING=y +CONFIG_PM=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_AT32AP=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -31,40 +44,104 @@ CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_CHAR=y +CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -# CONFIG_MISC_DEVICES is not set -# CONFIG_INPUT is not set +CONFIG_MTD_UBI=y +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y +CONFIG_ATMEL_PWM=m +CONFIG_ATMEL_TCLIB=y +CONFIG_ATMEL_SSC=m +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SR=m +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=m +# CONFIG_SATA_PMP is not set +CONFIG_PATA_AT32=m +CONFIG_NETDEVICES=y +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set +CONFIG_PPP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_BSDCOMP=m +CONFIG_INPUT=m +CONFIG_INPUT_EVDEV=m +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GPIO=m +# CONFIG_MOUSE_PS2 is not set +CONFIG_MOUSE_GPIO=m # CONFIG_SERIO is not set # CONFIG_VT is not set # CONFIG_DEVKMEM is not set CONFIG_SERIAL_ATMEL=y CONFIG_SERIAL_ATMEL_CONSOLE=y -# CONFIG_SERIAL_ATMEL_PDC is not set # CONFIG_LEGACY_PTYS is not set # CONFIG_HW_RANDOM is not set +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_GPIO=m CONFIG_SPI=y CONFIG_SPI_ATMEL=y +CONFIG_SPI_SPIDEV=m +CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_AT32AP700X_WDT=y CONFIG_FB=y CONFIG_FB_ATMEL=y CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_LTV350QV=y # CONFIG_BACKLIGHT_CLASS_DEVICE is not set CONFIG_USB_GADGET=y -CONFIG_USB_ETH=y -# CONFIG_USB_ETH_RNDIS is not set +CONFIG_USB_ZERO=m +CONFIG_USB_ETH=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FILE_STORAGE=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_MMC=y +CONFIG_MMC_TEST=m +CONFIG_MMC_ATMELMCI=y +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_ATMEL_PWM=m +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_HEARTBEAT=m CONFIG_RTC_CLASS=y -# CONFIG_RTC_INTF_PROC is not set CONFIG_RTC_DRV_AT32AP700X=y +CONFIG_DMADEVICES=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +# CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +# CONFIG_EXT4_FS_XATTR is not set # CONFIG_DNOTIFY is not set +CONFIG_FUSE_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y -# CONFIG_PROC_PAGE_MONITOR is not set CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y -# CONFIG_JFFS2_FS_WRITEBUFFER is not set +CONFIG_UBIFS_FS=y # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_FRAME_POINTER=y diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig index ef76b4fb8658..1ed8f22d4fe2 100644 --- a/arch/avr32/configs/atstk1006_defconfig +++ b/arch/avr32/configs/atstk1006_defconfig @@ -3,7 +3,6 @@ CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y # CONFIG_SYSCTL_SYSCALL is not set @@ -37,6 +36,7 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m CONFIG_NET_IPGRE=m CONFIG_INET_AH=m CONFIG_INET_ESP=m @@ -60,15 +60,13 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP=y -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_M25P80=m CONFIG_MTD_NAND=y CONFIG_MTD_NAND_ATMEL=y CONFIG_MTD_UBI=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=m +CONFIG_MISC_DEVICES=y CONFIG_ATMEL_PWM=m CONFIG_ATMEL_TCLIB=y CONFIG_ATMEL_SSC=m @@ -132,17 +130,17 @@ CONFIG_USB_ETH=m CONFIG_USB_GADGETFS=m CONFIG_USB_FILE_STORAGE=m CONFIG_USB_G_SERIAL=m +CONFIG_USB_CDC_COMPOSITE=m CONFIG_MMC=y +CONFIG_MMC_TEST=m CONFIG_MMC_ATMELMCI=y -CONFIG_MMC_SPI=m CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_ATMEL_PWM=m CONFIG_LEDS_GPIO=m CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_AT32AP700X=y CONFIG_DMADEVICES=y @@ -156,15 +154,18 @@ CONFIG_EXT4_FS=y CONFIG_FUSE_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=850 CONFIG_PROC_KCORE=y CONFIG_TMPFS=y +CONFIG_CONFIGFS_FS=y CONFIG_JFFS2_FS=y CONFIG_UBIFS_FS=y -CONFIG_MINIX_FS=m CONFIG_NFS_FS=y CONFIG_NFS_V3=y CONFIG_ROOT_NFS=y +CONFIG_CIFS=m CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_850=m CONFIG_NLS_ISO8859_1=m CONFIG_NLS_UTF8=m CONFIG_MAGIC_SYSRQ=y @@ -172,7 +173,3 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_HUNG_TASK=y CONFIG_FRAME_POINTER=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -CONFIG_CRYPTO_FIPS=y -# CONFIG_CRYPTO_HW is not set -CONFIG_CRC_T10DIF=m -- cgit From c217649bf2d60ac119afd71d938278cffd55962b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:53:46 +0000 Subject: dm: dont take i_mutex to change device size No longer needlessly hold md->bdev->bd_inode->i_mutex when changing the size of a DM device. This additional locking is unnecessary because i_size_write() is already protected by the existing critical section in dm_swap_table(). DM already has a reference on md->bdev so the associated bd_inode may be changed without lifetime concerns. A negative side-effect of having held md->bdev->bd_inode->i_mutex was that a concurrent DM device resize and flush (via fsync) would deadlock. Dropping md->bdev->bd_inode->i_mutex eliminates this potential for deadlock. The following reproducer no longer deadlocks: https://www.redhat.com/archives/dm-devel/2009-July/msg00284.html Signed-off-by: Mike Snitzer Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Cc: stable@kernel.org --- drivers/md/dm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f48a2f359ac4..4ef066a3090f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1992,13 +1992,14 @@ static void event_callback(void *context) wake_up(&md->eventq); } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->bdev->bd_inode->i_mutex); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->bdev->bd_inode->i_mutex); } /* -- cgit From 09c9d4c9b6a2b5909ae3c6265e4cd3820b636863 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:59:46 +0000 Subject: dm mpath: disable blk_abort_queue Revert commit 224cb3e981f1b2f9f93dbd49eaef505d17d894c2 dm: Call blk_abort_queue on failed paths Multipath began to use blk_abort_queue() to allow for lower latency path deactivation. This was found to cause list corruption: the cmd gets blk_abort_queued/timedout run on it and the scsi eh somehow is able to complete and run scsi_queue_insert while scsi_request_fn is still trying to process the request. https://www.redhat.com/archives/dm-devel/2010-November/msg00085.html Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon Cc: Mike Anderson Cc: Mike Christie Cc: stable@kernel.org --- drivers/md/dm-mpath.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda90ad4..406091f9692b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -33,7 +33,6 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct deactivate_path; struct work_struct activate_path; }; @@ -116,7 +115,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static void deactivate_path(struct work_struct *work); /*----------------------------------------------- @@ -129,7 +127,6 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->deactivate_path, deactivate_path); INIT_WORK(&pgpath->activate_path, activate_path); } @@ -141,14 +138,6 @@ static void free_pgpath(struct pgpath *pgpath) kfree(pgpath); } -static void deactivate_path(struct work_struct *work) -{ - struct pgpath *pgpath = - container_of(work, struct pgpath, deactivate_path); - - blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); -} - static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; @@ -995,7 +984,6 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); schedule_work(&m->trigger_event); - queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); -- cgit From d9bf0b508ddfe19883b982b29a03c02ccbf53806 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: dm io: remove BIO_RW_SYNCIO flag from kcopyd Remove the REQ_SYNC flag to improve write throughput when writing to the origin with a snapshot on the same device (using the CFQ I/O scheduler). Sequential write throughput (chunksize of 4k, 32k, 512k) unpatched: 8.5, 8.6, 9.3 MB/s patched: 15.2, 18.5, 17.5 MB/s Snapshot exception reallocations are triggered by writes that are usually async, so mark the associated dm_io_request as async as well. This helps when using the CFQ I/O scheduler because it has separate queues for sync and async I/O. Async is optimized for throughput; sync for latency. With this change we're consciously favoring throughput over latency. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587bac5682..5ad9231c8700 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -345,7 +345,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, + .bi_rw = job->rw | REQ_UNPLUG, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, -- cgit From 84c89557a302e18414a011cc52b1abd034860743 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: dm ioctl: allow rename to fill empty uuid Allow the uuid of a mapped device to be set after device creation. Previously the uuid (which is optional) could only be set by DM_DEV_CREATE. If no uuid was supplied it could not be set later. Sometimes it's necessary to create the device before the uuid is known, and in such cases the uuid must be filled in after the creation. This patch extends DM_DEV_RENAME to accept a uuid accompanied by a new flag DM_UUID_FLAG. This can only be done once and if no uuid was previously supplied. It cannot be used to change an existing uuid. DM_VERSION_MINOR is also bumped to 19 to indicate this interface extension is available. Signed-off-by: Peter Jones Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 103 +++++++++++++++++++++++++++++++++++------------ include/linux/dm-ioctl.h | 12 ++++-- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4b54618b4159..f3481d922206 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -295,19 +295,55 @@ retry: DMWARN("remove_all left %d open device(s)", dev_skipped); } +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already-existing name %s -> %s", + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-EBUSY); } @@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non-existent device %s -> %s", - param->name, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. + * Does this device already have a uuid? */ - list_del(&hc->name_list); - old_name = hc->name; - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. @@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size) || - strlen(new_name) > DM_NAME_LEN - 1) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - md = dm_hash_rename(param, new_name); + md = dm_hash_rename(param, new_data); if (IS_ERR(md)) return PTR_ERR(md); diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab360d5d4..e453e1174ae1 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 +#define DM_VERSION_MINOR 19 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit From 5fc2ffeabb9ee0fc0e71ff16b49f34f0ed3d05b4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:59:48 +0000 Subject: dm raid1: support discard Enable discard support in the DM mirror target. Also change an existing use of 'bvec' to 'addr' in the union. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 19a59b041c27..c87756eb7a21 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_request io_req = { .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, - .mem.ptr.bvec = NULL, + .mem.ptr.addr = NULL, .client = ms->io_client, }; @@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) map_region(dest++, m, bio); @@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (bio->bi_rw & REQ_FLUSH) { + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { bio_list_add(&sync, bio); continue; } @@ -1076,6 +1083,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); if (!ms->kmirrord_wq) { -- cgit From 4a1aeb98297e17f4e0a8cdda919e63bf528b2e5d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:48 +0000 Subject: dm: remove dm_mutex after bkl conversion This patch replaces dm_mutex with _minor_lock in dm_blk_close() and then removes it. During the BKL conversion, commit 6e9624b8caec290d28b4c6d9ec75749df6372b87 (block: push down BKL into .open and .release) pushed lock_kernel() down into dm_blk_open/close calls. Commit 2a48fc0ab24241755dc93bfd4f01d68efab47f5a (block: autoconvert trivial BKL users to private mutex) converted it to a local mutex, but _minor_lock is sufficient. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4ef066a3090f..0de692176ad4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -32,7 +32,6 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 -static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; - mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); - mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; - mutex_lock(&dm_mutex); + spin_lock(&_minor_lock); + atomic_dec(&md->open_count); dm_put(md); - mutex_unlock(&dm_mutex); + + spin_unlock(&_minor_lock); return 0; } -- cgit From 69a8cfcda21017364df1c21b720daf304b5598a6 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:49 +0000 Subject: dm crypt: set key size early Simplify key size verification (hexadecimal string) and set key size early in constructor. (Patch required by later changes.) Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5b0e4c0e702..4c4408a2602f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -973,15 +973,15 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) static int crypt_set_key(struct crypt_config *cc, char *key) { - unsigned key_size = strlen(key) >> 1; - - if (cc->key_size && cc->key_size != key_size) + /* The key size may not be changed. */ + if (cc->key_size != (strlen(key) >> 1)) return -EINVAL; - cc->key_size = key_size; /* initial settings */ + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + return -EINVAL; - if ((!key_size && strcmp(key, "-")) || - (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) return -EINVAL; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); @@ -1194,6 +1194,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate encryption context"; return -ENOMEM; } + cc->key_size = key_size; ti->private = cc; ret = crypt_ctr_cipher(ti, argv[0], argv[1]); -- cgit From 4a038677df4da84e42fd68b5ab2dfa4d82baa444 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:49 +0000 Subject: dm log userspace: trap all failed log construction errors When constructing a mirror log, it is possible for the initial request to fail for other reasons besides -ESRCH. These must be handled too. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094f064b..1c25ad3d02a2 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -181,8 +181,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, ctr_str, str_size, NULL, NULL); - if (r == -ESRCH) { - DMERR("Userspace log server not found"); + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); goto out; } @@ -214,10 +217,9 @@ out: static void userspace_dtr(struct dm_dirty_log *log) { - int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, NULL, 0, NULL, NULL); -- cgit From 8d35d3e37eed884ba15229a146df846f399909b4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 13 Jan 2011 19:59:50 +0000 Subject: dm kcopyd: delay unplugging Make kcopyd merge more I/O requests by using device unplugging. Without this patch, each I/O request is dispatched separately to the device. If the device supports tagged queuing, there are many small requests sent to the device. To improve performance, this patch will batch as many requests as possible, allowing the queue to merge consecutive requests, and send them to the device at once. In my tests (15k SCSI disk), this patch improves sequential write throughput: Sequential write throughput (chunksize of 4k, 32k, 512k) unpatched: 15.2, 18.5, 17.5 MB/s patched: 14.4, 22.6, 23.0 MB/s In most common uses (snapshot or two-way mirror), kcopyd is only used for two devices, one for reading and the other for writing, thus this optimization is implemented only for two devices. The optimization may be extended to n-way mirrors with some code complexity increase. We keep track of two block devices to unplug (one for read and the other for write) and unplug them when exiting "do_work" thread. If there are more devices used (in theory it could happen, in practice it is rare), we unplug immediately. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 5ad9231c8700..dad32f8bce7d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -37,6 +37,13 @@ struct dm_kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; + /* + * Block devices to unplug. + * Non-NULL pointer means that a block device has some pending requests + * and needs to be unplugged. + */ + struct block_device *unplug[2]; + struct dm_io_client *io_client; wait_queue_head_t destroyq; @@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job) return 0; } +/* + * Unplug the block device at the specified index. + */ +static void unplug(struct dm_kcopyd_client *kc, int rw) +{ + if (kc->unplug[rw] != NULL) { + blk_unplug(bdev_get_queue(kc->unplug[rw])); + kc->unplug[rw] = NULL; + } +} + +/* + * Prepare block device unplug. If there's another device + * to be unplugged at the same array index, we unplug that + * device first. + */ +static void prepare_unplug(struct dm_kcopyd_client *kc, int rw, + struct block_device *bdev) +{ + if (likely(kc->unplug[rw] == bdev)) + return; + unplug(kc, rw); + kc->unplug[rw] = bdev; +} + static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; @@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_UNPLUG, + .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, @@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job) .client = job->kc->io_client, }; - if (job->rw == READ) + if (job->rw == READ) { r = dm_io(&io_req, 1, &job->source, NULL); - else + prepare_unplug(job->kc, READ, job->source.bdev); + } else { + if (job->num_dests > 1) + io_req.bi_rw |= REQ_UNPLUG; r = dm_io(&io_req, job->num_dests, job->dests, NULL); + if (!(io_req.bi_rw & REQ_UNPLUG)) + prepare_unplug(job->kc, WRITE, job->dests[0].bdev); + } return r; } @@ -435,10 +473,18 @@ static void do_work(struct work_struct *work) * Pages jobs when successful will jump onto the io jobs * list. io jobs call wake when they complete and it all * starts again. + * + * Note that io_jobs add block devices to the unplug array, + * this array is cleared with "unplug" calls. It is thus + * forbidden to run complete_jobs after io_jobs and before + * unplug because the block device could be destroyed in + * job completion callback. */ process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); + unplug(kc, READ); + unplug(kc, WRITE); } /* @@ -619,6 +665,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages, INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); + memset(kc->unplug, 0, sizeof(kc->unplug)); + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; -- cgit From 909cc4fb48dd9870f6ebe4bd32cfbe37c102df62 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:50 +0000 Subject: dm log userspace: split flush queue Split the 'flush_list', which contained a mix of both 'mark' and 'clear' requests, into two distinct lists ('mark_list' and 'clear_list'). The device mapper log implementations (used by various DM targets) are allowed to cache 'mark' and 'clear' requests until a 'flush' is received. Until now, these cached requests were kept in the same list. They will now be put into distinct lists to facilitate group processing of these requests (in the next patch). Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 41 +++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1c25ad3d02a2..767adf300fa5 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -37,8 +37,15 @@ struct log_c { */ uint64_t in_sync_hint; + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. + */ spinlock_t flush_lock; - struct list_head flush_list; /* only for clear and mark requests */ + struct list_head mark_list; + struct list_head clear_list; }; static mempool_t *flush_entry_pool; @@ -169,7 +176,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, strncpy(lc->uuid, argv[0], DM_UUID_LEN); spin_lock_init(&lc->flush_lock); - INIT_LIST_HEAD(&lc->flush_list); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); if (str_size < 0) { @@ -362,14 +370,16 @@ static int userspace_flush(struct dm_dirty_log *log) int r = 0; unsigned long flags; struct log_c *lc = log->context; - LIST_HEAD(flush_list); + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); struct flush_entry *fe, *tmp_fe; spin_lock_irqsave(&lc->flush_lock, flags); - list_splice_init(&lc->flush_list, &flush_list); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); - if (list_empty(&flush_list)) + if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; /* @@ -379,7 +389,16 @@ static int userspace_flush(struct dm_dirty_log *log) * do it one by one. */ - list_for_each_entry(fe, &flush_list, list) { + list_for_each_entry(fe, &mark_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + goto fail; + } + + list_for_each_entry(fe, &clear_list, list) { r = userspace_do_request(lc, lc->uuid, fe->type, (char *)&fe->region, sizeof(fe->region), @@ -397,7 +416,11 @@ fail: * Calling code will receive an error and will know that * the log facility has failed. */ - list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { list_del(&fe->list); mempool_free(fe, flush_entry_pool); } @@ -427,7 +450,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_MARK_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->mark_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -464,7 +487,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_CLEAR_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; -- cgit From 085ae0651b2791f3a430ddb76da92925b9952e13 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:51 +0000 Subject: dm log userspace: group clear and mark requests Allow the device-mapper log's 'mark' and 'clear' requests to be grouped and processed in a batch. This can significantly reduce the amount of traffic going between the kernel and userspace (where the processing daemon resides). Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 102 ++++++++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 767adf300fa5..31e1687e7bf6 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -18,6 +18,14 @@ struct flush_entry { struct list_head list; }; +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + struct log_c { struct dm_target *ti; uint32_t region_size; @@ -348,6 +356,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, return (r) ? 0 : (int)in_sync; } +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_del(&fe->list); + list_add(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* Group send failed. Attempt one-by-one. */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + /* * userspace_flush * @@ -382,30 +455,13 @@ static int userspace_flush(struct dm_dirty_log *log) if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; - /* - * FIXME: Count up requests, group request types, - * allocate memory to stick all requests in and - * send to server in one go. Failing the allocation, - * do it one by one. - */ - - list_for_each_entry(fe, &mark_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } + r = flush_by_group(lc, &mark_list); + if (r) + goto fail; - list_for_each_entry(fe, &clear_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } + r = flush_by_group(lc, &clear_list); + if (r) + goto fail; r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); -- cgit From 86a54a4802df10d23ccd655e2083e812fe990243 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: dm log userspace: add version number to comms This patch adds a 'version' field to the 'dm_ulog_request' structure. The 'version' field is taken from a portion of the unused 'padding' field in the 'dm_ulog_request' structure. This was done to avoid changing the size of the structure and possibly disrupting backwards compatibility. The version number will help notify user-space daemons when a change has been made to the kernel/userspace log API. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 6 ++++-- drivers/md/dm-log-userspace-transfer.c | 1 + include/linux/dm-log-userspace.h | 13 ++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 31e1687e7bf6..aa2e0c374ab3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,6 +12,8 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; @@ -765,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -775,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f5..049eaf12aaab 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2110c4..eeace7d3ff15 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ -- cgit From 7dbcd137414f3877737802438926d6dba7906a9a Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: dm crypt: simplify compatible table output Rename cc->cipher_mode to cc->cipher_string and store the whole of the cipher information so it can easily be printed when processing the DM_DEV_STATUS ioctl. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4c4408a2602f..9a896e1cb2ea 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -108,7 +108,7 @@ struct crypt_config { struct workqueue_struct *crypt_queue; char *cipher; - char *cipher_mode; + char *cipher_string; struct crypt_iv_operations *iv_gen_ops; union { @@ -1030,7 +1030,7 @@ static void crypt_dtr(struct dm_target *ti) dm_put_device(ti, cc->dev); kzfree(cc->cipher); - kzfree(cc->cipher_mode); + kzfree(cc->cipher_string); /* Must zero key material before freeing */ kzfree(cc); @@ -1050,6 +1050,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + /* * Legacy dm-crypt cipher specification * cipher-mode-iv:ivopts @@ -1061,12 +1065,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (!cc->cipher) goto bad_mem; - if (tmp) { - cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); - if (!cc->cipher_mode) - goto bad_mem; - } - chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); @@ -1074,10 +1072,11 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt mappings */ + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - kfree(cc->cipher_mode); - cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } @@ -1307,10 +1306,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->cipher_mode) - DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); - else - DMEMIT("%s ", cc->cipher); + DMEMIT("%s ", cc->cipher_string); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1422,7 +1418,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit From c029772125594e31eb1a5ad9e0913724ed9891f2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 13 Jan 2011 19:59:53 +0000 Subject: dm crypt: scale to multiple cpus Currently dm-crypt does all the encryption work for a single dm-crypt mapping in a single workqueue. This does not scale well when multiple CPUs are submitting IO at a high rate. The single CPU running the single thread cannot keep up with the encryption and encrypted IO performance tanks. This patch changes the crypto workqueue to be per CPU. This means that as long as the IO submitter (or the interrupt target CPUs for reads) runs on different CPUs the encryption work will be also parallel. To avoid a bottleneck on the IO worker I also changed those to be per-CPU threads. There is still some shared data, so I suspect some bouncing cache lines. But I haven't done a detailed study on that yet. Signed-off-by: Andi Kleen Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 254 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 196 insertions(+), 58 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9a896e1cb2ea..50ae6ef83738 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -77,7 +78,6 @@ struct crypt_iv_operations { }; struct iv_essiv_private { - struct crypto_cipher *tfm; struct crypto_hash *hash_tfm; u8 *salt; }; @@ -91,6 +91,22 @@ struct iv_benbi_private { * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * Duplicated per-CPU state for cipher. + */ +struct crypt_cpu { + struct ablkcipher_request *req; + struct crypto_ablkcipher *tfm; + + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; +}; + +/* + * The fields in here must be read only after initialization, + * changing state should be in crypt_cpu. + */ struct crypt_config { struct dm_dev *dev; sector_t start; @@ -118,6 +134,12 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; + /* + * Duplicated per cpu state. Access through + * per_cpu_ptr() only. + */ + struct crypt_cpu __percpu *cpu; + /* * Layout of each crypto request: * @@ -132,9 +154,7 @@ struct crypt_config { * correctly aligned. */ unsigned int dmreq_start; - struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; u8 key[0]; @@ -149,6 +169,19 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) +{ + return this_cpu_ptr(cc->cpu); +} + +/* + * Use this to access cipher attributes that are the same for each CPU. + */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return __this_cpu_ptr(cc->cpu)->tfm; +} + /* * Different IV generation algorithms: * @@ -195,7 +228,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - int err; + struct crypto_cipher *essiv_tfm; + int err, cpu; sg_init_one(&sg, cc->key, cc->key_size); desc.tfm = essiv->hash_tfm; @@ -205,8 +239,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) if (err) return err; - return crypto_cipher_setkey(essiv->tfm, essiv->salt, + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + } + + return 0; } /* Wipe salt and reset key derived from volume key */ @@ -214,24 +256,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int cpu, r, err = 0; memset(essiv->salt, 0, salt_size); - return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + } + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; + + /* Setup the essiv_tfm with the given salt */ + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(essiv_tfm)) { + ti->error = "Error allocating crypto tfm for ESSIV"; + return essiv_tfm; + } + + if (crypto_cipher_blocksize(essiv_tfm) != + crypto_ablkcipher_ivsize(any_tfm(cc))) { + ti->error = "Block size of ESSIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(-EINVAL); + } + + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); + if (err) { + ti->error = "Failed to set key for ESSIV cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(err); + } + + return essiv_tfm; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { + int cpu; + struct crypt_cpu *cpu_cc; + struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_cipher(essiv->tfm); - essiv->tfm = NULL; - crypto_free_hash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); essiv->salt = NULL; + + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + essiv_tfm = cpu_cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cpu_cc->iv_private = NULL; + } } static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -240,7 +334,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, struct crypto_cipher *essiv_tfm = NULL; struct crypto_hash *hash_tfm = NULL; u8 *salt = NULL; - int err; + int err, cpu; if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; @@ -262,30 +356,22 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, goto bad; } - /* Allocate essiv_tfm */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - err = PTR_ERR(essiv_tfm); - goto bad; - } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(cc->tfm)) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; - err = -EINVAL; - goto bad; - } - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.tfm = essiv_tfm; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + for_each_possible_cpu(cpu) { + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; + } + return 0; bad: - if (essiv_tfm && !IS_ERR(essiv_tfm)) - crypto_free_cipher(essiv_tfm); if (hash_tfm && !IS_ERR(hash_tfm)) crypto_free_hash(hash_tfm); kfree(salt); @@ -294,16 +380,19 @@ bad: static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { + struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; + memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + return 0; } static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -412,7 +501,7 @@ static int crypt_convert_block(struct crypt_config *cc, dmreq = dmreq_of_req(cc, req); iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(cc->tfm) + 1); + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); @@ -454,16 +543,19 @@ static int crypt_convert_block(struct crypt_config *cc, static void kcryptd_async_done(struct crypto_async_request *async_req, int error); + static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (!cc->req) - cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(cc->req, cc->tfm); - ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, - dmreq_of_req(cc, cc->req)); + struct crypt_cpu *this_cc = this_crypt_config(cc); + + if (!this_cc->req) + this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfm); + ablkcipher_request_set_callback(this_cc->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); } /* @@ -472,6 +564,7 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->pending, 1); @@ -483,7 +576,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->pending); - r = crypt_convert_block(cc, ctx, cc->req); + r = crypt_convert_block(cc, ctx, this_cc->req); switch (r) { /* async */ @@ -492,7 +585,7 @@ static int crypt_convert(struct crypt_config *cc, INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - cc->req = NULL; + this_cc->req = NULL; ctx->sector++; continue; @@ -651,6 +744,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * They must be separated as otherwise the final stages could be * starved by new requests which can block in the first stages due * to memory allocation. + * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. */ static void crypt_endio(struct bio *clone, int error) { @@ -971,6 +1067,20 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + int cpu, err = 0, r; + + for_each_possible_cpu(cpu) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfm, + cc->key, cc->key_size); + if (r) + err = r; + } + + return err; +} + static int crypt_set_key(struct crypt_config *cc, char *key) { /* The key size may not be changed. */ @@ -986,19 +1096,22 @@ static int crypt_set_key(struct crypt_config *cc, char *key) set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + return crypt_setkey_allcpus(cc); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + + return crypt_setkey_allcpus(cc); } static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; + struct crypt_cpu *cpu_cc; + int cpu; ti->private = NULL; @@ -1010,6 +1123,15 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); + if (cc->cpu) + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + if (cpu_cc->req) + mempool_free(cpu_cc->req, cc->req_pool); + if (cpu_cc->tfm) + crypto_free_ablkcipher(cpu_cc->tfm); + } + if (cc->bs) bioset_free(cc->bs); @@ -1023,12 +1145,12 @@ static void crypt_dtr(struct dm_target *ti) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - if (cc->tfm && !IS_ERR(cc->tfm)) - crypto_free_ablkcipher(cc->tfm); - if (cc->dev) dm_put_device(ti, cc->dev); + if (cc->cpu) + free_percpu(cc->cpu); + kzfree(cc->cipher); kzfree(cc->cipher_string); @@ -1040,9 +1162,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; + struct crypto_ablkcipher *tfm; char *tmp, *cipher, *chainmode, *ivmode, *ivopts; char *cipher_api = NULL; - int ret = -EINVAL; + int cpu, ret = -EINVAL; /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { @@ -1072,6 +1195,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); + cc->cpu = alloc_percpu(struct crypt_cpu); + if (!cc->cpu) { + ti->error = "Cannot allocate per cpu state"; + goto bad_mem; + } + /* * For compatibility with the original dm-crypt mapping format, if * only the cipher name is supplied, use cbc-plain. @@ -1098,11 +1227,14 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Allocate cipher */ - cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); - if (IS_ERR(cc->tfm)) { - ret = PTR_ERR(cc->tfm); - ti->error = "Error allocating crypto tfm"; - goto bad; + for_each_possible_cpu(cpu) { + tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + ti->error = "Error allocating crypto tfm"; + goto bad; + } + per_cpu_ptr(cc->cpu, cpu)->tfm = tfm; } /* Initialize and set key */ @@ -1113,7 +1245,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1208,9 +1340,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + @@ -1219,7 +1351,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate crypt request mempool"; goto bad; } - cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { @@ -1252,13 +1383,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->start = tmpll; ret = -ENOMEM; - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } - cc->crypt_queue = create_singlethread_workqueue("kcryptd"); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_NON_REENTRANT| + WQ_CPU_INTENSIVE| + WQ_MEM_RECLAIM, + 1); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -1418,7 +1556,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit From 20c82538e4f5ede51bc2b4795bc6e5cae772796d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:53 +0000 Subject: dm crypt: use io thread for reads only if mempool exhausted If there is enough memory, code can directly submit bio instead queing this operation in separate thread. Try to alloc bio clone with GFP_NOWAIT and only if it fails use separate queue (map function cannot block here). Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 50ae6ef83738..dc4403bcc6a0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -787,26 +787,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_io_read(struct dm_crypt_io *io) +static void kcryptd_unplug(struct crypt_config *cc) +{ + blk_unplug(bdev_get_queue(cc->dev->bdev)); +} + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - crypt_inc_pending(io); - /* * The block layer might modify the bvec array, so always * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. */ - clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); - if (unlikely(!clone)) { - io->error = -ENOMEM; - crypt_dec_pending(io); - return; + clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + if (!clone) { + kcryptd_unplug(cc); + return 1; } + crypt_inc_pending(io); + clone_init(io, clone); clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); @@ -816,6 +820,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); + return 0; } static void kcryptd_io_write(struct dm_crypt_io *io) @@ -828,9 +833,12 @@ static void kcryptd_io(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_io_read(io); - else + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else kcryptd_io_write(io); } @@ -1424,9 +1432,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_queue_io(io); - else + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else kcryptd_queue_crypt(io); return DM_MAPIO_SUBMITTED; -- cgit From 2dc5327d3acb3340ab6fa3981401b076b78a51f4 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:54 +0000 Subject: dm crypt: add post iv call to iv generator IV (initialisation vector) can in principle depend not only on sector but also on plaintext data (or other attributes). Change IV generator interface to work directly with dmreq structure to allow such dependence in generator. Also add post() function which is called after the crypto operation. This allows tricky modification of decrypted data or IV internals. In asynchronous mode the post() can be called after ctx->sector count was increased so it is needed to add iv_sector copy directly to dmreq structure. (N.B. dmreq always include only one sector in scatterlists) Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dc4403bcc6a0..e0ebe685be6a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -64,6 +64,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; + sector_t iv_sector; }; struct crypt_config; @@ -74,7 +75,10 @@ struct crypt_iv_operations { void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_essiv_private { @@ -168,6 +172,7 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) { @@ -205,19 +210,20 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - sector_t sector) + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -378,12 +384,13 @@ bad: return err; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); crypto_cipher_encrypt_one(essiv_tfm, iv, iv); return 0; @@ -417,19 +424,21 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) { } -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { __be64 val; memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; } -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); @@ -489,6 +498,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); } +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); +} + static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) @@ -500,9 +516,9 @@ static int crypt_convert_block(struct crypt_config *cc, int r = 0; dmreq = dmreq_of_req(cc, req); - iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); + iv = iv_of_dmreq(cc, dmreq); + dmreq->iv_sector = ctx->sector; dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, @@ -525,7 +541,7 @@ static int crypt_convert_block(struct crypt_config *cc, } if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + r = cc->iv_gen_ops->generator(cc, iv, dmreq); if (r < 0) return r; } @@ -538,6 +554,9 @@ static int crypt_convert_block(struct crypt_config *cc, else r = crypto_ablkcipher_decrypt(req); + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, iv, dmreq); + return r; } @@ -1005,6 +1024,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, return; } + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); if (!atomic_dec_and_test(&ctx->pending)) -- cgit From d1f9642381847e2b94caa34c3533211cf36ffcf4 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:54 +0000 Subject: dm crypt: add multi key capability This patch adds generic multikey handling to be used in following patch for Loop-AES mode compatibility. This patch extends mapping table to optional keycount and implements generic multi-key capability. With more keys defined the string is divided into several sections and these are used for tfms. The tfm is used according to sector offset (sector 0->tfm[0], sector 1->tfm[1], sector N->tfm[N modulo keycount]) (only power of two values supported for keycount here). Because of tfms per-cpu allocation, this mode can be take a lot of memory on large smp systems. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon Cc: Max Vozeler --- Documentation/device-mapper/dm-crypt.txt | 7 ++- drivers/md/dm-crypt.c | 85 ++++++++++++++++++++++++-------- 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 524de926290d..59293ac4a5d0 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -8,7 +8,7 @@ Parameters: Encryption cipher and an optional IV generation mode. - (In format cipher-chainmode-ivopts:ivmode). + (In format cipher[:keycount]-chainmode-ivopts:ivmode). Examples: des aes-cbc-essiv:sha256 @@ -20,6 +20,11 @@ Parameters: Key used for encryption. It is encoded as a hexadecimal number. You can only use key sizes that are valid for the selected cipher. + + Multi-key compatibility mode. You can define keys and + then sectors are encrypted according to their offsets (sector 0 uses key0; + sector 1 uses key1 etc.). must be a power of two. + The IV offset is a sector count that is added to the sector number before creating the IV. diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e0ebe685be6a..b8b9267c4dbb 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -101,10 +101,9 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; */ struct crypt_cpu { struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; - /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; + struct crypto_ablkcipher *tfms[0]; }; /* @@ -143,6 +142,7 @@ struct crypt_config { * per_cpu_ptr() only. */ struct crypt_cpu __percpu *cpu; + unsigned tfms_count; /* * Layout of each crypto request: @@ -161,6 +161,7 @@ struct crypt_config { unsigned long flags; unsigned int key_size; + unsigned int key_parts; u8 key[0]; }; @@ -184,7 +185,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) */ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) { - return __this_cpu_ptr(cc->cpu)->tfm; + return __this_cpu_ptr(cc->cpu)->tfms[0]; } /* @@ -567,11 +568,12 @@ static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { struct crypt_cpu *this_cc = this_crypt_config(cc); + unsigned key_index = ctx->sector & (cc->tfms_count - 1); if (!this_cc->req) this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(this_cc->req, this_cc->tfm); + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); ablkcipher_request_set_callback(this_cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); @@ -1097,15 +1099,48 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } +static void crypt_free_tfms(struct crypt_config *cc, int cpu) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + + for (i = 0; i < cc->tfms_count; i++) + if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { + crypto_free_ablkcipher(cpu_cc->tfms[i]); + cpu_cc->tfms[i] = NULL; + } +} + +static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + int err; + + for (i = 0; i < cc->tfms_count; i++) { + cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cpu_cc->tfms[i])) { + err = PTR_ERR(cpu_cc->tfms[i]); + crypt_free_tfms(cc, cpu); + return err; + } + } + + return 0; +} + static int crypt_setkey_allcpus(struct crypt_config *cc) { - int cpu, err = 0, r; + unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + int cpu, err = 0, i, r; for_each_possible_cpu(cpu) { - r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfm, - cc->key, cc->key_size); - if (r) - err = r; + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], + cc->key + (i * subkey_size), subkey_size); + if (r) + err = r; + } } return err; @@ -1158,8 +1193,7 @@ static void crypt_dtr(struct dm_target *ti) cpu_cc = per_cpu_ptr(cc->cpu, cpu); if (cpu_cc->req) mempool_free(cpu_cc->req, cc->req_pool); - if (cpu_cc->tfm) - crypto_free_ablkcipher(cpu_cc->tfm); + crypt_free_tfms(cc, cpu); } if (cc->bs) @@ -1192,8 +1226,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; - struct crypto_ablkcipher *tfm; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; char *cipher_api = NULL; int cpu, ret = -EINVAL; @@ -1209,10 +1242,20 @@ static int crypt_ctr_cipher(struct dm_target *ti, /* * Legacy dm-crypt cipher specification - * cipher-mode-iv:ivopts + * cipher[:keycount]-mode-iv:ivopts */ tmp = cipher_in; - cipher = strsep(&tmp, "-"); + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; + return -EINVAL; + } + cc->key_parts = cc->tfms_count; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) @@ -1225,7 +1268,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - cc->cpu = alloc_percpu(struct crypt_cpu); + cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + + cc->tfms_count * sizeof(*(cc->cpu->tfms)), + __alignof__(struct crypt_cpu)); if (!cc->cpu) { ti->error = "Cannot allocate per cpu state"; goto bad_mem; @@ -1258,13 +1303,11 @@ static int crypt_ctr_cipher(struct dm_target *ti, /* Allocate cipher */ for_each_possible_cpu(cpu) { - tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); + ret = crypt_alloc_tfms(cc, cpu, cipher_api); + if (ret < 0) { ti->error = "Error allocating crypto tfm"; goto bad; } - per_cpu_ptr(cc->cpu, cpu)->tfm = tfm; } /* Initialize and set key */ @@ -1587,7 +1630,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit From 34745785937a2003c144c0d4802fa637470d87af Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:55 +0000 Subject: dm crypt: add loop aes iv generator This patch adds a compatible implementation of the block chaining mode used by the Loop-AES block device encryption system (http://loop-aes.sourceforge.net/) designed by Jari Ruusu. It operates on full 512 byte sectors and uses CBC with an IV derived from the sector number, the data and optionally extra IV seed. This means that after CBC decryption the first block of sector must be tweaked according to decrypted data. Loop-AES can use three encryption schemes: version 1: is plain aes-cbc mode (already compatible) version 2: uses 64 multikey scheme with own IV generator version 3: the same as version 2 with additional IV seed (it uses 65 keys, last key is used as IV seed) The IV generator is here named lmk (Loop-AES multikey) and for the cipher specification looks like: aes:64-cbc-lmk Version 2 and 3 is recognised according to length of provided multi-key string (which is just hexa encoded "raw key" used in original Loop-AES ioctl). Configuration of the device and decoding key string will be done in userspace (cryptsetup). (Loop-AES stores keys in gpg encrypted file, raw keys are output of simple hashing of lines in this file). Based on an implementation by Max Vozeler: http://article.gmane.org/gmane.linux.kernel.cryptoapi/3752/ Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon CC: Max Vozeler --- drivers/md/dm-crypt.c | 193 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b8b9267c4dbb..4e054bd91664 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include @@ -90,6 +93,12 @@ struct iv_benbi_private { int shift; }; +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -133,6 +142,7 @@ struct crypt_config { union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; + struct iv_lmk_private lmk; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -207,6 +217,20 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * null: the initial vector is always zero. Provides compatibility with * obsolete loop_fish2 devices. Do not use for new devices. * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -446,6 +470,156 @@ static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, return 0; } +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + u32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src, KM_USER0); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + kunmap_atomic(dst, KM_USER0); + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -472,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -1341,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; - else { + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + */ + if (cc->key_size % cc->key_parts) + cc->key_parts++; + } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; -- cgit From 810b492375f4aed5ce222982054adc0394a4bd33 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:55 +0000 Subject: dm ioctl: suppress needless warning messages The device-mapper should not send warning messages to syslog if a device is not found. This can be done by userspace according to the returned dm-ioctl error code. So move these messages to debug level and use rate limiting to not flood syslog. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f3481d922206..6d12775a1061 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -779,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -791,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) */ r = dm_lock_for_deletion(md); if (r) { - DMWARN("unable to remove open device %s", hc->name); + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); return r; @@ -938,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -1265,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } -- cgit From fecec20e55ec117a09857ac1a455e2e6e2f17df4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:56 +0000 Subject: dm snapshot: remove unused dm_snapshot queued_bios_work dm_snapshot->queued_bios_work isn't used. Remove ->queued_bios[_work] from dm_snapshot structure, the flush_queued_bios work function and ksnapd workqueue. The DM snapshot changes that were going to use the ksnapd workqueue were either superseded (fix for origin write races) or never completed (deallocation of invalid snapshot's memory via workqueue). Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 53cf79d8bcbc..0f47698beafa 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "dm-exception-store.h" @@ -106,10 +105,6 @@ struct dm_snapshot { struct dm_kcopyd_client *kcopyd_client; - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - /* Wait for events based on state_bits */ unsigned long state_bits; @@ -160,9 +155,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) } EXPORT_SYMBOL(dm_snap_cow); -static struct workqueue_struct *ksnapd; -static void flush_queued_bios(struct work_struct *work); - static sector_t chunk_to_sector(struct dm_exception_store *store, chunk_t chunk) { @@ -1153,9 +1145,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - ti->private = s; ti->num_flush_requests = num_flush_requests; @@ -1279,8 +1268,6 @@ static void snapshot_dtr(struct dm_target *ti) struct dm_snapshot *s = ti->private; struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - flush_workqueue(ksnapd); - down_read(&_origins_lock); /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); @@ -1342,20 +1329,6 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(struct work_struct *work) -{ - struct dm_snapshot *s = - container_of(work, struct dm_snapshot, queued_bios_work); - struct bio *queued_bios; - unsigned long flags; - - spin_lock_irqsave(&s->pe_lock, flags); - queued_bios = bio_list_get(&s->queued_bios); - spin_unlock_irqrestore(&s->pe_lock, flags); - - flush_bios(queued_bios); -} - static int do_origin(struct dm_dev *origin, struct bio *bio); /* @@ -2291,17 +2264,8 @@ static int __init dm_snapshot_init(void) goto bad_tracked_chunk_cache; } - ksnapd = create_singlethread_workqueue("ksnapd"); - if (!ksnapd) { - DMERR("Failed to create ksnapd workqueue."); - r = -ENOMEM; - goto bad_pending_pool; - } - return 0; -bad_pending_pool: - kmem_cache_destroy(tracked_chunk_cache); bad_tracked_chunk_cache: kmem_cache_destroy(pending_cache); bad_pending_cache: @@ -2322,8 +2286,6 @@ bad_register_snapshot_target: static void __exit dm_snapshot_exit(void) { - destroy_workqueue(ksnapd); - dm_unregister_target(&snapshot_target); dm_unregister_target(&origin_target); dm_unregister_target(&merge_target); -- cgit From d5ffa387e24646cb1cb55d80fd0f182a00e0edb7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:56 +0000 Subject: dm: dont use flush_scheduled_work flush_scheduled_work() is being deprecated. Flush the used work directly instead. In all dm targets, the only work which uses system_wq is ->trigger_event. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 406091f9692b..fcc59c3f756e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -920,7 +920,7 @@ static void flush_multipath_work(struct multipath *m) flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); - flush_scheduled_work(); + flush_work_sync(&m->trigger_event); } static void multipath_dtr(struct dm_target *ti) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index c87756eb7a21..0d58b6f875cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1138,7 +1138,7 @@ static void mirror_dtr(struct dm_target *ti) del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); - flush_scheduled_work(); + flush_work_sync(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); -- cgit From f521f074abe7b3990f5df65482cdc3d851b80665 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:57 +0000 Subject: dm stripe: switch from local workqueue to system_wq kstriped only serves sc->kstriped_ws which runs dm_table_event(). This doesn't need to be executed from an ordered workqueue w/ rescuer. Drop kstriped and use the system_wq instead. While at it, rename kstriped_ws to trigger_event so that it's consistent with other dm modules. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index f0371b4c4fbf..dddfa14f2982 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -39,23 +39,20 @@ struct stripe_c { struct dm_target *ti; /* Work struct used for triggering events*/ - struct work_struct kstriped_ws; + struct work_struct trigger_event; struct stripe stripe[0]; }; -static struct workqueue_struct *kstriped; - /* * An event is triggered whenever a drive * drops out of a stripe volume. */ static void trigger_event(struct work_struct *work) { - struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); - + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); dm_table_event(sc->ti->table); - } static inline struct stripe_c *alloc_context(unsigned int stripes) @@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - INIT_WORK(&sc->kstriped_ws, trigger_event); + INIT_WORK(&sc->trigger_event, trigger_event); /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; @@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); - flush_workqueue(kstriped); + flush_work_sync(&sc->trigger_event); kfree(sc); } @@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, atomic_inc(&(sc->stripe[i].error_count)); if (atomic_read(&(sc->stripe[i].error_count)) < DM_IO_ERROR_THRESHOLD) - queue_work(kstriped, &sc->kstriped_ws); + schedule_work(&sc->trigger_event); } return error; @@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -422,20 +419,10 @@ int __init dm_stripe_init(void) return r; } - kstriped = create_singlethread_workqueue("kstriped"); - if (!kstriped) { - DMERR("failed to create workqueue kstriped"); - dm_unregister_target(&stripe_target); - return -ENOMEM; - } - return r; } void dm_stripe_exit(void) { dm_unregister_target(&stripe_target); - destroy_workqueue(kstriped); - - return; } -- cgit From 4d4d66ab5322fa9b0f51842a76139387a40e1ce9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:57 +0000 Subject: dm: convert workqueues to alloc_ordered Convert all create[_singlethread]_work() users to the new alloc[_ordered]_workqueue(). This conversion is mechanical and doesn't introduce any behavior change. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-delay.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/dm-mpath.c | 5 +++-- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm.c | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa11912cc94..f18375dcedd9 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -352,7 +352,7 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = create_workqueue("kdelayd"); + kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!kdelayd_wq) { DMERR("Couldn't start kdelayd"); goto bad_queue; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index dad32f8bce7d..63d67169c7f4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -672,7 +672,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + kc->kcopyd_wq = alloc_ordered_workqueue("kcopyd", WQ_MEM_RECLAIM); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index fcc59c3f756e..35ab5781f88f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1675,7 +1675,7 @@ static int __init dm_multipath_init(void) return -EINVAL; } - kmultipathd = create_workqueue("kmpathd"); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); dm_unregister_target(&multipath_target); @@ -1689,7 +1689,8 @@ static int __init dm_multipath_init(void) * old workqueue would also create a bottleneck in the * path of the storage hardware device activation. */ - kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); destroy_workqueue(kmultipathd); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 0d58b6f875cc..39917430f9f8 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1085,7 +1085,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = 1; ti->num_discard_requests = 1; - ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + ms->kmirrord_wq = alloc_ordered_workqueue("kmirrord", WQ_MEM_RECLAIM); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2129cdb115dc..d3021a69c857 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); + ps->metadata_wq = alloc_ordered_workqueue("ksnaphd", WQ_MEM_RECLAIM); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0de692176ad4..39aaa9290128 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1883,7 +1883,7 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = create_singlethread_workqueue("kdmflush"); + md->wq = alloc_ordered_workqueue("kdmflush", WQ_MEM_RECLAIM); if (!md->wq) goto bad_thread; -- cgit From 9c4376de98719d2768dd919553843de34bb094a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:58 +0000 Subject: dm: use non reentrant workqueues if equivalent kmirrord_wq, kcopyd_work and md->wq are created per dm instance and serve only a single work item from the dm instance, so non-reentrant workqueues would provide the same ordering guarantees as ordered ones while allowing CPU affinity and use of the workqueues for other purposes. Switch them to non-reentrant workqueues. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 3 ++- drivers/md/dm-raid1.c | 5 +++-- drivers/md/dm.c | 3 ++- include/linux/dm-ioctl.h | 4 ++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 63d67169c7f4..924f5f0084c2 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -672,7 +672,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = alloc_ordered_workqueue("kcopyd", WQ_MEM_RECLAIM); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 39917430f9f8..dee326775c60 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1085,7 +1085,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = 1; ti->num_discard_requests = 1; - ms->kmirrord_wq = alloc_ordered_workqueue("kmirrord", WQ_MEM_RECLAIM); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1414,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39aaa9290128..e504bb40d60e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1883,7 +1883,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = alloc_ordered_workqueue("kdmflush", WQ_MEM_RECLAIM); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index e453e1174ae1..78bbf47bbb96 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -268,8 +268,8 @@ enum { #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit From 239c8dd533e74de4a7f3c85c4f9f430eb08867c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:59 +0000 Subject: dm snapshot: persistent make metadata_wq multithreaded metadata_wq serves on-stack work items from chunk_io(). Even if multiple chunk_io() are simultaneously in progress, each is independent and queued only once, so multithreaded workqueue can be safely used. Switch metadata_wq to multithread and flush the work item instead of the workqueue in chunk_io(). Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap-persistent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index d3021a69c857..95891dfcbca0 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_workqueue(ps->metadata_wq); + flush_work(&req.work); return req.result; } @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = alloc_ordered_workqueue("ksnaphd", WQ_MEM_RECLAIM); + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); -- cgit From b83b2f295aec418c7501c05df4dfd168a79d165a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:59:59 +0000 Subject: dm snapshot: avoid storing private suspended state Use dm_suspended() rather than having each snapshot target maintain a private 'suspended' flag in struct dm_snapshot. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 0f47698beafa..fdde53cd12b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -79,9 +79,6 @@ struct dm_snapshot { /* Origin writes don't trigger exceptions until this is set */ int active; - /* Whether or not owning mapped_device is suspended */ - int suspended; - atomic_t pending_exceptions_count; mempool_t *pending_pool; @@ -1102,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; s->active = 0; - s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); @@ -1733,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) stop_merge(s); } -static void snapshot_postsuspend(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - - down_write(&s->lock); - s->suspended = 1; - up_write(&s->lock); -} - static int snapshot_preresume(struct dm_target *ti) { int r = 0; @@ -1756,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti) DMERR("Unable to resume snapshot source until " "handover completes."); r = -EINVAL; - } else if (!snap_src->suspended) { + } else if (!dm_suspended(snap_src->ti)) { DMERR("Unable to perform snapshot handover until " "source is suspended."); r = -EINVAL; @@ -1789,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti) down_write(&s->lock); s->active = 1; - s->suspended = 0; up_write(&s->lock); } @@ -2167,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 7, 0}, + .version = {1, 7, 1}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -2180,13 +2166,12 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, @@ -2195,14 +2180,13 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_merge_map, .end_io = snapshot_end_io, .presuspend = snapshot_merge_presuspend, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_merge_resume, .status = snapshot_status, -- cgit From dbc883f1570d992ba926a8c9e22140ba473c6cc1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Jan 2011 20:00:00 +0000 Subject: dm log: use PTR_ERR value instead of ENOMEM It's nicer to return the PTR_ERR() value instead of just returning -ENOMEM. In the current code the PTR_ERR() value is always equal to -ENOMEM so this doesn't actually affect anything, but still... In addition, dm_dirty_log_create() doesn't check for a specific -ENOMEM return. So this change is safe relative to potential for a non -ENOMEM return in the future. Signed-off-by: Dan Carpenter Acked-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 33420e68d153..6951536ea29c 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); kfree(lc); - return -ENOMEM; + return r; } lc->disk_header = vmalloc(buf_size); -- cgit From 052189a2ec956810feefb6a681416c5e6a207646 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 13 Jan 2011 20:00:00 +0000 Subject: dm: remove superfluous irq disablement in dm_request_fn This patch changes spin_lock_irq() to spin_lock() in dm_request_fn(). This patch is just a clean-up and no functional change. The spin_lock_irq() was leftover from the early request-based dm code, where map_request() used to enable interrupts. Since current map_request() never enables interrupts, we can change it to spin_lock() to match the prior spin_unlock(). Auditing through the dm and block-layer code called from map_request(), I confirmed all functions save/restore interrupt status, so no function returning with interrupts enabled. Also I haven't observed any problem on my test environment which uses scsi and lpfc driver after heavy I/O testing with occasional path down/up. Added BUG_ON() to detect breakage in future. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e504bb40d60e..eaa3af0e0632 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1637,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); } goto out; requeued: - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); plug_and_out: if (!elv_queue_empty(q)) -- cgit From 4e2d19e46b507018c6ed15f6c081d8f887ae229c Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: dm mpath: delay activate_path retry on SCSI_DH_RETRY This patch adds a user-configurable 'pg_init_delay_msecs' feature. Use this feature to specify the number of milliseconds to delay before retrying scsi_dh_activate, when SCSI_DH_RETRY is returned. SCSI Device Handlers return SCSI_DH_IMM_RETRY if we could retry activation immediately and SCSI_DH_RETRY in cases where it is better to retry after some delay. Currently we immediately retry scsi_dh_activate irrespective of SCSI_DH_IMM_RETRY and SCSI_DH_RETRY. The 'pg_init_delay_msecs' feature may be provided during table create or load, e.g.: dmsetup create --table "0 20971520 multipath 3 queue_if_no_path \ pg_init_delay_msecs 2500 ..." mpatha The default for 'pg_init_delay_msecs' is 2000 milliseconds. Maximum configurable delay is 60000 milliseconds. Specifying a 'pg_init_delay_msecs' of 0 will cause immediate retry. Signed-off-by: Nikanth Karthikesan Signed-off-by: Chandra Seetharaman Acked-by: Mike Christie Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 35ab5781f88f..b82d28819e2a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,8 @@ #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) /* Path properties */ struct pgpath { @@ -33,7 +35,7 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct activate_path; + struct delayed_work activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -64,11 +66,15 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; + unsigned nr_priority_groups; struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -81,6 +87,7 @@ struct multipath { unsigned saved_queue_if_no_path;/* Saved state during suspension */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ struct work_struct process_queued_ios; struct list_head queued_ios; @@ -127,7 +134,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -188,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -227,14 +235,19 @@ static void free_multipath(struct multipath *m) static void __pg_init_all_paths(struct multipath *m) { struct pgpath *pgpath; + unsigned long pg_init_delay = 0; m->pg_init_count++; m->pg_init_required = 0; + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { /* Skip failed paths */ if (!pgpath->is_active) continue; - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) m->pg_init_in_progress++; } } @@ -782,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) const char *param_name; static struct param _params[] = { - {0, 3, "invalid number of feature args"}, + {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; r = read_param(_params, shift(as), &argc, &ti->error); @@ -810,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) continue; } + if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + (argc >= 1)) { + r = read_param(_params + 2, shift(as), + &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + ti->error = "Unrecognised multipath feature request"; r = -EINVAL; } while (argc && !r); @@ -1022,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath) m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) m->pg_init_in_progress++; } @@ -1157,6 +1179,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; + unsigned delay_retry = 0; /* device or driver problems */ switch (errors) { @@ -1181,8 +1204,9 @@ static void pg_init_done(void *data, int errors) */ bypass_pg(m, pg, 1); break; - /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ case SCSI_DH_RETRY: + /* Wait before retrying. */ + delay_retry = 1; case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1215,6 +1239,7 @@ static void pg_init_done(void *data, int errors) if (!m->pg_init_required) m->queue_io = 0; + m->pg_init_delay_retry = delay_retry; queue_work(kmultipathd, &m->process_queued_ios); /* @@ -1229,7 +1254,7 @@ out: static void activate_path(struct work_struct *work) { struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path); + container_of(work, struct pgpath, activate_path.work); scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), pg_init_done, pgpath); @@ -1370,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); else { DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2); + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1643,7 +1671,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, -- cgit From 9d357b0787bb3c91835d5e658c3bda178f9ca419 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: dm: introduce target callbacks and congestion callback DM currently implements congestion checking by checking on congestion in each component device. For raid456 we need to also check if the stripe cache is congested. Add per-target congestion checker callback support. Extending the target_callbacks structure with additional callback functions allows for establishing multiple callbacks per-target (a callback is also needed for unplug). Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 14 ++++++++++++++ include/linux/device-mapper.h | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a4f30e..7e2ec3c05550 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022faa63..4b1c63d478ab 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,12 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -268,6 +274,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); +/* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + /* * Finally call this to make the table ready for use. */ -- cgit From 99d03c141b40914b67d63c9d23b8da4386422ed7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: dm: per target unplug callback support Add per-target unplug callback support. Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 5 +++++ include/linux/device-mapper.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7e2ec3c05550..dffa0ac7c4f0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1278,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1290,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4b1c63d478ab..272496d1fae4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -197,6 +197,7 @@ struct dm_target { struct dm_target_callbacks { struct list_head list; int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); }; int dm_register_target(struct target_type *t); -- cgit From 9d09e663d5502c46f2d9481c04c1087e1c2da698 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: dm: raid456 basic support This patch is the skeleton for the DM target that will be the bridge from DM to MD (initially RAID456 and later RAID1). It provides a way to use device-mapper interfaces to the MD RAID456 drivers. As with all device-mapper targets, the nominal public interfaces are the constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO and STATUSTYPE_TABLE). The CTR table looks like the following: 1: raid \ 2: <#raid_params> \ 3: <#raid_devs> .. Line 1 contains the standard first three arguments to any device-mapper target - the start, length, and target type fields. The target type in this case is "raid". Line 2 contains the arguments that define the particular raid type/personality/level, the required arguments for that raid type, and any optional arguments. Possible raid types include: raid4, raid5_la, raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (again, raid1 is planned for the future.) The list of required and optional parameters is the same for all the current raid types. The required parameters are positional, while the optional parameters are given as key/value pairs. The possible parameters are as follows: Chunk size in sectors. [[no]sync] Force/Prevent RAID initialization [rebuild ] Rebuild the drive indicated by the index [daemon_sleep ] Time between bitmap daemon work to clear bits [min_recovery_rate ] Throttle RAID initialization [max_recovery_rate ] Throttle RAID initialization [max_write_behind ] See '-write-behind=' (man mdadm) [stripe_cache ] Stripe cache size for higher RAIDs Line 3 contains the list of devices that compose the array in metadata/data device pairs. If the metadata is stored separately, a '-' is given for the metadata device position. If a drive has failed or is missing at creation time, a '-' can be given for both the metadata and data drives for a given position. Examples: # RAID4 - 4 data drives, 1 parity # No metadata devices specified to hold superblock/bitmap info # Chunk size of 1MiB # (Lines separated for easy reading) 0 1960893648 raid \ raid4 1 2048 \ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 # RAID4 - 4 data drives, 1 parity (no metadata devices) # Chunk size of 1MiB, force RAID initialization, # min recovery rate at 20 kiB/sec/disk 0 1960893648 raid \ raid4 4 2048 min_recovery_rate 20 sync\ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 Performing a 'dmsetup table' should display the CTR table used to construct the mapping (with possible reordering of optional parameters). Performing a 'dmsetup status' will yield information on the state and health of the array. The output is as follows: 1: raid \ 2: <#devices> <1 health char for each dev> Line 1 is standard DM output. Line 2 is best shown by example: 0 1960893648 raid raid4 5 AAAAA 2/490221568 Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-raid.txt | 70 ++++ drivers/md/Kconfig | 24 ++ drivers/md/Makefile | 1 + drivers/md/dm-raid.c | 697 ++++++++++++++++++++++++++++++++ 4 files changed, 792 insertions(+) create mode 100644 Documentation/device-mapper/dm-raid.txt create mode 100644 drivers/md/dm-raid.c diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt new file mode 100644 index 000000000000..33b6b7071ac8 --- /dev/null +++ b/Documentation/device-mapper/dm-raid.txt @@ -0,0 +1,70 @@ +Device-mapper RAID (dm-raid) is a bridge from DM to MD. It +provides a way to use device-mapper interfaces to access the MD RAID +drivers. + +As with all device-mapper targets, the nominal public interfaces are the +constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO +and STATUSTYPE_TABLE). The CTR table looks like the following: + +1: raid \ +2: <#raid_params> \ +3: <#raid_devs> .. + +Line 1 contains the standard first three arguments to any device-mapper +target - the start, length, and target type fields. The target type in +this case is "raid". + +Line 2 contains the arguments that define the particular raid +type/personality/level, the required arguments for that raid type, and +any optional arguments. Possible raid types include: raid4, raid5_la, +raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is +planned for the future.) The list of required and optional parameters +is the same for all the current raid types. The required parameters are +positional, while the optional parameters are given as key/value pairs. +The possible parameters are as follows: + Chunk size in sectors. + [[no]sync] Force/Prevent RAID initialization + [rebuild ] Rebuild the drive indicated by the index + [daemon_sleep ] Time between bitmap daemon work to clear bits + [min_recovery_rate ] Throttle RAID initialization + [max_recovery_rate ] Throttle RAID initialization + [max_write_behind ] See '-write-behind=' (man mdadm) + [stripe_cache ] Stripe cache size for higher RAIDs + +Line 3 contains the list of devices that compose the array in +metadata/data device pairs. If the metadata is stored separately, a '-' +is given for the metadata device position. If a drive has failed or is +missing at creation time, a '-' can be given for both the metadata and +data drives for a given position. + +NB. Currently all metadata devices must be specified as '-'. + +Examples: +# RAID4 - 4 data drives, 1 parity +# No metadata devices specified to hold superblock/bitmap info +# Chunk size of 1MiB +# (Lines separated for easy reading) +0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + +# RAID4 - 4 data drives, 1 parity (no metadata devices) +# Chunk size of 1MiB, force RAID initialization, +# min recovery rate at 20 kiB/sec/disk +0 1960893648 raid \ + raid4 4 2048 min_recovery_rate 20 sync\ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + +Performing a 'dmsetup table' should display the CTR table used to +construct the mapping (with possible reordering of optional +parameters). + +Performing a 'dmsetup status' will yield information on the state and +health of the array. The output is as follows: +1: raid \ +2: <#devices> <1 health char for each dev> + +Line 1 is standard DM output. Line 2 is best shown by example: + 0 1960893648 raid raid4 5 AAAAA 2/490221568 +Here we can see the RAID type is raid4, there are 5 devices - all of +which are 'A'live, and the array is 2/490221568 complete with recovery. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf1a95e31559..98d9ec85e0eb 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,6 +240,30 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_RAID + tristate "RAID 4/5/6 target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID456 + select BLK_DEV_MD + ---help--- + A dm target that supports RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + config DM_LOG_USERSPACE tristate "Mirror userspace logging (EXPERIMENTAL)" depends on DM_MIRROR && EXPERIMENTAL && NET diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5e3aac41919d..d0138606c2e8 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_RAID) += dm-raid.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 000000000000..b9e1e15ef11c --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,697 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include + +#include "md.h" +#include "raid5.h" +#include "dm.h" +#include "bitmap.h" + +#define DM_MSG_PREFIX "raid" + +/* + * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then + * make it so the flag doesn't set anything. + */ +#ifndef MD_SYNC_STATE_FORCED +#define MD_SYNC_STATE_FORCED 0 +#endif + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct mdk_rdev_s rdev; +}; + +/* + * Flags for rs->print_flags field. + */ +#define DMPF_DAEMON_SLEEP 0x1 +#define DMPF_MAX_WRITE_BEHIND 0x2 +#define DMPF_SYNC 0x4 +#define DMPF_NOSYNC 0x8 +#define DMPF_STRIPE_CACHE 0x10 +#define DMPF_MIN_RECOVERY_RATE 0x20 +#define DMPF_MAX_RECOVERY_RATE 0x40 + +struct raid_set { + struct dm_target *ti; + + uint64_t print_flags; + + struct mddev_s md; + struct raid_type *raid_type; + struct dm_target_callbacks callbacks; + + struct raid_dev dev[0]; +}; + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const unsigned level; /* RAID level. */ + const unsigned algorithm; /* RAID algorithm. */ +} raid_types[] = { + {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} +}; + +static struct raid_type *get_raid_type(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raid_types); i++) + if (!strcmp(raid_types[i].name, name)) + return &raid_types[i]; + + return NULL; +} + +static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) +{ + unsigned i; + struct raid_set *rs; + sector_t sectors_per_dev; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + sectors_per_dev = ti->len; + if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + ti->error = "Target length not divisible by number of data devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->ti = ti; + rs->raid_type = raid_type; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.dev_sectors = sectors_per_dev; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = 0; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + */ + + return rs; +} + +static void context_free(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + + kfree(rs); +} + +/* + * For every device we have two words + * : meta device name or '-' if missing + * : data device name or '-' if missing + * + * This code parses those words. + */ +static int dev_parms(struct raid_set *rs, char **argv) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int ret = 0; + + for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets, since there is a separate device + * for data and metadata. + */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + if (strcmp(argv[0], "-")) { + rs->ti->error = "Metadata devices not supported"; + return -EINVAL; + } + + if (!strcmp(argv[1], "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + continue; + } + + ret = dm_get_device(rs->ti, argv[1], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (ret) { + rs->ti->error = "RAID device lookup failure"; + return ret; + } + + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + DMERR("Unable to rebuild drive while array is not in-sync"); + rs->ti->error = "RAID device lookup failure"; + return -EINVAL; + } + + return 0; +} + +/* + * Possible arguments are... + * RAID456: + * [optional_args] + * + * Optional args: + * [[no]sync] Force or prevent recovery of the entire array + * [rebuild ] Rebuild the drive indicated by the index + * [daemon_sleep ] Time between bitmap daemon work to clear bits + * [min_recovery_rate ] Throttle RAID initialization + * [max_recovery_rate ] Throttle RAID initialization + * [max_write_behind ] See '-write-behind=' (man mdadm) + * [stripe_cache ] Stripe cache size for higher RAIDs + */ +static int parse_raid_params(struct raid_set *rs, char **argv, + unsigned num_raid_params) +{ + unsigned i, rebuild_cnt = 0; + unsigned long value; + char *key; + + /* + * First, parse the in-order required arguments + */ + if ((strict_strtoul(argv[0], 10, &value) < 0) || + !is_power_of_2(value) || (value < 8)) { + rs->ti->error = "Bad chunk size"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + argv++; + num_raid_params--; + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < rs->md.raid_disks; i++) + set_bit(In_sync, &rs->dev[i].rdev.flags); + + for (i = 0; i < num_raid_params; i++) { + if (!strcmp(argv[i], "nosync")) { + rs->md.recovery_cp = MaxSector; + rs->print_flags |= DMPF_NOSYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + if (!strcmp(argv[i], "sync")) { + rs->md.recovery_cp = 0; + rs->print_flags |= DMPF_SYNC; + rs->md.flags |= MD_SYNC_STATE_FORCED; + continue; + } + + /* The rest of the optional arguments come in key/value pairs */ + if ((i + 1) >= num_raid_params) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + key = argv[i++]; + if (strict_strtoul(argv[i], 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + if (!strcmp(key, "rebuild")) { + if (++rebuild_cnt > rs->raid_type->parity_devs) { + rs->ti->error = "Too many rebuild drives given"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + clear_bit(In_sync, &rs->dev[value].rdev.flags); + rs->dev[value].rdev.recovery_offset = 0; + } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_MAX_WRITE_BEHIND; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + if (value > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + rs->md.bitmap_info.max_write_behind = value; + } else if (!strcmp(key, "daemon_sleep")) { + rs->print_flags |= DMPF_DAEMON_SLEEP; + if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcmp(key, "stripe_cache")) { + rs->print_flags |= DMPF_STRIPE_CACHE; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + + if (rs->raid_type->level < 5) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + if (raid5_set_cache_size(&rs->md, (int)value)) { + rs->ti->error = "Bad stripe_cache size"; + return -EINVAL; + } + } else if (!strcmp(key, "min_recovery_rate")) { + rs->print_flags |= DMPF_MIN_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = (int)value; + } else if (!strcmp(key, "max_recovery_rate")) { + rs->print_flags |= DMPF_MAX_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + return md_raid5_congested(&rs->md, bits); +} + +static void raid_unplug(struct dm_target_callbacks *cb) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + md_raid5_unplug_device(rs->md.private); +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <#raid_params> \ + * <#raid_devs> { .. } + * + * ** metadata devices are not supported yet, use '-' instead ** + * + * varies by . See 'parse_raid_params' for + * details on possible . + */ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->split_io = rs->md.chunk_sectors; + ti->private = rs; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Fail to run raid array"; + goto bad; + } + + rs->callbacks.congested_fn = raid_is_congested; + rs->callbacks.unplug_fn = raid_unplug; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + return 0; + +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); + context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct raid_set *rs = ti->private; + mddev_t *mddev = &rs->md; + + mddev->pers->make_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int raid_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct raid_set *rs = ti->private; + unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned sz = 0; + int i; + sector_t sync; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); + + for (i = 0; i < rs->md.raid_disks; i++) { + if (test_bit(Faulty, &rs->dev[i].rdev.flags)) + DMEMIT("D"); + else if (test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT("A"); + else + DMEMIT("a"); + } + + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) + sync = rs->md.curr_resync_completed; + else + sync = rs->md.recovery_cp; + + if (sync > rs->md.resync_max_sectors) + sync = rs->md.resync_max_sectors; + + DMEMIT(" %llu/%llu", + (unsigned long long) sync, + (unsigned long long) rs->md.resync_max_sectors); + + break; + case STATUSTYPE_TABLE: + /* The string you would use to construct this array */ + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + raid_param_cnt++; /* for rebuilds */ + + raid_param_cnt += (hweight64(rs->print_flags) * 2); + if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) + raid_param_cnt--; + + DMEMIT("%s %u %u", rs->raid_type->name, + raid_param_cnt, rs->md.chunk_sectors); + + if ((rs->print_flags & DMPF_SYNC) && + (rs->md.recovery_cp == MaxSector)) + DMEMIT(" sync"); + if (rs->print_flags & DMPF_NOSYNC) + DMEMIT(" nosync"); + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT(" rebuild %u", i); + + if (rs->print_flags & DMPF_DAEMON_SLEEP) + DMEMIT(" daemon_sleep %lu", + rs->md.bitmap_info.daemon_sleep); + + if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) + DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); + + if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) + DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) + DMEMIT(" max_write_behind %lu", + rs->md.bitmap_info.max_write_behind); + + if (rs->print_flags & DMPF_STRIPE_CACHE) { + raid5_conf_t *conf = rs->md.private; + + /* convert from kiB to sectors */ + DMEMIT(" stripe_cache %d", + conf ? conf->max_nr_stripes * 2 : 0); + } + + DMEMIT(" %d", rs->md.raid_disks); + for (i = 0; i < rs->md.raid_disks; i++) { + DMEMIT(" -"); /* metadata device */ + + if (rs->dev[i].data_dev) + DMEMIT(" %s", rs->dev[i].data_dev->name); + else + DMEMIT(" -"); + } + } + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned i; + int ret = 0; + + for (i = 0; !ret && i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + ret = fn(ti, + rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, + data); + + return ret; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned chunk_size = rs->md.chunk_sectors << 9; + raid5_conf_t *conf = rs->md.private; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); +} + +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + md_stop_writes(&rs->md); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_suspend(&rs->md); +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_resume(&rs->md); +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown "); +MODULE_LICENSE("GPL"); -- cgit From 8a0eebf66e3b1deae036553ba641a9c2bdbae678 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 13 Jan 2011 14:15:50 -0500 Subject: NFS: Fix NFSv3 exclusive open semantics Commit c0204fd2b8fe047b18b67e07e1bf2a03691240cd (NFS: Clean up nfs4_proc_create()) broke NFSv3 exclusive open by removing the code that passes the O_EXCL flag down to nfs3_proc_create(). This patch reverts that offending hunk from the original commit. Reported-by: Nick Bowler Signed-off-by: Trond Myklebust Cc: stable@kernel.org [2.6.37] Tested-by: Nick Bowler Signed-off-by: Linus Torvalds --- fs/nfs/dir.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 95b081bc9e25..64ee240f3c80 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1579,6 +1579,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, { struct iattr attr; int error; + int open_flags = 0; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1586,7 +1587,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); + if ((nd->flags & LOOKUP_CREATE) != 0) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); if (error != 0) goto out_err; return 0; -- cgit From 6c9879101442b08581e8a0e3ae6b7f643a78fd63 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:13:53 +1100 Subject: md: fix regression resulting in delays in clearing bits in a bitmap commit 589a594be1fb (2.6.37-rc4) fixed a problem were md_thread would sometimes call the ->run function at a bad time. If an error is detected during array start up after the md_thread has been started, the md_thread is killed. This resulted in the ->run function being called once. However the array may not be in a state that it is safe to call ->run. However the fix imposed meant that ->run was not called on a timeout. This means that when an array goes idle, bitmap bits do not get cleared promptly. While the array is busy the bits will still be cleared when appropriate so this is not very serious. There is no risk to data. Change the test so that we only avoid calling ->run when the thread is being stopped. This more explicitly addresses the problem situation. This is suitable for 2.6.37-stable and any -stable kernel to which 589a594be1fb was applied. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0da25daea9be..a3019121dc57 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6042,7 +6042,8 @@ static int md_thread(void * arg) || kthread_should_stop(), thread->timeout); - if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) + clear_bit(THREAD_WAKEUP, &thread->flags); + if (!kthread_should_stop()) thread->run(thread->mddev); } -- cgit From 067032bc628598606056412594042564fcf09e22 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: Fix single printks with multiple KERN_s Noticed-by: Russell King Signed-off-by: Joe Perches Signed-off-by: NeilBrown --- drivers/md/raid1.c | 5 +++-- drivers/md/raid10.c | 5 +++-- drivers/md/raid5.c | 1 - 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 845cf95b612c..d50056ba596b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,8 +1027,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0641674827f0..03825cbce2d4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1051,8 +1051,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dc574f303f8b..316fbe79389a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1721,7 +1721,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" - KERN_ALERT "md/raid:%s: Operation continuing on %d devices.\n", mdname(mddev), bdevname(rdev->bdev, b), -- cgit From 0ca69886a8273ac1350143d562280bfcbe4760dc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: Ensure no IO request to get md device before it is properly initialised. When an md device is in the process of coming on line it is possible for an IO request (typically a partition table probe) to get through before the array is fully initialised, which can cause unexpected behaviour (e.g. a crash). So explicitly record when the array is ready for IO and don't allow IO through until then. There is no possibility for a similar problem when the array is going off-line as there must only be one 'open' at that time, and it is busy off-lining the array and so cannot send IO requests. So no memory barrier is needed in md_stop() This has been a bug since commit 409c57f3801 in 2.6.30 which introduced md_make_request. Before then, each personality would register its own make_request_fn when it was ready. This is suitable for any stable kernel from 2.6.30.y onwards. Cc: Signed-off-by: NeilBrown Reported-by: "Hawrylewicz Czarnowski, Przemyslaw" --- drivers/md/md.c | 8 ++++++-- drivers/md/md.h | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a3019121dc57..540347c538f9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -288,10 +288,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) int rv; int cpu; - if (mddev == NULL || mddev->pers == NULL) { + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { bio_io_error(bio); return 0; } + smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -4564,7 +4566,8 @@ int md_run(mddev_t *mddev) mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - + smp_wmb(); + mddev->ready = 1; list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { char nm[20]; @@ -4725,6 +4728,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) { + mddev->ready = 0; mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; diff --git a/drivers/md/md.h b/drivers/md/md.h index d05bab55df4e..229675a604f7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -148,7 +148,8 @@ struct mddev_s * are happening, so run/ * takeover/stop are not safe */ - + int ready; /* See when safe to pass + * IO requests down */ struct gendisk *gendisk; struct kobject kobj; -- cgit From 43c73ca43b3e03bb228ff9350b6b44d0e560f262 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer With the module parameter 'start_dirty_degraded' set, raid5_spare_active() previously called sysfs_notify_dirent() with a NULL argument (rdev->sysfs_state) when a rebuild finished. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 316fbe79389a..9212c077095c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5338,7 +5338,7 @@ static int raid5_spare_active(mddev_t *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); -- cgit From defad61a5b16352d3e22a04d4c930a5b5a7fd1f0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: md_stop_writes requires mddev_lock. As md_stop_writes manipulates the sync_thread and calls md_update_sb, it need to be called with mddev_lock held. In all internal cases it is, but the symbol is exported for dm-raid to call and in that case the lock won't be help. Do make an exported version which takes the lock, and an internal version which does not. Signed-off-by: NeilBrown --- drivers/md/md.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 540347c538f9..f43ff2962b2b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4704,7 +4704,7 @@ static void md_clean(mddev_t *mddev) mddev->plug = NULL; } -void md_stop_writes(mddev_t *mddev) +static void __md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4724,6 +4724,13 @@ void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } + +void md_stop_writes(mddev_t *mddev) +{ + mddev_lock(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); +} EXPORT_SYMBOL_GPL(md_stop_writes); void md_stop(mddev_t *mddev) @@ -4748,7 +4755,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) goto out; } if (mddev->pers) { - md_stop_writes(mddev); + __md_stop_writes(mddev); err = -ENXIO; if (mddev->ro==1) @@ -4785,7 +4792,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mddev->ro) set_disk_ro(disk, 0); - md_stop_writes(mddev); + __md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; -- cgit From 7ebc0be7fff4146e87b4078f054977b72998abd3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md: Be more careful about clearing flags bit in ->recovery Setting ->recovery to 0 is generally not a good idea as it could clear bits that shouldn't be cleared. In particular, MD_RECOVERY_FROZEN should only be cleared on explicit request from user-space. So when we need to clear things, just clear the bits that need clearing. As there are a few different places which reap a resync process - and some do an incomplte job - factor out the code for doing the from md_check_recovery and call that function instead of open coding part of it. Signed-off-by: NeilBrown Reported-by: Jonathan Brassow --- drivers/md/md.c | 86 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f43ff2962b2b..a91dc593f3ca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3746,6 +3746,8 @@ action_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", type); } +static void reap_sync_thread(mddev_t *mddev); + static ssize_t action_store(mddev_t *mddev, const char *page, size_t len) { @@ -3760,9 +3762,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -4709,8 +4709,7 @@ static void __md_stop_writes(mddev_t *mddev) if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; + reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -7044,6 +7043,45 @@ static int remove_and_add_spares(mddev_t *mddev) } return spares; } + +static void reap_sync_thread(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + md_update_sb(mddev, 1); + + /* if array is no-longer degraded, then any saved_raid_disk + * information must be scrapped + */ + if (!mddev->degraded) + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->saved_raid_disk = -1; + + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7068,9 +7106,6 @@ static int remove_and_add_spares(mddev_t *mddev) */ void md_check_recovery(mddev_t *mddev) { - mdk_rdev_t *rdev; - - if (mddev->bitmap) bitmap_daemon_work(mddev); @@ -7138,34 +7173,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - list_for_each_entry(rdev, &mddev->disks, same_set) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7223,7 +7231,11 @@ void md_check_recovery(mddev_t *mddev) " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); -- cgit From 57b2caa394393f8870ed41bdcc38a7542593018f Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md-new-param-to-calc_dev_sboffset When we allow for separate devices for data and metadata in a later patch, we will need to be able to calculate the superblock offset based on more than the bdev. Signed-off-by: Jonathan Brassow --- drivers/md/md.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a91dc593f3ca..0a0d7c2f2ff6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -705,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) { - sector_t num_sectors = i_size_read(bdev->bd_inode) / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } @@ -991,7 +991,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -1332,7 +1332,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -5249,7 +5249,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) printk(KERN_INFO "md: nonpersistent superblock ...\n"); rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); @@ -5316,7 +5316,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; -- cgit From ccebd4c4159462c96397ae9af9c667bb394d7b70 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:33 +1100 Subject: md-new-param-to_sync_page_io Add new parameter to 'sync_page_io'. The new parameter allows us to distinguish between metadata and data operations. This becomes important later when we add the ability to use separate devices for data and metadata. Signed-off-by: Jonathan Brassow --- drivers/md/bitmap.c | 4 ++-- drivers/md/md.c | 9 ++++++--- drivers/md/md.h | 4 ++-- drivers/md/raid1.c | 28 ++++++++++++---------------- drivers/md/raid10.c | 12 ++++++------ 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a1ffe3527aa..1977765ff964 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, || test_bit(Faulty, &rdev->flags)) continue; - target = rdev->sb_start + offset + index * (PAGE_SIZE/512); + target = offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev, target, roundup(size, bdev_logical_block_size(rdev->bdev)), - page, READ)) { + page, READ, true)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will * quietly no-op */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 0a0d7c2f2ff6..0bc10cc4b961 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -795,7 +795,7 @@ static void bi_complete(struct bio *bio, int error) } int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw) + struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; @@ -804,7 +804,10 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + if (metadata_op) + bio->bi_sector = sector + rdev->sb_start; + else + bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); init_completion(&event); bio->bi_private = &event; @@ -829,7 +832,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 229675a604f7..7e4f358a26a6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -498,8 +498,8 @@ extern void md_flush_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, - struct page *page, int rw); +extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d50056ba596b..a23ffa397ba9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1365,10 +1365,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ)) { + READ, false)) { success = 1; break; } @@ -1391,10 +1391,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev = conf->mirrors[d].rdev; atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - WRITE) == 0) + WRITE, false) == 0) md_error(mddev, rdev); } d = start; @@ -1406,10 +1406,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, - sect + rdev->data_offset, + sect, s<<9, bio->bi_io_vec[idx].bv_page, - READ) == 0) + READ, false) == 0) md_error(mddev, rdev); } } else { @@ -1489,10 +1489,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; @@ -1515,9 +1513,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, WRITE, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); @@ -1532,9 +1529,8 @@ static void fix_read_error(conf_t *conf, int read_disk, rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false) == 0) /* Well, this device is dead */ md_error(mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 03825cbce2d4..69b659544390 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1560,9 +1560,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, - conf->tmppage, READ); + conf->tmppage, READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -1599,8 +1599,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) + sect, + s<<9, conf->tmppage, WRITE, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE @@ -1636,9 +1636,9 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, conf->tmppage, - READ) == 0) { + READ, false) == 0) { /* Well, this device is dead */ printk(KERN_NOTICE "md/raid10:%s: unable to read back " -- cgit From a6ff7e089c7fca813c956ccbed824087e89a3a49 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: separate meta and data devs Allow the metadata to be on a separate device from the data. This doesn't mean the data and metadata will by on separate physical devices - it simply gives device-mapper and userspace tools more flexibility. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 6 +++++- drivers/md/md.c | 10 ++++++---- drivers/md/md.h | 6 ++++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1977765ff964..6cf587196b99 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev = NULL; + struct block_device *bdev; mddev_t *mddev = bitmap->mddev; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_logical_block_size(rdev->bdev)); + bdev_logical_block_size(bdev)); /* Just make sure we aren't corrupting data or * metadata */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 0bc10cc4b961..b98a85fd10b6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -765,7 +765,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -803,7 +803,8 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = rdev->bdev; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? + rdev->meta_bdev : rdev->bdev; if (metadata_op) bio->bi_sector = sector + rdev->sb_start; else @@ -4435,7 +4436,9 @@ int md_run(mddev_t *mddev) * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { @@ -5532,7 +5535,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is sync_thread) return -EBUSY; diff --git a/drivers/md/md.h b/drivers/md/md.h index 7e4f358a26a6..eec517ced31a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -60,6 +60,12 @@ struct mdk_rdev_s mddev_t *mddev; /* RAID array if running */ int last_events; /* IO event timestamp */ + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). + */ + struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ struct page *sb_page; -- cgit From 75d3da43cb74d2e5fb87816dbfecb839cd97c7f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: Don't let implementation detail of curr_resync leak out through sysfs. mddev->curr_resync has artificial values of '1' and '2' which are used by the code which ensures only one resync is happening at a time on any given device. These values are internal and should never be exposed to user-space (except when translated appropriately as in the 'pending' status in /proc/mdstat). Unfortunately they are as ->curr_resync is assigned to ->curr_resync_completed and that value is directly visible through sysfs. So change the assignments to ->curr_resync_completed to get the same valued from elsewhere in a form that doesn't have the magic '1' or '2' values. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/md.c | 5 ++--- drivers/md/raid5.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6cf587196b99..9a35320fb59f 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1546,7 +1546,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); - bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; + bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); s = 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index b98a85fd10b6..42b1d9ead7f2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6824,7 +6824,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = j; while (j < max_sectors) { sector_t sectors; @@ -6842,8 +6842,7 @@ void md_do_sync(mddev_t *mddev) md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); - mddev->curr_resync_completed = - mddev->curr_resync; + mddev->curr_resync_completed = j; set_bit(MD_CHANGE_CLEAN, &mddev->flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9212c077095c..d223a6c0ccc4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4236,7 +4236,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); @@ -4337,7 +4337,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes) == 0); mddev->reshape_position = conf->reshape_progress; - mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; + mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); -- cgit From 23ddff3792f61193695114c68d6ebd57e974c4f8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: allow suspend_lo and suspend_hi to decrease as well as increase. The sysfs attributes 'suspend_lo' and 'suspend_hi' describe a region to which read/writes are suspended so that the under lying data can be manipulated without user-space noticing. Currently the window they describe can only move forwards along the device. However this is an unnecessary restriction which will cause problems with planned developments. So relax this restriction and allow these endpoints to move arbitrarily. Signed-off-by: NeilBrown --- drivers/md/md.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 42b1d9ead7f2..dd64ad30a0fe 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4016,19 +4016,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4045,20 +4050,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); -- cgit From 13ae864bc86ff65547ffe7e966b6433a0d0edb8a Mon Sep 17 00:00:00 2001 From: Rémi Rérolle Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: fix sync_completed reporting for very large drives (>2TB) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The values exported in the sync_completed file are unsigned long, which overflows with very large drives, resulting in wrong values reported. Since sync_completed uses sectors as unit, we'll start getting wrong values with components larger than 2TB. This patch simply replaces the use of unsigned long by unsigned long long. Signed-off-by: Rémi Rérolle Signed-off-by: NeilBrown --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index dd64ad30a0fe..5e3714fecee9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3918,7 +3918,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(mddev_t *mddev, char *page) { - unsigned long max_sectors, resync; + unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); @@ -3929,7 +3929,7 @@ sync_completed_show(mddev_t *mddev, char *page) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; - return sprintf(page, "%lu / %lu\n", resync, max_sectors); + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); -- cgit From 1a940fcee31ec6c18c2f24dbdad31d54e4c35048 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md/raid5: handle manually-added spares in start_reshape. It is possible to manually add spares to specific slots before starting a reshape. raid5_start_reshape should recognised this possibility and include it in the accounting. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d223a6c0ccc4..5044babfcda0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5527,8 +5527,8 @@ static int raid5_start_reshape(mddev_t *mddev) return -ENOSPC; list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) + if ((rdev->raid_disk < 0 || rdev->raid_disk >= conf->raid_disks) + && !test_bit(Faulty, &rdev->flags)) spares++; if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) @@ -5588,6 +5588,11 @@ static int raid5_start_reshape(mddev_t *mddev) /* Failure here is OK */; } else break; + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + added_devices++; } /* When a reshape changes the number of devices, ->degraded -- cgit From ba1b41b6b4e30cb66ae2775faadea05cae3ce61c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: range check slot number when manually adding a spare. When adding a spare to an active array, we should check the slot number, but allow it to be larger than raid_disks if a reshape is being prepared. Apply the same test when adding a device to an array-under-construction. It already had most of the test in place, but not quite all. Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5e3714fecee9..665851308818 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2479,6 +2479,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev2->raid_disk == slot) return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; + rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; @@ -2496,7 +2500,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ -- cgit From bf2cb0dab8c97f00a71875d9b13dbac17a2f47ca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2011 09:14:34 +1100 Subject: md: Fix removal of extra drives when converting RAID6 to RAID5 When a RAID6 is converted to a RAID5, the extra drive should be discarded. However it isn't due to a typo in a comparison. This bug was introduced in commit e93f68a1fc6 in 2.6.35-rc4 and is suitable for any -stable since than. As the extra drive is not removed, the 'degraded' counter is wrong and so the RAID5 will not respond correctly to a subsequent failure. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 665851308818..3194a80f3620 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3126,7 +3126,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) char nm[20]; if (rdev->raid_disk < 0) continue; - if (rdev->new_raid_disk > mddev->raid_disks) + if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; -- cgit From 09579770dcb8769f4f61046dcd01cc758cfa6d91 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 13 Jan 2011 14:49:56 -0800 Subject: [IA64] fix build error - arch/ia64/kernel/perfmon.c arch/ia64/kernel/perfmon.c:621: error: duplicate 'static' Introduced by commit c74a1cbb3cac348f276fabc381758f5b0b4713b2 pass default dentry_operations to mount_pseudo() Signed-off-by: Tony Luck --- arch/ia64/kernel/perfmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index ac76da099a6d..89accc626b86 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -618,7 +618,7 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } /* forward declaration */ -static static const struct dentry_operations pfmfs_dentry_operations; +static const struct dentry_operations pfmfs_dentry_operations; static struct dentry * pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) -- cgit From 6254b32b5791e47ba1c679d023f26985fa34755a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Jan 2011 17:19:38 -0800 Subject: ecryptfs: fix broken build Stephen Rothwell reports that the vfs merge broke the build of ecryptfs. The breakage comes from commit 66cb76666d69 ("sanitize ecryptfs ->mount()") which was obviously not even build tested. Tssk, tssk, Al. This is the minimal build fixup for the situation, although I don't have a filesystem to actually test it with. Reported-by: Stephen Rothwell Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/ecryptfs/main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9ed476906327..d3b28abdd6aa 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -141,13 +141,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) return rc; } -static inode *ecryptfs_get_inode(struct inode *lower_inode, +static struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb) { struct inode *inode; int rc = 0; - lower_inode = lower_dentry->d_inode; if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { rc = -EXDEV; goto out; @@ -202,7 +201,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, { struct inode *lower_inode = lower_dentry->d_inode; struct inode *inode = ecryptfs_get_inode(lower_inode, sb); - if (IS_ERR(inode) + if (IS_ERR(inode)) return PTR_ERR(inode); if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); -- cgit From d8a3515e2a9523f8ed56d1f4537d16338bda2bc2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Jan 2011 17:26:46 -0800 Subject: Revert "gpiolib: annotate gpio-intialization with __must_check" This reverts commit 0fdae42d361bbb431ca0ab0efed5126a94821177, which wasn't really supposed to go in, and causes lots of annoying warnings. Quoth Andrew: "Complete brainfart - I meant to drop that patch ages ago." Quoth Greg: "Ick, yeah, that patch isn't ok to go in as-is, all of the callers need to be fixed up first, which is what I thought we had agreed on..." Reported-by: Stephen Rothwell Acked-by: Andrew Morton Acked-by: Greg KH Signed-off-by: Linus Torvalds --- Documentation/gpio.txt | 2 +- include/asm-generic/gpio.h | 10 +++++----- include/linux/gpio.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index a492d92bb098..792faa3c06cf 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction: int gpio_direction_input(unsigned gpio); int gpio_direction_output(unsigned gpio, int value); -The return value is zero for success, else a negative errno. It must +The return value is zero for success, else a negative errno. It should be checked, since the get/set calls don't have error returns and since misconfiguration is possible. You should normally issue these calls from a task context. However, for spinlock-safe GPIOs it's OK to use them diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 6098cae2af8e..ff5c66080c8c 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. */ -extern int __must_check gpio_request(unsigned gpio, const char *label); +extern int gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int __must_check gpio_direction_input(unsigned gpio); -extern int __must_check gpio_direction_output(unsigned gpio, int value); +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int __must_check gpio_request_array(struct gpio *array, size_t num); +extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f79d67f413e4..4b47ed96f131 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int __must_check gpio_request(unsigned gpio, const char *label) +static inline int gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num) WARN_ON(1); } -static inline int __must_check gpio_direction_input(unsigned gpio) +static inline int gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int __must_check gpio_direction_output(unsigned gpio, int value) +static inline int gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } -- cgit From 558bbb2fc75fd9676e07e347c021865e326c56db Mon Sep 17 00:00:00 2001 From: Bruce Chang Date: Thu, 13 Jan 2011 15:45:37 -0800 Subject: MAINTAINERS: update entries affecting VIA Technologies Since the original maintainer-Joseph Chan (josephchan@via.com.tw) doesn't handle the Linux driver for VIA now, I would like to request to update the maintainer for the SD/MMC CARD CONTROLLER DRIVER and VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER before we find a better one. Signed-off-by: Bruce Chang Signed-off-by: Florian Tobias Schandinat Cc: Joseph Chan Cc: Geert Uytterhoeven Cc: Harald Welte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3dd5c6fce989..af656ded404e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6592,13 +6592,12 @@ F: Documentation/i2c/busses/i2c-viapro F: drivers/i2c/busses/i2c-viapro.c VIA SD/MMC CARD CONTROLLER DRIVER -M: Joseph Chan +M: Bruce Chang M: Harald Welte S: Maintained F: drivers/mmc/host/via-sdmmc.c VIA UNICHROME(PRO)/CHROME9 FRAMEBUFFER DRIVER -M: Joseph Chan M: Florian Tobias Schandinat L: linux-fbdev@vger.kernel.org S: Maintained -- cgit From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2011 15:45:38 -0800 Subject: irq: use per_cpu kstat_irqs Use modern per_cpu API to increment {soft|hard}irq counters, and use per_cpu allocation for (struct irq_desc)->kstats_irq instead of an array. This gives better SMP/NUMA locality and saves few instructions per irq. With small nr_cpuids values (8 for example), kstats_irq was a small array (less than L1_CACHE_BYTES), potentially source of false sharing. In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly kstat_irqs_all[NR_IRQS][NR_CPUS] array. Note: we still populate kstats_irq for all possible irqs in early_irq_init(). We probably could use on-demand allocations. (Code included in alloc_descs()). Problem is not all IRQS are used with a prior alloc_descs() call. kstat_irqs_this_cpu() is not used anymore, remove it. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Andi Kleen Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irqdesc.h | 2 +- include/linux/kernel_stat.h | 19 +++++++++---------- kernel/irq/irqdesc.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68cc7458..6a64c6fa81af 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba12b5b..0cce2db580c3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..282f20230e67 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit From 5dfbd1d734ef5415bc47b034df7433ba21e40e7b Mon Sep 17 00:00:00 2001 From: Claudio Scordino Date: Thu, 13 Jan 2011 15:45:39 -0800 Subject: atmel_serial: fix RTS high after initialization in RS485 mode When working in RS485 mode, the atmel_serial driver keeps RTS high after the initialization of the serial port. It goes low only after the first character has been sent. [akpm@linux-foundation.org: simplify code] Signed-off-by: Claudio Scordino Signed-off-by: Arkadiusz Bubala Tested-by: Arkadiusz Bubala Cc: Nicolas Ferre Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/serial/atmel_serial.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c index 3892666b5fbd..2a1d52fb4936 100644 --- a/drivers/serial/atmel_serial.c +++ b/drivers/serial/atmel_serial.c @@ -1732,6 +1732,11 @@ static int __devinit atmel_serial_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); platform_set_drvdata(pdev, port); + if (port->rs485.flags & SER_RS485_ENABLED) { + UART_PUT_MR(&port->uart, ATMEL_US_USMODE_NORMAL); + UART_PUT_CR(&port->uart, ATMEL_US_RTSEN); + } + return 0; err_add_port: -- cgit From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- kernel/fork.c | 28 +--------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e23215e276..07402530fc70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f20b6b0..76a1fdd80bdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,23 +1409,6 @@ long do_fork(unsigned long clone_flags, return -EPERM; } - /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:41 -0800 Subject: mm: page allocator: adjust the per-cpu counter threshold when memory is low Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when a threshold is above a threshold. On large CPU systems, the difference between the estimate and real value of NR_FREE_PAGES can be very high. The system can get into a case where pages are allocated far below the min watermark potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortately, as reported by Shaohua Li this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty in heavy memory pressure by a factor that depends on the workload and the machine but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps but the event is not expected to be frequent - in Shaohua's test case, there was one recorded sleep and wake event at least. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd, when deciding whether it is really safe to go back to sleep in sleeping_prematurely() and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function but limiting how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced. The following report is on the percentage of time spent cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date). Signed-off-by: Mel Gorman Reported-by: Shaohua Li Reviewed-by: Christoph Lameter Tested-by: Nicolas Bareil Cc: David Rientjes Cc: Kyle McMartin Cc: [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 +++----- include/linux/vmstat.h | 5 ++++ mm/mmzone.c | 21 ---------------- mm/page_alloc.c | 35 ++++++++++++++++++++------ mm/vmscan.c | 23 +++++++++-------- mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 115 insertions(+), 47 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24ebe9cfd..48906629335c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -661,7 +655,9 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37b3b75..e4cc21cf5870 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); +void restore_pgdat_percpu_threshold(pg_data_t *pgdat); #else /* CONFIG_SMP */ /* @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 826ba6922e84..22a1bb7723e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -2442,7 +2461,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c69274..5da4295e7d67 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (zone->all_unreclaimable) continue; - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) return 1; } @@ -2230,7 +2230,7 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; break; @@ -2276,7 +2276,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2297,7 +2297,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2305,7 +2305,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2448,7 +2448,9 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + restore_pgdat_percpu_threshold(pgdat); schedule(); + reduce_pgdat_percpu_threshold(pgdat); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); @@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order) if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 312d728976f1..bc0f095791b4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP +static int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + static int calculate_threshold(struct zone *zone) { int threshold; @@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void) } } +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_pressure_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + +void restore_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + /* * For use when we know that interrupts are disabled. */ @@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), -- cgit From b44129b30652c8771db2265939bb8b463724043d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:43 -0800 Subject: mm: vmstat: use a single setter function and callback for adjusting percpu thresholds reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors due to counter drift. The functions duplicate some code so this patch replaces them with a single set_pgdat_percpu_threshold() that takes a callback function to calculate the desired threshold as a parameter. [akpm@linux-foundation.org: readability tweak] [kosaki.motohiro@jp.fujitsu.com: set_pgdat_percpu_threshold(): don't use for_each_online_cpu] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++---- mm/vmscan.c | 19 +++++++++++++++++-- mm/vmstat.c | 36 +++++++----------------------------- 3 files changed, 30 insertions(+), 35 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4cc21cf5870..833e676d6d92 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); -void restore_pgdat_percpu_threshold(pg_data_t *pgdat); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state -static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } -static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } +#define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 5da4295e7d67..86f8c3418795 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2448,9 +2448,24 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - restore_pgdat_percpu_threshold(pgdat); + + /* + * vmstat counters are not perfectly + * accurate and the estimated value + * for counters such as NR_FREE_PAGES + * can deviate from the true value by + * nr_online_cpus * threshold. To + * avoid the zone watermarks being + * breached while under pressure, we + * reduce the per-cpu vmstat threshold + * while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, + calculate_normal_threshold); schedule(); - reduce_pgdat_percpu_threshold(pgdat); + set_pgdat_percpu_threshold(pgdat, + calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); diff --git a/mm/vmstat.c b/mm/vmstat.c index bc0f095791b4..751a65e00aac 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_pressure_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) { int threshold; int watermark_distance; @@ -107,7 +107,7 @@ static int calculate_pressure_threshold(struct zone *zone) return threshold; } -static int calculate_threshold(struct zone *zone) +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -166,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -185,46 +185,24 @@ static void refresh_zone_stat_thresholds(void) } } -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) { struct zone *zone; int cpu; int threshold; int i; - get_online_cpus(); - for (i = 0; i < pgdat->nr_zones; i++) { - zone = &pgdat->node_zones[i]; - if (!zone->percpu_drift_mark) - continue; - - threshold = calculate_pressure_threshold(zone); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold - = threshold; - } - put_online_cpus(); -} - -void restore_pgdat_percpu_threshold(pg_data_t *pgdat) -{ - struct zone *zone; - int cpu; - int threshold; - int i; - - get_online_cpus(); for (i = 0; i < pgdat->nr_zones; i++) { zone = &pgdat->node_zones[i]; if (!zone->percpu_drift_mark) continue; - threshold = calculate_threshold(zone); - for_each_online_cpu(cpu) + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } - put_online_cpus(); } /* -- cgit From 6585027a5e8cb490e3a761b2f3f3c3acf722aff2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 13 Jan 2011 15:45:44 -0800 Subject: writeback: integrated background writeback work Check whether background writeback is needed after finishing each work. When bdi flusher thread finishes doing some work check whether any kind of background writeback needs to be done (either because dirty_background_ratio is exceeded or because we need to start flushing old inodes). If so, just do background write back. This way, bdi_start_background_writeback() just needs to wake up the flusher thread. It will do background writeback as soon as there is no other work. This is a preparatory patch for the next patch which stops background writeback as soon as there is other work to do. Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Cc: Jan Engelhardt Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 61 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d06ccc953aa..3a07f6d8bc0b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ +static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { - trace_writeback_queue(bdi, work); - - spin_lock_bh(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); if (bdi->wb.task) { wake_up_process(bdi->wb.task); } else { @@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * The bdi thread isn't there, wake up the forker thread which * will create and run it. */ - trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } +} + +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + trace_writeback_queue(bdi, work); + + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (!bdi->wb.task) + trace_writeback_nothread(bdi, work); + bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); } static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, bool for_background) + bool range_cyclic) { struct wb_writeback_work *work; @@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; - work->for_background = for_background; bdi_queue_work(bdi, work); } @@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - __bdi_start_writeback(bdi, nr_pages, true, false); + __bdi_start_writeback(bdi, nr_pages, true); } /** @@ -152,13 +158,20 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) * @bdi: the backing device to write from * * Description: - * This does WB_SYNC_NONE background writeback. The IO is only - * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. */ void bdi_start_background_writeback(struct backing_dev_info *bdi) { - __bdi_start_writeback(bdi, LONG_MAX, true, true); + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + spin_lock_bh(&bdi->wb_lock); + bdi_wakeup_flusher(bdi); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -718,6 +731,23 @@ static unsigned long get_nr_dirty_pages(void) get_nr_dirty_inodes(); } +static long wb_check_background_flush(struct bdi_writeback *wb) +{ + if (over_bground_thresh()) { + + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &work); + } + + return 0; +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -787,6 +817,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); + wrote += wb_check_background_flush(wb); clear_bit(BDI_writeback_running, &wb->bdi->state); return wrote; @@ -873,7 +904,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, false); + __bdi_start_writeback(bdi, nr_pages, false); } rcu_read_unlock(); } -- cgit From 71927e84e0aebfbe5a91565c3b207af25a4e9162 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 13 Jan 2011 15:45:46 -0800 Subject: writeback: trace wakeup event for background writeback This tracks when balance_dirty_pages() tries to wakeup the flusher thread for background writeback (if it was not started already). Suggested-by: Christoph Hellwig Signed-off-by: Wu Fengguang Cc: Jan Kara Cc: Johannes Weiner Cc: Dave Chinner Cc: Jan Engelhardt Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 1 + include/trace/events/writeback.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3a07f6d8bc0b..482de0a92ca7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -169,6 +169,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ + trace_writeback_wake_background(bdi); spin_lock_bh(&bdi->wb_lock); bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 89a2b2db4375..4e249b927eaa 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \ TP_ARGS(bdi)) DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_wake_background); DEFINE_WRITEBACK_EVENT(writeback_wake_thread); DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); -- cgit From aa373cf550994623efb5d49a4d8775bafd10bbc1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 13 Jan 2011 15:45:47 -0800 Subject: writeback: stop background/kupdate works from livelocking other works Background writeback is easily livelockable in a loop in wb_writeback() by a process continuously re-dirtying pages (or continuously appending to a file). This is in fact intended as the target of background writeback is to write dirty pages it can find as long as we are over dirty_background_threshold. But the above behavior gets inconvenient at times because no other work queued in the flusher thread's queue gets processed. In particular, since e.g. sync(1) relies on flusher thread to do all the IO for it, sync(1) can hang forever waiting for flusher thread to do the work. Generally, when a flusher thread has some work queued, someone submitted the work to achieve a goal more specific than what background writeback does. Moreover by working on the specific work, we also reduce amount of dirty pages which is exactly the target of background writeout. So it makes sense to give specific work a priority over a generic page cleaning. Thus we interrupt background writeback if there is some other work to do. We return to the background writeback after completing all the queued work. This may delay the writeback of expired inodes for a while, however the expired inodes will eventually be flushed to disk as long as the other works won't livelock. [fengguang.wu@intel.com: update comment] Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Cc: Jan Engelhardt Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 482de0a92ca7..9e72d04e706e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -650,6 +650,16 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->nr_pages <= 0) break; + /* + * Background writeout and kupdate-style writeback may + * run forever. Stop them if there is other work to do + * so that e.g. sync can proceed. They'll be restarted + * after the other works are all done. + */ + if ((work->for_background || work->for_kupdate) && + !list_empty(&wb->bdi->work_list)) + break; + /* * For background writeout, stop when we are below the * background dirty threshold -- cgit From b9543dac5bbc4aef0a598965b6b34f6259ab9a9b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 13 Jan 2011 15:45:48 -0800 Subject: writeback: avoid livelocking WB_SYNC_ALL writeback When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is usually set to LONG_MAX. The logic in wb_writeback() then calls __writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and we easily end up with non-positive nr_to_write after the function returns, if the inode has more than MAX_WRITEBACK_PAGES dirty pages at the moment. When nr_to_write is <= 0 wb_writeback() decides we need another round of writeback but this is wrong in some cases! For example when a single large file is continuously dirtied, we would never finish syncing it because each pass would be able to write MAX_WRITEBACK_PAGES and inode dirty timestamp never gets updated (as inode is never completely clean). Thus __writeback_inodes_sb() would write the redirtied inode again and again. Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We do not need nr_to_write in WB_SYNC_ALL mode anyway since write_cache_pages() does livelock avoidance using page tagging in WB_SYNC_ALL mode. This makes wb_writeback() call __writeback_inodes_sb() only once on WB_SYNC_ALL. The latter function won't livelock because it works on - a finite set of files by doing queue_io() once at the beginning - a finite set of pages by PAGECACHE_TAG_TOWRITE page tagging After this patch, program from http://lkml.org/lkml/2010/10/24/154 is no longer able to stall sync forever. [fengguang.wu@intel.com: fix locking comment] Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Cc: Jan Engelhardt Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9e72d04e706e..e8063c938dd2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -630,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; + long write_chunk; struct inode *inode; if (wbc.for_kupdate) { @@ -642,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * __writeback_inodes_sb() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (wbc.sync_mode == WB_SYNC_NONE) + write_chunk = MAX_WRITEBACK_PAGES; + else + write_chunk = LONG_MAX; + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* @@ -668,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; trace_wbc_writeback_start(&wbc, wb->bdi); @@ -678,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb, writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); - work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; /* * If we consumed everything, see if we have more @@ -694,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Did we write something? Try for more */ - if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) + if (wbc.nr_to_write < write_chunk) continue; /* * Nothing written. Wait for some inode to -- cgit From c691b9d983d7015d54057034f4cd9b6d8affd976 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 13 Jan 2011 15:45:48 -0800 Subject: sync_inode_metadata: fix comment Use correct function name, remove incorrect apostrophe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e8063c938dd2..05aab263e9aa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1303,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) EXPORT_SYMBOL(sync_inode); /** - * sync_inode - write an inode to disk + * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * - * Write an inode to disk and adjust it's dirty state after completion. + * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ -- cgit From c3f0da631539b3b8e17f6dda567af9958d49d14f Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 13 Jan 2011 15:45:49 -0800 Subject: mm/page-writeback.c: fix __set_page_dirty_no_writeback() return value __set_page_dirty_no_writeback() should return true if it actually transitioned the page from a clean to dirty state although it seems nobody uses its return value at present. Signed-off-by: Bob Liu Acked-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b5d8a1f820a0..28763b8bdbdd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1103,7 +1103,7 @@ EXPORT_SYMBOL(write_one_page); int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) - SetPageDirty(page); + return !TestSetPageDirty(page); return 0; } -- cgit From f0bc0a60b13f209df16062f94e9fb4b90dc08708 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 13 Jan 2011 15:45:50 -0800 Subject: vmscan: factor out kswapd sleeping logic from kswapd() Currently, kswapd() has deep nesting and is slightly hard to read. Clean this up. Signed-off-by: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 92 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 86f8c3418795..cacdf6684971 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2371,6 +2371,50 @@ out: return sc.nr_reclaimed; } +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); +} + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -2389,7 +2433,7 @@ static int kswapd(void *p) unsigned long order; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; @@ -2421,7 +2465,6 @@ static int kswapd(void *p) unsigned long new_order; int ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; pgdat->kswapd_max_order = 0; if (order < new_order) { @@ -2431,52 +2474,9 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current) && !kthread_should_stop()) { - long remaining = 0; - - /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - remaining = schedule_timeout(HZ/10); - finish_wait(&pgdat->kswapd_wait, &wait); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - } - - /* - * After a short sleep, check if it was a - * premature sleep. If not, then go fully - * to sleep until explicitly woken up - */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - - /* - * vmstat counters are not perfectly - * accurate and the estimated value - * for counters such as NR_FREE_PAGES - * can deviate from the true value by - * nr_online_cpus * threshold. To - * avoid the zone watermarks being - * breached while under pressure, we - * reduce the per-cpu vmstat threshold - * while kswapd is awake and restore - * them before going back to sleep. - */ - set_pgdat_percpu_threshold(pgdat, - calculate_normal_threshold); - schedule(); - set_pgdat_percpu_threshold(pgdat, - calculate_pressure_threshold); - } else { - if (remaining) - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); - else - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); - } - } - + kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; } - finish_wait(&pgdat->kswapd_wait, &wait); ret = try_to_freeze(); if (kthread_should_stop()) -- cgit From 9cbb4cb21b19fff46cf1174d0ed699ef710e641c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 13 Jan 2011 15:45:51 -0800 Subject: mm: find_get_pages_contig fixlet Testing ->mapping and ->index without a ref is not stable as the page may have been reused at this point. Signed-off-by: Nick Piggin Reviewed-by: Wu Fengguang Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ca389394fa2a..1a3dd5914726 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -837,9 +837,6 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; - if (page->mapping == NULL || page->index != index) - break; - if (!page_cache_get_speculative(page)) goto repeat; @@ -849,6 +846,16 @@ repeat: goto repeat; } + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != index) { + page_cache_release(page); + break; + } + pages[ret] = page; ret++; index++; -- cgit From c32b0d4b3f19c2f5d29568f8b7b72b61693f1277 Mon Sep 17 00:00:00 2001 From: Hai Shan Date: Thu, 13 Jan 2011 15:45:51 -0800 Subject: fs/mpage.c: consolidate code Merge mpage_end_io_read() and mpage_end_io_write() into mpage_end_io() to eliminate code duplication. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hai Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/mpage.c | 49 +++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index fd56ca2ea556..d78455a81ec9 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -40,7 +40,7 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io_read(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); -} - -static void mpage_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (!uptodate){ - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + if (bio_data_dir(bio) == READ) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* bio_data_dir(bio) == WRITE */ + if (!uptodate) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + end_page_writeback(page); } - end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); } static struct bio *mpage_bio_submit(int rw, struct bio *bio) { - bio->bi_end_io = mpage_end_io_read; - if (rw == WRITE) - bio->bi_end_io = mpage_end_io_write; + bio->bi_end_io = mpage_end_io; submit_bio(rw, bio); return NULL; } -- cgit From 62c70bce8ac236514c610020bb1ae5b8bde965cb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 13 Jan 2011 15:45:52 -0800 Subject: mm: convert sprintf_symbol to %pS Signed-off-by: Joe Perches Acked-by: Pekka Enberg Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 11 ++++------- mm/vmalloc.c | 9 ++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 008cd743a36a..c7ef0070dd86 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, ""); @@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); - - return n + sprintf(buf + n, "\n"); - } - return 0; + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d00c5a..48245af07b10 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2456,13 +2456,8 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); - if (v->caller) { - char buff[KSYM_SYMBOL_LEN]; - - seq_putc(m, ' '); - sprint_symbol(buff, (unsigned long)v->caller); - seq_puts(m, buff); - } + if (v->caller) + seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); -- cgit From 2d90508f638241a2e7422d884767398296ebe720 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 13 Jan 2011 15:45:53 -0800 Subject: mm: smaps: export mlock information Currently there is no way to find whether a process has locked its pages in memory or not. And which of the memory regions are locked in memory. Add a new field "Locked" to export this information via the smaps file. Signed-off-by: Nikanth Karthikesan Acked-by: Balbir Singh Acked-by: Wu Fengguang Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/task_mmu.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9471225212c4..ef757fca470b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -375,6 +375,7 @@ Anonymous: 0 kB Swap: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB +Locked: 374 kB The first of these lines shows the same information as is displayed for the mapping in /proc/PID/maps. The remaining lines show the size of the mapping @@ -670,6 +671,8 @@ varies by architecture and compile options. The following is from a > cat /proc/meminfo +The "Locked" indicates whether the mapping is locked in memory or not. + MemTotal: 16344972 kB MemFree: 13634064 kB diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c3755bd8dd3e..60b914860f81 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v) "Anonymous: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v) mss.anonymous >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; -- cgit From b7aba6984dc048503b69c2a885098cdd430832bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:54 -0800 Subject: mm: compaction: add trace events for memory compaction activity In preparation for a patches promoting the use of memory compaction over lumpy reclaim, this patch adds trace points for memory compaction activity. Using them, we can monitor the scanning activity of the migration and free page scanners as well as the number and success rates of pages passed to page migration. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 74 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 14 +++++++- 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/compaction.h diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h new file mode 100644 index 000000000000..388bcdd26d46 --- /dev/null +++ b/include/trace/events/compaction.h @@ -0,0 +1,74 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM compaction + +#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_COMPACTION_H + +#include +#include +#include "gfpflags.h" + +DECLARE_EVENT_CLASS(mm_compaction_isolate_template, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken), + + TP_STRUCT__entry( + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_taken) + ), + + TP_fast_assign( + __entry->nr_scanned = nr_scanned; + __entry->nr_taken = nr_taken; + ), + + TP_printk("nr_scanned=%lu nr_taken=%lu", + __entry->nr_scanned, + __entry->nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +TRACE_EVENT(mm_compaction_migratepages, + + TP_PROTO(unsigned long nr_migrated, + unsigned long nr_failed), + + TP_ARGS(nr_migrated, nr_failed), + + TP_STRUCT__entry( + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_failed) + ), + + TP_fast_assign( + __entry->nr_migrated = nr_migrated; + __entry->nr_failed = nr_failed; + ), + + TP_printk("nr_migrated=%lu nr_failed=%lu", + __entry->nr_migrated, + __entry->nr_failed) +); + + +#endif /* _TRACE_COMPACTION_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/compaction.c b/mm/compaction.c index 1a8894eadf72..20011a850fef 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts @@ -60,7 +63,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, struct list_head *freelist) { unsigned long zone_end_pfn, end_pfn; - int total_isolated = 0; + int nr_scanned = 0, total_isolated = 0; struct page *cursor; /* Get the last PFN we should scan for free pages at */ @@ -81,6 +84,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (!pfn_valid_within(blockpfn)) continue; + nr_scanned++; if (!PageBuddy(page)) continue; @@ -100,6 +104,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, } } + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; } @@ -234,6 +239,7 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; /* Do not scan outside zone boundaries */ @@ -266,6 +272,7 @@ static unsigned long isolate_migratepages(struct zone *zone, struct page *page; if (!pfn_valid_within(low_pfn)) continue; + nr_scanned++; /* Get the page and skip if free */ page = pfn_to_page(low_pfn); @@ -280,6 +287,7 @@ static unsigned long isolate_migratepages(struct zone *zone, del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; + nr_isolated++; /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) @@ -291,6 +299,8 @@ static unsigned long isolate_migratepages(struct zone *zone, spin_unlock_irq(&zone->lru_lock); cc->migrate_pfn = low_pfn; + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + return cc->nr_migratepages; } @@ -401,6 +411,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); if (nr_remaining) count_vm_events(COMPACTPAGEFAILED, nr_remaining); + trace_mm_compaction_migratepages(nr_migrate - nr_remaining, + nr_remaining); /* Release LRU pages not migrated */ if (!list_empty(&cc->migratepages)) { -- cgit From ee64fc9354e515a79c7232cfde65c88ec627308b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:55 -0800 Subject: mm: vmscan: convert lumpy_mode into a bitmask Currently lumpy_mode is an enum and determines if lumpy reclaim is off, syncronous or asyncronous. In preparation for using compaction instead of lumpy reclaim, this patch converts the flags into a bitmap. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +++--- mm/vmscan.c | 46 ++++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c255fcc587bf..be76429962ca 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index cacdf6684971..3464312bde07 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,11 +51,20 @@ #define CREATE_TRACE_POINTS #include -enum lumpy_mode { - LUMPY_MODE_NONE, - LUMPY_MODE_ASYNC, - LUMPY_MODE_SYNC, -}; +/* + * lumpy_mode determines how the inactive list is shrunk + * LUMPY_MODE_SINGLE: Reclaim only order-0 pages + * LUMPY_MODE_ASYNC: Do not block + * LUMPY_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference + * page from the LRU and reclaim all pages within a + * naturally aligned range + */ +typedef unsigned __bitwise__ lumpy_mode; +#define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) +#define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) +#define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) +#define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -88,7 +97,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - enum lumpy_mode lumpy_reclaim_mode; + lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -274,13 +283,13 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* * Some reclaim have alredy been failed. No worth to try synchronous * lumpy reclaim. */ - if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) return; /* @@ -288,17 +297,18 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, * trouble getting a small set of contiguous pages, we * will reclaim both active and inactive pages. */ + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = mode; + sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = mode; + sc->lumpy_reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; } static void disable_lumpy_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -429,7 +439,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. */ if (PageWriteback(page) && - sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) + (sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -622,7 +632,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) + if (sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM) return PAGEREF_RECLAIM; /* @@ -739,7 +749,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + if ((sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -1324,7 +1334,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1375,7 +1385,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? ISOLATE_INACTIVE : ISOLATE_BOTH, zone, 0, file); zone->pages_scanned += nr_scanned; @@ -1388,7 +1398,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? ISOLATE_INACTIVE : ISOLATE_BOTH, zone, sc->mem_cgroup, 0, file); -- cgit From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:56 -0800 Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults. Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim whe compaction is available called reclaim/compaction. The basic operation is very simple - instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and then compaction is later by either kswapd (compact_zone_order()) or direct compaction (__alloc_pages_direct_compact()). [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: use conventional task_struct naming] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 +++++++ include/linux/kernel.h | 7 ++++ mm/compaction.c | 89 ++++++++++++++++++++++++--------------- mm/migrate.c | 17 ++++++++ mm/page_alloc.c | 16 +++++++ mm/vmscan.c | 102 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 196 insertions(+), 49 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac51552d908..2592883d862d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask) +{ + return 0; +} + static inline void defer_compaction(struct zone *zone) { } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac7022b63..5a9d9059520b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/mm/compaction.c b/mm/compaction.c index 20011a850fef..8fe917ec7c11 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. + * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, +unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask) { struct compact_control cc = { @@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - status = compact_zone_order(zone, order, gfp_mask); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66a7045..94875b265928 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22a1bb7723e4..03a66a31bfcd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int migratetype, unsigned long *did_some_progress) { struct page *page; + struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; + tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask); + tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -2121,6 +2124,19 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; } nopage: diff --git a/mm/vmscan.c b/mm/vmscan.c index 3464312bde07..10ebd74a423c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -59,12 +60,15 @@ * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range + * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone */ typedef unsigned __bitwise__ lumpy_mode; #define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) #define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) #define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) #define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) +#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) - return; + if (COMPACTION_BUILD) + sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) @@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1814,6 +1820,57 @@ out: } } +/* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2307,6 +2371,14 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + /* + * Compact the zone for higher orders to reduce + * latencies for higher-order allocations that + * would ordinarily call try_to_compact_pages() + */ + if (sc.order > PAGE_ALLOC_COSTLY_ORDER) + compact_zone_order(zone, sc.order, sc.gfp_mask); + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit From 77f1fe6b08b13a87391549c8a820ddc817b6f50e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:57 -0800 Subject: mm: migration: allow migration to operate asynchronously and avoid synchronous compaction in the faster path Migration synchronously waits for writeback if the initial passes fails. Callers of memory compaction do not necessarily want this behaviour if the caller is latency sensitive or expects that synchronous migration is not going to have a significantly better success rate. This patch adds a sync parameter to migrate_pages() allowing the caller to indicate if wait_on_page_writeback() is allowed within migration or not. For reclaim/compaction, try_to_compact_pages() is first called asynchronously, direct reclaim runs and then try_to_compact_pages() is called synchronously as there is a greater expectation that it'll succeed. [akpm@linux-foundation.org: build/merge fix] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 ++++++---- include/linux/migrate.h | 12 ++++++++---- mm/compaction.c | 14 ++++++++++---- mm/memory-failure.c | 8 +++++--- mm/memory_hotplug.c | 3 ++- mm/mempolicy.c | 4 ++-- mm/migrate.c | 22 +++++++++++++--------- mm/page_alloc.c | 21 +++++++++++++++------ mm/vmscan.c | 3 ++- 9 files changed, 63 insertions(+), 34 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 2592883d862d..72cba4034785 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,10 +21,11 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, bool sync); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -57,7 +58,8 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { return COMPACT_CONTINUE; } @@ -68,7 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, bool sync) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527fb8261..fa31902803fd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 8fe917ec7c11..47fca1069343 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -33,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -455,7 +456,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, 0, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -482,7 +484,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) + int order, gfp_t gfp_mask, + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -490,6 +493,7 @@ unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -505,11 +509,13 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -533,7 +539,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c044b0e..2323a8039a98 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1290,9 +1290,10 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1413,7 +1414,8 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af5473..584fc5588fdd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -733,7 +733,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + 1, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260fb282..9db274593086 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, 0, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1155,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, 0, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 94875b265928..dc47f6c40353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -682,7 +682,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -841,7 +841,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } @@ -909,7 +909,8 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -929,7 +930,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -958,7 +960,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -974,7 +977,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -1107,7 +1111,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a66a31bfcd..0fd486467b4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1812,7 +1812,8 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; struct task_struct *tsk = current; @@ -1822,7 +1823,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -1859,7 +1860,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -2001,6 +2003,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2063,14 +2066,19 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2134,7 +2142,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 10ebd74a423c..8320d115c85d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2377,7 +2377,8 @@ loop_again: * would ordinarily call try_to_compact_pages() */ if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask); + compact_zone_order(zone, sc.order, sc.gfp_mask, + false); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { -- cgit From 7f0f24967b0349798803260b2e4bf347cffa1990 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:58 -0800 Subject: mm: migration: cleanup migrate_pages API by matching types for offlining and sync With the introduction of the boolean sync parameter, the API looks a little inconsistent as offlining is still an int. Convert offlining to a bool for the sake of being tidy. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 8 ++++---- mm/compaction.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 6 ++++-- mm/migrate.c | 8 ++++---- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa31902803fd..e39aeecfe9a2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,10 +13,10 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int fail_migrate_page(struct address_space *, @@ -35,10 +35,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 47fca1069343..e005a30e968c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -456,7 +456,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0, + (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 584fc5588fdd..a2832c092509 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -734,7 +734,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - 1, true); + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9db274593086..7ee55af8d79c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0, true); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1156,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0, true); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index dc47f6c40353..690d0de993af 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining, bool sync) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining, bool sync) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -909,7 +909,7 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; @@ -960,7 +960,7 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; -- cgit From 9927af740b1b9b1e769310bd0b91425e8047b803 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:59 -0800 Subject: mm: compaction: perform a faster migration scan when migrating asynchronously try_to_compact_pages() is initially called to only migrate pages asychronously and kswapd always compacts asynchronously. Both are being optimistic so it is important to complete the work as quickly as possible to minimise stalls. This patch alters the scanner when asynchronous to only consider MIGRATE_MOVABLE pageblocks as migration candidates. This reduces stalls when allocating huge pages while not impairing allocation success rates as a full scan will be performed if necessary after direct reclaim. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index e005a30e968c..b0fbfdfad298 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -240,6 +240,7 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; @@ -280,6 +281,20 @@ static unsigned long isolate_migratepages(struct zone *zone, if (PageBuddy(page)) continue; + /* + * For async migration, also only scan in MOVABLE blocks. Async + * migration is optimistic to see if the minimum amount of work + * satisfies the allocation + */ + pageblock_nr = low_pfn >> pageblock_order; + if (!cc->sync && last_pageblock_nr != pageblock_nr && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; -- cgit From f3a310bc4e5ce7e55e1c8e25c31e63af017f3e50 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: vmscan: rename lumpy_mode to reclaim_mode With compaction being used instead of lumpy reclaim, the name lumpy_mode and associated variables is a bit misleading. Rename lumpy_mode to reclaim_mode which is a better fit. There is no functional change. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 ++-- mm/vmscan.c | 70 +++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index be76429962ca..ea422aaa23e1 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index 8320d115c85d..7037cc8c60b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -53,22 +53,22 @@ #include /* - * lumpy_mode determines how the inactive list is shrunk - * LUMPY_MODE_SINGLE: Reclaim only order-0 pages - * LUMPY_MODE_ASYNC: Do not block - * LUMPY_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback - * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference + * reclaim_mode determines how the inactive list is shrunk + * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages + * RECLAIM_MODE_ASYNC: Do not block + * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range - * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of * order-0 pages and then compact the zone */ -typedef unsigned __bitwise__ lumpy_mode; -#define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) -#define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) -#define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) -#define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) -#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) +typedef unsigned __bitwise__ reclaim_mode_t; +#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) +#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) +#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) +#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) +#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -101,7 +101,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - lumpy_mode lumpy_reclaim_mode; + reclaim_mode_t reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -284,10 +284,10 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, +static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; /* * Initially assume we are entering either lumpy reclaim or @@ -295,9 +295,9 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, * sync mode or just reclaim order-0 pages later. */ if (COMPACTION_BUILD) - sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; else - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; + sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; /* * Avoid using lumpy reclaim or reclaim/compaction if possible by @@ -305,16 +305,16 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, * under memory pressure */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode |= syncmode; + sc->reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode |= syncmode; + sc->reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } -static void disable_lumpy_reclaim_mode(struct scan_control *sc) +static void reset_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -445,7 +445,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. */ if (PageWriteback(page) && - (sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC)) + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -453,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); + trace_reclaim_flags(page, sc->reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -638,7 +638,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM) + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) return PAGEREF_RECLAIM; /* @@ -755,7 +755,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if ((sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC) && + if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -911,7 +911,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); continue; activate_locked: @@ -924,7 +924,7 @@ activate_locked: keep_locked: unlock_page(page); keep: - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); @@ -1340,7 +1340,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) + if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1384,14 +1384,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - set_lumpy_reclaim_mode(priority, sc, false); + set_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; @@ -1404,7 +1404,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); @@ -1427,7 +1427,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_lumpy_reclaim_mode(priority, sc, true); + set_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, zone, sc); } @@ -1442,7 +1442,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, zone_idx(zone), nr_scanned, nr_reclaimed, priority, - trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + trace_shrink_flags(file, sc->reclaim_mode)); return nr_reclaimed; } @@ -1836,7 +1836,7 @@ static inline bool should_continue_reclaim(struct zone *zone, unsigned long inactive_lru_pages; /* If not in reclaim/compaction mode, stop */ - if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; /* -- cgit From e5a5623b28198aa91ea71ee5d3846757fc76bc87 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: remove unused get_vm_area_node get_vm_area_node() is unused in the kernel and can thus be removed. Signed-off-by: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- mm/vmalloc.c | 7 ------- 2 files changed, 10 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f619ac6..cb73c755fac8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -90,9 +90,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 48245af07b10..78ec9d8bc57c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; -- cgit From ec3f64fc9c196a304c4b7db3e1ff56d640628509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:01 -0800 Subject: mm: remove gfp mask from pcpu_get_vm_areas pcpu_get_vm_areas() only uses GFP_KERNEL allocations, so remove the gfp_t formal and use the mask internally. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 21 +++++++++------------ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index cb73c755fac8..c7348b8d0a81 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -117,7 +117,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78ec9d8bc57c..f67546636322 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2196,17 +2196,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans areas from the end looking for @@ -2217,7 +2216,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2227,8 +2226,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2261,14 +2258,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } -- cgit From d0a21265dfb5fa8ae54e90d0fb6d1c215b10a28a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:02 -0800 Subject: mm: unify module_alloc code for vmalloc Four architectures (arm, mips, sparc, x86) use __vmalloc_area() for module_init(). Much of the code is duplicated and can be generalized in a globally accessible function, __vmalloc_node_range(). __vmalloc_node() now calls into __vmalloc_node_range() with a range of [VMALLOC_START, VMALLOC_END) for functionally equivalent behavior. Each architecture may then use __vmalloc_node_range() directly to remove the duplication of code. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Russell King Cc: Ralf Baechle Cc: "David S. Miller" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 14 +++---------- arch/mips/kernel/module.c | 14 +++---------- arch/sparc/kernel/module.c | 14 ++++--------- arch/x86/kernel/module.c | 17 ++++------------ include/linux/vmalloc.h | 5 +++-- mm/vmalloc.c | 50 +++++++++++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 68 deletions(-) diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 0c1bb68ff4a8..2cfe8161b478 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -38,17 +38,9 @@ #ifdef CONFIG_MMU void *module_alloc(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + __builtin_return_address(0)); } #else /* CONFIG_MMU */ void *module_alloc(unsigned long size) diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 6f51dda87fce..d87a72e9fac7 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { #ifdef MODULE_START - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); #else if (size == 0) return NULL; diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index ee3c7dde8d9f..8d348c474a2f 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -23,17 +23,11 @@ static void *module_map(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); } static char *dot2underscore(char *name) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f2956091735..ab23f1ad4bf1 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -37,20 +37,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c7348b8d0a81..4ed6fcd6b726 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f67546636322..284346ee0e91 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1530,25 +1530,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1558,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1570,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1588,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, -- cgit From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/base.c | 4 +++- include/linux/sched.h | 2 ++ kernel/fork.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ef757fca470b..23cae6548d3a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1323,6 +1323,10 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +The value of /proc//oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc//oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd5d3d7..9d096e82b201 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/linux/sched.h b/include/linux/sched.h index 07402530fc70..f23b5bb6f52e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -634,6 +634,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd80bdf..1499607e4da2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit From 212260aa07135b327752dc02625c68cf4ce04caf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:46:06 -0800 Subject: mm: clear PageError bit in msync & fsync Temporary IO failures, eg. due to loss of both multipath paths, can permanently leave the PageError bit set on a page, resulting in msync or fsync returning -EIO over and over again, even if IO is now getting to the disk correctly. We already clear the AS_ENOSPC and AS_IO bits in mapping->flags in the filemap_fdatawait_range function. Also clearing the PageError bit on the page allows subsequent msync or fsync calls on this file to return without an error, if the subsequent IO succeeds. Unfortunately data written out in the msync or fsync call that returned -EIO can still get lost, because the page dirty bit appears to not get restored on IO error. However, the alternative could be potentially all of memory filling up with uncleanable dirty pages, hanging the system, so there is no nice choice here... Signed-off-by: Rik van Riel Acked-by: Valerie Aurora Acked-by: Jeff Layton Cc: Theodore Ts'o Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- mm/filemap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c460367e..4805fdec8d6e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,7 +198,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) diff --git a/mm/filemap.c b/mm/filemap.c index 1a3dd5914726..b4ad8e36c81a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); -- cgit From b009c024ff0059e293c1937516f2defe56263650 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:07 -0800 Subject: do_wp_page: remove the 'reuse' flag mlocking a shared, writable vma currently causes the corresponding pages to be marked as dirty and queued for writeback. This seems rather unnecessary given that the pages are not being actually modified during mlock. It is understood that for non-shared mappings (file or anon) we want to use a write fault in order to break COW, but there is just no such need for shared mappings. The first two patches in this series do not introduce any behavior change. The intent there is to make it obvious that dirtying file pages is only done in the (writable, shared) case. I think this clarifies the code, but I wouldn't mind dropping these two patches if there is no consensus about them. The last patch is where we actually avoid dirtying shared mappings during mlock. Note that as a side effect of this, we won't call page_mkwrite() for the mappings that define it, and won't be pre-allocating data blocks at the FS level if the mapped file was sparsely allocated. My understanding is that mlock does not need to provide such guarantee, as evidenced by the fact that it never did for the filesystems that don't define page_mkwrite() - including some common ones like ext3. However, I would like to gather feedback on this from filesystem people as a precaution. If this turns out to be a showstopper, maybe block preallocation can be added back on using a different interface. Large shared mlocks are getting significantly (>2x) faster in my tests, as the disk can be fully used for reading the file instead of having to share between this and writeback. This patch: Reorganize the code to remove the 'reuse' flag. No behavior changes. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Kosaki Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Theodore Tso Cc: Michael Rubin Cc: Suleiman Souhlal Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed13..d0cc1c134a64 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2112,7 +2112,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = 0; + int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; @@ -2149,14 +2149,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } - reuse = reuse_swap_page(old_page); - if (reuse) + if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. */ page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + goto reuse; + } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2220,10 +2222,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } dirty_page = old_page; get_page(dirty_page); - reuse = 1; - } - if (reuse) { reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); -- cgit From 72ddc8f72270758951ccefb7d190f364d20215ab Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:08 -0800 Subject: do_wp_page: clarify dirty_page handling Reorganize the code so that dirty pages are handled closer to the place that makes them dirty (handling write fault into shared, writable VMAs). No behavior changes. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Kosaki Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Theodore Tso Cc: Michael Rubin Cc: Suleiman Souhlal Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 72 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d0cc1c134a64..9144fae9a68b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2229,8 +2229,45 @@ reuse: entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - goto unlock; + + if (!dirty_page) + return ret; + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } + put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + + return ret; } /* @@ -2336,39 +2373,6 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. - */ - if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - } - - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); - } return ret; oom_free_new: page_cache_release(new_page); -- cgit From 5ecfda041e4b4bd858d25bbf5a16c2a6c06d7272 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:09 -0800 Subject: mlock: avoid dirtying pages and triggering writeback When faulting in pages for mlock(), we want to break COW for anonymous or file pages within VM_WRITABLE, non-VM_SHARED vmas. However, there is no need to write-fault into VM_SHARED vmas since shared file pages can be mlocked first and dirtied later, when/if they actually get written to. Skipping the write fault is desirable, as we don't want to unnecessarily cause these pages to be dirtied and queued for writeback. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Kosaki Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Theodore Tso Cc: Michael Rubin Cc: Suleiman Souhlal Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++++++- mm/mlock.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 9144fae9a68b..b8f97b8575b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3299,7 +3299,12 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f72..4f318642fbbe 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -171,7 +171,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); gup_flags = FOLL_TOUCH | FOLL_GET; - if (vma->vm_flags & VM_WRITE) + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* We don't try to access the guard page of a stack vma */ -- cgit From fed067da46ad3b9acedaf794a5f05d0bc153280b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:10 -0800 Subject: mlock: only hold mmap_sem in shared mode when faulting in pages Currently mlock() holds mmap_sem in exclusive mode while the pages get faulted in. In the case of a large mlock, this can potentially take a very long time, during which various commands such as 'ps auxw' will block. This makes sysadmins unhappy: real 14m36.232s user 0m0.003s sys 0m0.015s (output from 'time ps auxw' while a 20GB file was being mlocked without being previously preloaded into page cache) I propose that mlock() could release mmap_sem after the VM_LOCKED bits have been set in all appropriate VMAs. Then a second pass could be done to actually mlock the pages, in small batches, releasing mmap_sem when we block on disk access or when we detect some contention. This patch: Before this change, mlock() holds mmap_sem in exclusive mode while the pages get faulted in. In the case of a large mlock, this can potentially take a very long time. Various things will block while mmap_sem is held, including 'ps auxw'. This can make sysadmins angry. I propose that mlock() could release mmap_sem after the VM_LOCKED bits have been set in all appropriate VMAs. Then a second pass could be done to actually mlock the pages with mmap_sem held for reads only. We need to recheck the vma flags after we re-acquire mmap_sem, but this is easy. In the case where a vma has been munlocked before mlock completes, pages that were already marked as PageMlocked() are handled by the munlock() call, and mlock() is careful to not mark new page batches as PageMlocked() after the munlock() call has cleared the VM_LOCKED vma flags. So, the end result will be identical to what'd happen if munlock() had executed after the mlock() call. In a later change, I will allow the second pass to release mmap_sem when blocking on disk accesses or when it is otherwise contended, so that it won't be held for long periods of time even in shared mode. Signed-off-by: Michel Lespinasse Tested-by: Valdis Kletnieks Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 4f318642fbbe..67b3dd8616dc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -377,18 +377,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int ret = 0; int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) goto out; /* don't set VM_LOCKED, don't count */ - if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -424,14 +416,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -444,7 +432,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -487,6 +476,58 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + down_read(&mm->mmap_sem); + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!vma) + vma = find_vma(mm, nstart); + else + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages within the first VMA. + */ + if (vma->vm_flags & VM_LOCKED) { + ret = __mlock_vma_pages_range(vma, nstart, nend); + if (ret < 0 && ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + if (ret) { + ret = __mlock_posix_error_return(ret); + break; + } + } else + make_pages_present(nstart, nend); + } + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -512,6 +553,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len, 0); return error; } @@ -576,6 +619,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(¤t->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) { + /* Ignore errors */ + do_mlock_pages(0, TASK_SIZE, 1); + } out: return ret; } -- cgit From 110d74a921f4d272b47ef6104fcf937df808f4c8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:11 -0800 Subject: mm: add FOLL_MLOCK follow_page flag. Move the code to mlock pages from __mlock_vma_pages_range() to follow_page(). This allows __mlock_vma_pages_range() to not have to break down work into 16-page batches. An additional motivation for doing this within the present patch series is that it'll make it easier for a later chagne to drop mmap_sem when blocking on disk (we'd like to be able to resume at the page that was read from disk instead of at the start of a 16-page batch). Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 22 ++++++++++++++++++ mm/mlock.c | 65 +++++------------------------------------------------- 3 files changed, 28 insertions(+), 60 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451c3029..9ade803bcc1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1415,6 +1415,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index b8f97b8575b7..15e1f19a3b10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,6 +1310,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: diff --git a/mm/mlock.c b/mm/mlock.c index 67b3dd8616dc..25cc9e88c540 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -159,10 +159,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; + int ret; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -170,7 +169,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -185,63 +184,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL); + return max(ret, 0); /* 0 or negative error code */ } /* -- cgit From 5fdb2002131cd4e210b9638a4fc932ec7be491d1 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:12 -0800 Subject: mm: move VM_LOCKED check to __mlock_vma_pages_range() Use a single code path for faulting in pages during mlock. The reason to have it in this patch series is that I did not want to update both code paths in a later change that releases mmap_sem when blocking on disk. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 25cc9e88c540..84da66b7bbf0 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -169,7 +169,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_MLOCK; + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -178,6 +178,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + if (vma->vm_flags & VM_LOCKED) + gup_flags |= FOLL_MLOCK; + /* We don't try to access the guard page of a stack vma */ if (stack_guard_page(vma, start)) { addr += PAGE_SIZE; @@ -456,18 +459,15 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) /* * Now fault in a range of pages within the first VMA. */ - if (vma->vm_flags & VM_LOCKED) { - ret = __mlock_vma_pages_range(vma, nstart, nend); - if (ret < 0 && ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - if (ret) { - ret = __mlock_posix_error_return(ret); - break; - } - } else - make_pages_present(nstart, nend); + ret = __mlock_vma_pages_range(vma, nstart, nend); + if (ret < 0 && ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + if (ret) { + ret = __mlock_posix_error_return(ret); + break; + } } up_read(&mm->mmap_sem); return ret; /* 0 or negative error code */ -- cgit From 53a7706d5ed8f1a53ba062b318773160cc476dde Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:14 -0800 Subject: mlock: do not hold mmap_sem for extended periods of time __get_user_pages gets a new 'nonblocking' parameter to signal that the caller is prepared to re-acquire mmap_sem and retry the operation if needed. This is used to split off long operations if they are going to block on a disk transfer, or when we detect contention on the mmap_sem. [akpm@linux-foundation.org: remove ref to rwsem_is_contended()] Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/memory.c | 23 ++++++++++++++++++----- mm/mlock.c | 40 +++++++++++++++++++++++----------------- mm/nommu.c | 6 ++++-- 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index dedb0aff673f..bd4f581f624a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -243,7 +243,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas); + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking); #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 diff --git a/mm/memory.c b/mm/memory.c index 15e1f19a3b10..1bbe9a22429c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1363,7 +1363,8 @@ no_page_table: int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking) { int i; unsigned long vm_flags; @@ -1463,10 +1464,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + unsigned int fault_flags = 0; + + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; ret = handle_mm_fault(mm, vma, start, - (foll_flags & FOLL_WRITE) ? - FAULT_FLAG_WRITE : 0); + fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) @@ -1482,6 +1488,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, else tsk->min_flt++; + if (ret & VM_FAULT_RETRY) { + *nonblocking = 0; + return i; + } + /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, @@ -1581,7 +1592,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -1606,7 +1618,8 @@ struct page *get_dump_page(unsigned long addr) struct page *page; if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; diff --git a/mm/mlock.c b/mm/mlock.c index 84da66b7bbf0..13e81ee8be9d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -155,13 +155,13 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; int nr_pages = (end - start) / PAGE_SIZE; int gup_flags; - int ret; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -187,9 +187,8 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, nr_pages--; } - ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, - NULL, NULL); - return max(ret, 0); /* 0 or negative error code */ + return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL, nonblocking); } /* @@ -233,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, NULL); /* Hide errors from mmap() and other callers */ return 0; @@ -429,21 +428,23 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; + int locked = 0; int ret = 0; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; - down_read(&mm->mmap_sem); for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ - if (!vma) + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); vma = find_vma(mm, nstart); - else + } else if (nstart >= vma->vm_end) vma = vma->vm_next; if (!vma || vma->vm_start >= end) break; @@ -457,19 +458,24 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) if (nstart < vma->vm_start) nstart = vma->vm_start; /* - * Now fault in a range of pages within the first VMA. + * Now fault in a range of pages. __mlock_vma_pages_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. */ - ret = __mlock_vma_pages_range(vma, nstart, nend); - if (ret < 0 && ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - if (ret) { + ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } ret = __mlock_posix_error_return(ret); break; } + nend = nstart + ret * PAGE_SIZE; + ret = 0; } - up_read(&mm->mmap_sem); + if (locked) + up_read(&mm->mmap_sem); return ret; /* 0 or negative error code */ } diff --git a/mm/nommu.c b/mm/nommu.c index ef4045d010d5..f59e1424d3db 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *retry) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); -- cgit From 1e50df39f6e2c3a4a3394df62baa8a213df16c54 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 13 Jan 2011 15:46:14 -0800 Subject: mempolicy: remove tasklist_lock from migrate_pages Today, tasklist_lock in migrate_pages doesn't protect anything. rcu_read_lock() provide enough protection from pid hash walk. Signed-off-by: KOSAKI Motohiro Reported-by: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7ee55af8d79c..e6d351265aed 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1310,16 +1310,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Find the mm_struct */ rcu_read_lock(); - read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -EINVAL; -- cgit From ddf9c6d472825ceda66b3adff0f6437dbcd37f71 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 13 Jan 2011 15:46:15 -0800 Subject: vmalloc: remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted here. Signed-off-by: Tobias Klauser Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 284346ee0e91..cac13b415635 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); - if (unlikely(IS_ERR(va))) { + if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } -- cgit From 088e54658f559a13c2b988086512d76fe9e8f846 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:16 -0800 Subject: mm: remove likely() from mapping_unevictable() The mapping_unevictable() has a likely() around the mapping parameter. This mapping parameter comes from page_mapping() which has an unlikely() that the page will be set as PAGE_MAPPING_ANON, and if so, it will return NULL. One would think that this unlikely() means that the mapping returned by page_mapping() would not be NULL, but where page_mapping() is used just above mapping_unevictable(), that unlikely() is incorrect most of the time. This means that the "likely(mapping)" in mapping_unevictable() is incorrect most of the time. Running the annotated branch profiler on my main box which runs firefox, evolution, xchat and is part of my distcc farm, I had this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 12872836 1269443893 98 mapping_unevictable pagemap.h 51 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The mapping_unevictable() is also a static inline but seems to be used only once in my setup. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. Perhaps this is enough to remove the unlikely from page_mapping() as well. Signed-off-by: Steven Rostedt Reviewed-by: KOSAKI Motohiro Acked-by: Nick Piggin Acked-by: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2d1ffe3cf1ee..9c66e994540f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) static inline int mapping_unevictable(struct address_space *mapping) { - if (likely(mapping)) + if (mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -- cgit From e20e87795834f2f14cb53baf657b91d9c39f92c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:17 -0800 Subject: mm: remove unlikely() from page_mapping() page_mapping() has a unlikely that the mapping has PAGE_MAPPING_ANON set. But running the annotated branch profiler on a normal desktop system doing vairous tasks (xchat, evolution, firefox, distcc), it is not really that unlikely that the mapping here will have the PAGE_MAPPING_ANON flag set: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. With this much of an error, it's best to simply remove the unlikely and have the compiler and branch prediction figure this out. Signed-off-by: Steven Rostedt Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ade803bcc1b..57e11b2a18ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,7 +657,7 @@ static inline struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); if (unlikely(PageSwapCache(page))) mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) + else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } -- cgit From c585a2678d83ba8fb02fa6b197de0ac7d67377f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:18 -0800 Subject: mm: remove likely() from grab_cache_page_write_begin() Running the annotated branch profiler on a box doing average work (firefox, evolution, xchat, distcc farm), the likely() used in grab_cache_page_write_begin() was incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 1924262 71332401 97 grab_cache_page_write_begin filemap.c 2206 Adding a trace_printk() and running the function tracer limited to just this function I can see: gconfd-2-2696 [000] 4467.268935: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=7 gconfd-2-2696 [000] 4467.268946: grab_cache_page_write_begin <-ext3_write_begin gconfd-2-2696 [000] 4467.268947: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=8 gconfd-2-2696 [000] 4467.268959: grab_cache_page_write_begin <-ext3_write_begin gconfd-2-2696 [000] 4467.268960: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=9 gconfd-2-2696 [000] 4467.268972: grab_cache_page_write_begin <-ext3_write_begin gconfd-2-2696 [000] 4467.268973: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=10 gconfd-2-2696 [000] 4467.268991: grab_cache_page_write_begin <-ext3_write_begin gconfd-2-2696 [000] 4467.268992: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=11 gconfd-2-2696 [000] 4467.269005: grab_cache_page_write_begin <-ext3_write_begin Which shows that a lot of calls from ext3_write_begin will result in the page returned by "find_lock_page" will be NULL. Signed-off-by: Steven Rostedt Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index b4ad8e36c81a..83a45d35468b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2227,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); - if (likely(page)) + if (page) return page; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); -- cgit From 9950474883e027e6e728cbcff25f7f2bf0c96530 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:20 -0800 Subject: mm: kswapd: stop high-order balancing when any suitable zone is balanced Simon Kirby reported the following problem We're seeing cases on a number of servers where cache never fully grows to use all available memory. Sometimes we see servers with 4 GB of memory that never seem to have less than 1.5 GB free, even with a constantly-active VM. In some cases, these servers also swap out while this happens, even though they are constantly reading the working set into memory. We have been seeing this happening for a long time; I don't think it's anything recent, and it still happens on 2.6.36. After some debugging work by Simon, Dave Hansen and others, the prevaling theory became that kswapd is reclaiming order-3 pages requested by SLUB too aggressive about it. There are two apparent problems here. On the target machine, there is a small Normal zone in comparison to DMA32. As kswapd tries to balance all zones, it would continually try reclaiming for Normal even though DMA32 was balanced enough for callers. The second problem is that sleeping_prematurely() does not use the same logic as balance_pgdat() when deciding whether to sleep or not. This keeps kswapd artifically awake. A number of tests were run and the figures from previous postings will look very different for a few reasons. One, the old figures were forcing my network card to use GFP_ATOMIC in attempt to replicate Simon's problem. Second, I previous specified slub_min_order=3 again in an attempt to reproduce Simon's problem. In this posting, I'm depending on Simon to say whether his problem is fixed or not and these figures are to show the impact to the ordinary cases. Finally, the "vmscan" figures are taken from /proc/vmstat instead of the tracepoints. There is less information but recording is less disruptive. The first test of relevance was postmark with a process running in the background reading a large amount of anonymous memory in blocks. The objective was to vaguely simulate what was happening on Simon's machine and it's memory intensive enough to have kswapd awake. POSTMARK traceonly kanyzone Transactions per second: 156.00 ( 0.00%) 153.00 (-1.96%) Data megabytes read per second: 21.51 ( 0.00%) 21.52 ( 0.05%) Data megabytes written per second: 29.28 ( 0.00%) 29.11 (-0.58%) Files created alone per second: 250.00 ( 0.00%) 416.00 (39.90%) Files create/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) Files deleted alone per second: 520.00 ( 0.00%) 420.00 (-23.81%) Files delete/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 16.58 17.4 Total Elapsed Time (seconds) 218.48 222.47 VMstat Reclaim Statistics: vmscan Direct reclaims 0 4 Direct reclaim pages scanned 0 203 Direct reclaim pages reclaimed 0 184 Kswapd pages scanned 326631 322018 Kswapd pages reclaimed 312632 309784 Kswapd low wmark quickly 1 4 Kswapd high wmark quickly 122 475 Kswapd skip congestion_wait 1 0 Pages activated 700040 705317 Pages deactivated 212113 203922 Pages written 9875 6363 Total pages scanned 326631 322221 Total pages reclaimed 312632 309968 %age total pages scanned/reclaimed 95.71% 96.20% %age total pages scanned/written 3.02% 1.97% proc vmstat: Faults Major Faults 300 254 Minor Faults 645183 660284 Page ins 493588 486704 Page outs 4960088 4986704 Swap ins 1230 661 Swap outs 9869 6355 Performance is mildly affected because kswapd is no longer doing as much work and the background memory consumer process is getting in the way. Note that kswapd scanned and reclaimed fewer pages as it's less aggressive and overall fewer pages were scanned and reclaimed. Swap in/out is particularly reduced again reflecting kswapd throwing out fewer pages. The slight performance impact is unfortunate here but it looks like a direct result of kswapd being less aggressive. As the bug report is about too many pages being freed by kswapd, it may have to be accepted for now. The second test is a streaming IO benchmark that was previously used by Johannes to show regressions in page reclaim. MICRO traceonly kanyzone User/Sys Time Running Test (seconds) 29.29 28.87 Total Elapsed Time (seconds) 492.18 488.79 VMstat Reclaim Statistics: vmscan Direct reclaims 2128 1460 Direct reclaim pages scanned 2284822 1496067 Direct reclaim pages reclaimed 148919 110937 Kswapd pages scanned 15450014 16202876 Kswapd pages reclaimed 8503697 8537897 Kswapd low wmark quickly 3100 3397 Kswapd high wmark quickly 1860 7243 Kswapd skip congestion_wait 708 801 Pages activated 9635 9573 Pages deactivated 1432 1271 Pages written 223 1130 Total pages scanned 17734836 17698943 Total pages reclaimed 8652616 8648834 %age total pages scanned/reclaimed 48.79% 48.87% %age total pages scanned/written 0.00% 0.01% proc vmstat: Faults Major Faults 165 221 Minor Faults 9655785 9656506 Page ins 3880 7228 Page outs 37692940 37480076 Swap ins 0 69 Swap outs 19 15 Again fewer pages are scanned and reclaimed as expected and this time the test completed faster. Note that kswapd is hitting its watermarks faster (low and high wmark quickly) which I expect is due to kswapd reclaiming fewer pages. I also ran fs-mark, iozone and sysbench but there is nothing interesting to report in the figures. Performance is not significantly changed and the reclaim statistics look reasonable. Tgis patch: When the allocator enters its slow path, kswapd is woken up to balance the node. It continues working until all zones within the node are balanced. For order-0 allocations, this makes perfect sense but for higher orders it can have unintended side-effects. If the zone sizes are imbalanced, kswapd may reclaim heavily within a smaller zone discarding an excessive number of pages. The user-visible behaviour is that kswapd is awake and reclaiming even though plenty of pages are free from a suitable zone. This patch alters the "balance" logic for high-order reclaim allowing kswapd to stop if any suitable zone becomes balanced to reduce the number of pages it reclaims from other zones. kswapd still tries to ensure that order-0 watermarks for all zones are met before sleeping. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 8 +++--- mm/vmscan.c | 68 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 48906629335c..dad3612a7e39 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,6 +639,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -654,7 +655,7 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0fd486467b4b..c31460918506 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1944,13 +1944,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int @@ -2028,7 +2029,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background diff --git a/mm/vmscan.c b/mm/vmscan.c index 7037cc8c60b6..3584067800e1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2246,11 +2246,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int classzone_idx) { int all_zones_ok; + int any_zone_ok; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2273,7 +2276,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2282,6 +2284,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + any_zone_ok = 0; /* * Scan in the highmem->dma direction for the highest @@ -2400,10 +2403,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= classzone_idx) + any_zone_ok = 1; } } - if (all_zones_ok) + if (all_zones_ok || (order && any_zone_ok)) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2426,7 +2431,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Any zone below pgdats classzone_idx must meet the high + * watermark for a balanced node + */ + if (!(all_zones_ok || (order && any_zone_ok))) { cond_resched(); try_to_freeze(); @@ -2451,6 +2462,36 @@ out: goto loop_again; } + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + return sc.nr_reclaimed; } @@ -2514,6 +2555,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2544,21 +2586,27 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; } ret = try_to_freeze(); @@ -2571,7 +2619,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + balance_pgdat(pgdat, order, classzone_idx); } } return 0; @@ -2580,7 +2628,7 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; @@ -2590,8 +2638,10 @@ void wakeup_kswapd(struct zone *zone, int order) if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) + if (pgdat->kswapd_max_order < order) { pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) -- cgit From 1741c87757448cedd03224f01586504f9256415d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:21 -0800 Subject: mm: kswapd: keep kswapd awake for high-order allocations until a percentage of the node is balanced When reclaiming for high-orders, kswapd is responsible for balancing a node but it should not reclaim excessively. It avoids excessive reclaim by considering if any zone in a node is balanced then the node is balanced. In the cases where there are imbalanced zone sizes (e.g. ZONE_DMA with both ZONE_DMA32 and ZONE_NORMAL), kswapd can go to sleep prematurely as just one small zone was balanced. This alters the sleep logic of kswapd slightly. It counts the number of pages that make up the balanced zones. If the total number of balanced pages is more than a quarter of the zone, kswapd will go back to sleep. This should keep a node balanced without reclaiming an excessive number of pages. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3584067800e1..d3488828331a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2198,10 +2198,40 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* + * pgdat_balanced is used when checking if a node is balanced for high-order + * allocations. Only zones that meet watermarks and are in a zone allowed + * by the callers classzone_idx are added to balanced_pages. The total of + * balanced pages must be at least 25% of the zones allowed by classzone_idx + * for the node to be considered balanced. Forcing all zones to be balanced + * for high orders can cause excessive reclaim when there are imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * precentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, + int classzone_idx) +{ + unsigned long present_pages = 0; + int i; + + for (i = 0; i <= classzone_idx; i++) + present_pages += pgdat->node_zones[i].present_pages; + + return balanced_pages > (present_pages >> 2); +} + /* is kswapd sleeping prematurely? */ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) { int i; + unsigned long balanced = 0; + bool all_zones_ok = true; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) @@ -2219,10 +2249,20 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) - return 1; + all_zones_ok = false; + else + balanced += zone->present_pages; } - return 0; + /* + * For high-order requests, the balanced zones must contain at least + * 25% of the nodes pages for kswapd to sleep. For order-0, all zones + * must be balanced + */ + if (order) + return pgdat_balanced(pgdat, balanced, 0); + else + return !all_zones_ok; } /* @@ -2250,7 +2290,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int all_zones_ok; - int any_zone_ok; + unsigned long balanced; int priority; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -2284,7 +2324,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; - any_zone_ok = 0; + balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2404,11 +2444,11 @@ loop_again: */ zone_clear_flag(zone, ZONE_CONGESTED); if (i <= classzone_idx) - any_zone_ok = 1; + balanced += zone->present_pages; } } - if (all_zones_ok || (order && any_zone_ok)) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2434,10 +2474,10 @@ out: /* * order-0: All zones must meet high watermark for a balanced node - * high-order: Any zone below pgdats classzone_idx must meet the high - * watermark for a balanced node + * high-order: Balanced zones must make up at least 25% of the node + * for the node to be balanced */ - if (!(all_zones_ok || (order && any_zone_ok))) { + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))) { cond_resched(); try_to_freeze(); -- cgit From 0abdee2bd4118366c62349a304f81537be69af33 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:22 -0800 Subject: mm: kswapd: use the order that kswapd was reclaiming at for sleeping_prematurely() Before kswapd goes to sleep, it uses sleeping_prematurely() to check if there was a race pushing a zone below its watermark. If the race happened, it stays awake. However, balance_pgdat() can decide to reclaim at order-0 if it decides that high-order reclaim is not working as expected. This information is not passed back to sleeping_prematurely(). The impact is that kswapd remains awake reclaiming pages long after it should have gone to sleep. This patch passes the adjusted order to sleeping_prematurely and uses the same logic as balance_pgdat to decide if it's ok to go to sleep. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d3488828331a..46711f080f38 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2227,7 +2227,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, } /* is kswapd sleeping prematurely? */ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) { int i; unsigned long balanced = 0; @@ -2237,7 +2237,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (remaining) return 1; - /* If after HZ/10, a zone is below the high mark, it's premature */ + /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2269,7 +2269,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -2532,7 +2532,13 @@ out: } } - return sc.nr_reclaimed; + /* + * Return the order we were reclaiming at so sleeping_prematurely() + * makes a decision on the order we were last reclaiming at. However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level + */ + return order; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) @@ -2659,7 +2665,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order, classzone_idx); + order = balance_pgdat(pgdat, order, classzone_idx); } } return 0; -- cgit From 4d40502ea580c35414a1466d86f96484910ebaec Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:23 -0800 Subject: mm: kswapd: reset kswapd_max_order and classzone_idx after reading When kswapd wakes up, it reads its order and classzone from pgdat and calls balance_pgdat. While its awake, it potentially reclaimes at a high order and a low classzone index. This might have been a once-off that was not required by subsequent callers. However, because the pgdat values were not reset, they remain artifically high while balance_pgdat() is running and potentially kswapd enters a second unnecessary reclaim cycle. Reset the pgdat order and classzone index after reading. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 46711f080f38..dafb9d91b604 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2653,6 +2653,8 @@ static int kswapd(void *p) kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = MAX_NR_ZONES - 1; } ret = try_to_freeze(); -- cgit From 355b09c47a0cbb73b3e65a57c03f157f2e7ddb0b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:24 -0800 Subject: mm: kswapd: treat zone->all_unreclaimable in sleeping_prematurely similar to balance_pgdat() After DEF_PRIORITY, balance_pgdat() considers all_unreclaimable zones to be balanced but sleeping_prematurely does not. This can force kswapd to stay awake longer than it should. This patch fixes it. Signed-off-by: Mel Gorman Reviewed-by: Eric B Munson Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index dafb9d91b604..388a0447b8e8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2244,8 +2244,16 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + /* + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well if kswapd + * is to sleep + */ + if (zone->all_unreclaimable) { + balanced += zone->present_pages; continue; + } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) -- cgit From dc83edd941f412e938841b4989be24aa288a1aa6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:26 -0800 Subject: mm: kswapd: use the classzone idx that kswapd was using for sleeping_prematurely() When kswapd is woken up for a high-order allocation, it takes account of the highest usable zone by the caller (the classzone idx). During allocation, this index is used to select the lowmem_reserve[] that should be applied to the watermark calculation in zone_watermark_ok(). When balancing a node, kswapd considers the highest unbalanced zone to be the classzone index. This will always be at least be the callers classzone_idx and can be higher. However, sleeping_prematurely() always considers the lowest zone (e.g. ZONE_DMA) to be the classzone index. This means that sleeping_prematurely() can consider a zone to be balanced that is unusable by the allocation request that originally woke kswapd. This patch changes sleeping_prematurely() to use a classzone_idx matching the value it used in balance_pgdat(). Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Eric B Munson Cc: KAMEZAWA Hiroyuki Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 388a0447b8e8..cfdef0bcc7ab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2227,7 +2227,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, } /* is kswapd sleeping prematurely? */ -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) { int i; unsigned long balanced = 0; @@ -2235,7 +2236,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return 1; + return true; /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { @@ -2256,7 +2257,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - 0, 0)) + classzone_idx, 0)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2268,7 +2269,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * must be balanced */ if (order) - return pgdat_balanced(pgdat, balanced, 0); + return pgdat_balanced(pgdat, balanced, classzone_idx); else return !all_zones_ok; } @@ -2295,7 +2296,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int classzone_idx) + int *classzone_idx) { int all_zones_ok; unsigned long balanced; @@ -2358,6 +2359,7 @@ loop_again: if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; + *classzone_idx = i; break; } } @@ -2451,12 +2453,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= classzone_idx) + if (i <= *classzone_idx) balanced += zone->present_pages; } } - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx))) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2485,7 +2487,7 @@ out: * high-order: Balanced zones must make up at least 25% of the node * for the node to be balanced */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))) { + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { cond_resched(); try_to_freeze(); @@ -2546,10 +2548,11 @@ out: * if another caller entered the allocator slow path while kswapd * was awake, order will remain at the higher level */ + *classzone_idx = end_zone; return order; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -2560,7 +2563,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -2570,7 +2573,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (!sleeping_prematurely(pgdat, order, remaining)) { + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -2658,7 +2661,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order); + kswapd_try_to_sleep(pgdat, order, classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2675,7 +2678,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, classzone_idx); + order = balance_pgdat(pgdat, order, &classzone_idx); } } return 0; -- cgit From ecb256f815232b35ae8382cff36ca8ce0bbd077e Mon Sep 17 00:00:00 2001 From: "Volodymyr G. Lukiianyk" Date: Thu, 13 Jan 2011 15:46:26 -0800 Subject: mm: set correct numa_zonelist_order string when configured on the kernel command line When numa_zonelist_order parameter is set to "node" or "zone" on the command line it's still showing as "default" in sysctl. That's because early_param parsing function changes only user_zonelist_order variable. Fix this by copying user-provided string to numa_zonelist_order if it was successfully parsed. Signed-off-by: Volodymyr G Lukiianyk Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c31460918506..c18d1f7cff84 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2631,9 +2631,16 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - if (s) - return __parse_numa_zonelist_order(s); - return 0; + int ret; + + if (!s) + return 0; + + ret = __parse_numa_zonelist_order(s); + if (ret == 0) + strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + + return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); -- cgit From 240c879f20a605346705be24253bc9fc6fa8a106 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Jan 2011 15:46:27 -0800 Subject: writeback: avoid unnecessary determine_dirtyable_memory call I think determine_dirtyable_memory() is a rather costly function since it need many atomic reads for gathering zone/global page state. But when we use vm_dirty_bytes && dirty_background_bytes, we don't need that costly calculation. This patch eliminates such unnecessary overhead. NOTE : newly added if condition might add overhead in normal path. But it should be _really_ small because anyway we need the access both vm_dirty_bytes and dirty_background_bytes so it is likely to hit the cache. [akpm@linux-foundation.org: fix used-uninitialised warning] Signed-off-by: Minchan Kim Cc: Wu Fengguang Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 28763b8bdbdd..2cb01f6ec5d0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); + unsigned long uninitialized_var(available_memory); struct task_struct *tsk; + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = determine_dirtyable_memory(); + if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else -- cgit From ae52a2adb5afa5ac5ec5fb5c7b24777f84b6c926 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:46:28 -0800 Subject: thp: ksm: free swap when swapcache page is replaced When a swapcache page is replaced by a ksm page, it's best to free that swap immediately. Reported-by: Andrea Arcangeli Signed-off-by: Hugh Dickins Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index 43bc893470b4..b5b907cb0f90 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -800,6 +800,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); -- cgit From 4e9f64c42d0ba5eb0c78569435ada4c224332ce4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:29 -0800 Subject: thp: fix bad_page to show the real reason the page is bad page_count shows the count of the head page, but the actual check is done on the tail page, so show what is really being checked. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c18d1f7cff84..2a67c3bd403a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5618,7 +5618,7 @@ void dump_page(struct page *page) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, page_count(page), page_mapcount(page), + page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); } -- cgit From 1c9bf22c09ae14d65225d9b9619b2eb357350cd7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:30 -0800 Subject: thp: transparent hugepage support documentation Documentation/vm/transhuge.txt Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 298 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 Documentation/vm/transhuge.txt diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt new file mode 100644 index 000000000000..0924aaca3302 --- /dev/null +++ b/Documentation/vm/transhuge.txt @@ -0,0 +1,298 @@ += Transparent Hugepage Support = + +== Objective == + +Performance critical computing applications dealing with large memory +working sets are already running on top of libhugetlbfs and in turn +hugetlbfs. Transparent Hugepage Support is an alternative means of +using huge pages for the backing of virtual memory with huge pages +that supports the automatic promotion and demotion of page sizes and +without the shortcomings of hugetlbfs. + +Currently it only works for anonymous memory mappings but in the +future it can expand over the pagecache layer starting with tmpfs. + +The reason applications are running faster is because of two +factors. The first factor is almost completely irrelevant and it's not +of significant interest because it'll also have the downside of +requiring larger clear-page copy-page in page faults which is a +potentially negative effect. The first factor consists in taking a +single page fault for each 2M virtual region touched by userland (so +reducing the enter/exit kernel frequency by a 512 times factor). This +only matters the first time the memory is accessed for the lifetime of +a memory mapping. The second long lasting and much more important +factor will affect all subsequent accesses to the memory for the whole +runtime of the application. The second factor consist of two +components: 1) the TLB miss will run faster (especially with +virtualization using nested pagetables but almost always also on bare +metal without virtualization) and 2) a single TLB entry will be +mapping a much larger amount of virtual memory in turn reducing the +number of TLB misses. With virtualization and nested pagetables the +TLB can be mapped of larger size only if both KVM and the Linux guest +are using hugepages but a significant speedup already happens if only +one of the two is using hugepages just because of the fact the TLB +miss is going to run faster. + +== Design == + +- "graceful fallback": mm components which don't have transparent + hugepage knowledge fall back to breaking a transparent hugepage and + working on the regular pages and their respective regular pmd/pte + mappings + +- if a hugepage allocation fails because of memory fragmentation, + regular pages should be gracefully allocated instead and mixed in + the same vma without any failure or significant delay and without + userland noticing + +- if some task quits and more hugepages become available (either + immediately in the buddy or through the VM), guest physical memory + backed by regular pages should be relocated on hugepages + automatically (with khugepaged) + +- it doesn't require memory reservation and in turn it uses hugepages + whenever possible (the only possible reservation here is kernelcore= + to avoid unmovable pages to fragment all the memory but such a tweak + is not specific to transparent hugepage support and it's a generic + feature that applies to all dynamic high order allocations in the + kernel) + +- this initial support only offers the feature in the anonymous memory + regions but it'd be ideal to move it to tmpfs and the pagecache + later + +Transparent Hugepage Support maximizes the usefulness of free memory +if compared to the reservation approach of hugetlbfs by allowing all +unused memory to be used as cache or other movable (or even unmovable +entities). It doesn't require reservation to prevent hugepage +allocation failures to be noticeable from userland. It allows paging +and all other advanced VM features to be available on the +hugepages. It requires no modifications for applications to take +advantage of it. + +Applications however can be further optimized to take advantage of +this feature, like for example they've been optimized before to avoid +a flood of mmap system calls for every malloc(4k). Optimizing userland +is by far not mandatory and khugepaged already can take care of long +lived page allocations even for hugepage unaware applications that +deals with large amounts of memory. + +In certain cases when hugepages are enabled system wide, application +may end up allocating more memory resources. An application may mmap a +large region but only touch 1 byte of it, in that case a 2M page might +be allocated instead of a 4k page for no good. This is why it's +possible to disable hugepages system-wide and to only have them inside +MADV_HUGEPAGE madvise regions. + +Embedded systems should enable hugepages only inside madvise regions +to eliminate any risk of wasting any precious byte of memory and to +only run faster. + +Applications that gets a lot of benefit from hugepages and that don't +risk to lose memory by using hugepages, should use +madvise(MADV_HUGEPAGE) on their critical mmapped regions. + +== sysfs == + +Transparent Hugepage Support can be entirely disabled (mostly for +debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to +avoid the risk of consuming more memory resources) or enabled system +wide. This can be achieved with one of: + +echo always >/sys/kernel/mm/transparent_hugepage/enabled +echo madvise >/sys/kernel/mm/transparent_hugepage/enabled +echo never >/sys/kernel/mm/transparent_hugepage/enabled + +It's also possible to limit defrag efforts in the VM to generate +hugepages in case they're not immediately free to madvise regions or +to never try to defrag memory and simply fallback to regular pages +unless hugepages are immediately available. Clearly if we spend CPU +time to defrag memory, we would expect to gain even more by the fact +we use hugepages later instead of regular pages. This isn't always +guaranteed, but it may be more likely in case the allocation is for a +MADV_HUGEPAGE region. + +echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo madvise >/sys/kernel/mm/transparent_hugepage/defrag +echo never >/sys/kernel/mm/transparent_hugepage/defrag + +khugepaged will be automatically started when +transparent_hugepage/enabled is set to "always" or "madvise, and it'll +be automatically shutdown if it's set to "never". + +khugepaged runs usually at low frequency so while one may not want to +invoke defrag algorithms synchronously during the page faults, it +should be worth invoking defrag at least in khugepaged. However it's +also possible to disable defrag in khugepaged: + +echo yes >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag +echo no >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag + +You can also control how many pages khugepaged should scan at each +pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan + +and how many milliseconds to wait in khugepaged between each pass (you +can set this to 0 to run khugepaged at 100% utilization of one core): + +/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs + +and how many milliseconds to wait in khugepaged if there's an hugepage +allocation failure to throttle the next allocation attempt. + +/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs + +The khugepaged progress can be seen in the number of pages collapsed: + +/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed + +for each pass: + +/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans + +== Boot parameter == + +You can change the sysfs boot time defaults of Transparent Hugepage +Support by passing the parameter "transparent_hugepage=always" or +"transparent_hugepage=madvise" or "transparent_hugepage=never" +(without "") to the kernel command line. + +== Need of application restart == + +The transparent_hugepage/enabled values only affect future +behavior. So to make them effective you need to restart any +application that could have been using hugepages. This also applies to +the regions registered in khugepaged. + +== get_user_pages and follow_page == + +get_user_pages and follow_page if run on a hugepage, will return the +head or tail pages as usual (exactly as they would do on +hugetlbfs). Most gup users will only care about the actual physical +address of the page and its temporary pinning to release after the I/O +is complete, so they won't ever notice the fact the page is huge. But +if any driver is going to mangle over the page structure of the tail +page (like for checking page->mapping or other bits that are relevant +for the head page and not the tail page), it should be updated to jump +to check head page instead (while serializing properly against +split_huge_page() to avoid the head and tail pages to disappear from +under it, see the futex code to see an example of that, hugetlbfs also +needed special handling in futex code for similar reasons). + +NOTE: these aren't new constraints to the GUP API, and they match the +same constrains that applies to hugetlbfs too, so any driver capable +of handling GUP on hugetlbfs will also work fine on transparent +hugepage backed mappings. + +In case you can't handle compound pages if they're returned by +follow_page, the FOLL_SPLIT bit can be specified as parameter to +follow_page, so that it will split the hugepages before returning +them. Migration for example passes FOLL_SPLIT as parameter to +follow_page because it's not hugepage aware and in fact it can't work +at all on hugetlbfs (but it instead works fine on transparent +hugepages thanks to FOLL_SPLIT). migration simply can't deal with +hugepages being returned (as it's not only checking the pfn of the +page and pinning it during the copy but it pretends to migrate the +memory in regular page sizes and with regular pte/pmd mappings). + +== Optimizing the applications == + +To be guaranteed that the kernel will map a 2M page immediately in any +memory region, the mmap region has to be hugepage naturally +aligned. posix_memalign() can provide that guarantee. + +== Hugetlbfs == + +You can use hugetlbfs on a kernel that has transparent hugepage +support enabled just fine as always. No difference can be noted in +hugetlbfs other than there will be less overall fragmentation. All +usual features belonging to hugetlbfs are preserved and +unaffected. libhugetlbfs will also work fine as usual. + +== Graceful fallback == + +Code walking pagetables but unware about huge pmds can simply call +split_huge_page_pmd(mm, pmd) where the pmd is the one returned by +pmd_offset. It's trivial to make the code transparent hugepage aware +by just grepping for "pmd_offset" and adding split_huge_page_pmd where +missing after pmd_offset returns the pmd. Thanks to the graceful +fallback design, with a one liner change, you can avoid to write +hundred if not thousand of lines of complex code to make your code +hugepage aware. + +If you're not walking pagetables but you run into a physical hugepage +but you can't handle it natively in your code, you can split it by +calling split_huge_page(page). This is what the Linux VM does before +it tries to swapout the hugepage for example. + +Example to make mremap.c transparent hugepage aware with a one liner +change: + +diff --git a/mm/mremap.c b/mm/mremap.c +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru + return NULL; + + pmd = pmd_offset(pud, addr); ++ split_huge_page_pmd(mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + return NULL; + +== Locking in hugepage aware code == + +We want as much code as possible hugepage aware, as calling +split_huge_page() or split_huge_page_pmd() has a cost. + +To make pagetable walks huge pmd aware, all you need to do is to call +pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the +mmap_sem in read (or write) mode to be sure an huge pmd cannot be +created from under you by khugepaged (khugepaged collapse_huge_page +takes the mmap_sem in write mode in addition to the anon_vma lock). If +pmd_trans_huge returns false, you just fallback in the old code +paths. If instead pmd_trans_huge returns true, you have to take the +mm->page_table_lock and re-run pmd_trans_huge. Taking the +page_table_lock will prevent the huge pmd to be converted into a +regular pmd from under you (split_huge_page can run in parallel to the +pagetable walk). If the second pmd_trans_huge returns false, you +should just drop the page_table_lock and fallback to the old code as +before. Otherwise you should run pmd_trans_splitting on the pmd. In +case pmd_trans_splitting returns true, it means split_huge_page is +already in the middle of splitting the page. So if pmd_trans_splitting +returns true it's enough to drop the page_table_lock and call +wait_split_huge_page and then fallback the old code paths. You are +guaranteed by the time wait_split_huge_page returns, the pmd isn't +huge anymore. If pmd_trans_splitting returns false, you can proceed to +process the huge pmd and the hugepage natively. Once finished you can +drop the page_table_lock. + +== compound_lock, get_user_pages and put_page == + +split_huge_page internally has to distribute the refcounts in the head +page to the tail pages before clearing all PG_head/tail bits from the +page structures. It can do that easily for refcounts taken by huge pmd +mappings. But the GUI API as created by hugetlbfs (that returns head +and tail pages if running get_user_pages on an address backed by any +hugepage), requires the refcount to be accounted on the tail pages and +not only in the head pages, if we want to be able to run +split_huge_page while there are gup pins established on any tail +page. Failure to be able to run split_huge_page if there's any gup pin +on any tail page, would mean having to split all hugepages upfront in +get_user_pages which is unacceptable as too many gup users are +performance critical and they must work natively on hugepages like +they work natively on hugetlbfs already (hugetlbfs is simpler because +hugetlbfs pages cannot be splitted so there wouldn't be requirement of +accounting the pins on the tail pages for hugetlbfs). If we wouldn't +account the gup refcounts on the tail pages during gup, we won't know +anymore which tail page is pinned by gup and which is not while we run +split_huge_page. But we still have to add the gup pin to the head page +too, to know when we can free the compound page in case it's never +splitted during its lifetime. That requires changing not just +get_page, but put_page as well so that when put_page runs on a tail +page (and only on a tail page) it will find its respective head page, +and then it will decrease the head page refcount in addition to the +tail page refcount. To obtain a head page reliably and to decrease its +refcount without race conditions, put_page has to serialize against +__split_huge_page_refcount using a special per-page lock called +compound_lock. -- cgit From a826e422420b461a6247137c292ff83c4800354a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:31 -0800 Subject: thp: mm: define MADV_HUGEPAGE Define MADV_HUGEPAGE. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Arnd Bergmann Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/mman.h | 2 ++ arch/mips/include/asm/mman.h | 2 ++ arch/parisc/include/asm/mman.h | 2 ++ arch/xtensa/include/asm/mman.h | 2 ++ include/asm-generic/mman-common.h | 2 ++ 5 files changed, 10 insertions(+) diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h index 99c56d47879d..f7465418a8f3 100644 --- a/arch/alpha/include/asm/mman.h +++ b/arch/alpha/include/asm/mman.h @@ -53,6 +53,8 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h index c892bfb3e2c1..9d8184c527ff 100644 --- a/arch/mips/include/asm/mman.h +++ b/arch/mips/include/asm/mman.h @@ -77,6 +77,8 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h index 9749c8afe83a..533c5dc173c1 100644 --- a/arch/parisc/include/asm/mman.h +++ b/arch/parisc/include/asm/mman.h @@ -59,6 +59,8 @@ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h index fca4db425f6e..41be9a1ee08f 100644 --- a/arch/xtensa/include/asm/mman.h +++ b/arch/xtensa/include/asm/mman.h @@ -83,6 +83,8 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 3da9e2742fa0..e91392fde4c6 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -45,6 +45,8 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ + /* compatibility flags */ #define MAP_FILE 0 -- cgit From e9da73d67729b58bba256123e2b4651e0d8a01ac Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: compound_lock Add a new compound_lock() needed to serialize put_page against __split_huge_page_refcount(). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 12 +++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 57e11b2a18ba..3b1754ad8785 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -305,6 +306,39 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +static inline void compound_lock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_lock(PG_compound_lock, &page->flags); +#endif +} + +static inline void compound_unlock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_unlock(PG_compound_lock, &page->flags); +#endif +} + +static inline unsigned long compound_lock_irqsave(struct page *page) +{ + unsigned long uninitialized_var(flags); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + local_irq_save(flags); + compound_lock(page); +#endif + return flags; +} + +static inline void compound_unlock_irqrestore(struct page *page, + unsigned long flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + compound_unlock(page); + local_irq_restore(flags); +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4805fdec8d6e..5a743cc87238 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -107,6 +107,9 @@ enum pageflags { #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + PG_compound_lock, #endif __NR_PAGEFLAGS, @@ -397,6 +400,12 @@ static inline void __ClearPageTail(struct page *page) #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) +#else +#define __PG_COMPOUND_LOCK 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -406,7 +415,8 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. -- cgit From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: alter compound get_page/put_page Alter compound get_page/put_page to keep references on subpages too, in order to allow __split_huge_page_refcount to split an hugepage even while subpages have been pinned by one of the get_user_pages() variants. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 12 +++++++ arch/x86/mm/gup.c | 12 +++++++ include/linux/mm.h | 24 ++++++++++++-- mm/swap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 129 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf640c7..fec13200868f 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,6 +16,16 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } + if (PageTail(page)) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799d..06f56fcf9a77 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -105,6 +105,16 @@ static inline void get_head_page_multiple(struct page *page, int nr) atomic_add(nr, &page->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -128,6 +138,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b1754ad8785..a2718e1ed585 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -353,9 +353,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) diff --git a/mm/swap.c b/mm/swap.c index 3f4854205b16..33f5292fe132 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,17 +56,93 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; + + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} - dtor = get_compound_page_dtor(page); - (*dtor)(page); +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +151,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); -- cgit From a95a82e96c48270980dd248ccd5546f1b49e6f8a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:33 -0800 Subject: thp: put_page: recheck PageHead after releasing the compound_lock After releasing the compound_lock split_huge_page can still run and release the page before put_page_testzero runs. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 33f5292fe132..e0eeef940886 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -131,8 +131,12 @@ static void put_compound_page(struct page *page) atomic_dec(&page->_count); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) - __put_compound_page(page_head); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } } else { /* page_head is a dangling pointer */ VM_BUG_ON(PageTail(page)); -- cgit From a5b338f2b0b1ff73ae20c66ab831201549eaec01 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:34 -0800 Subject: thp: update futex compound knowledge Futex code is smarter than most other gup_fast O_DIRECT code and knows about the compound internals. However now doing a put_page(head_page) will not release the pin on the tail page taken by gup-fast, leading to all sort of refcounting bugchecks. Getting a stable head_page is a little tricky. page_head = page is there because if this is not a tail page it's also the page_head. Only in case this is a tail page, compound_head is called, otherwise it's guaranteed unnecessary. And if it's a tail page compound_head has to run atomically inside irq disabled section __get_user_pages_fast before returning. Otherwise ->first_page won't be a stable pointer. Disableing irq before __get_user_page_fast and releasing irq after running compound_head is needed because if __get_user_page_fast returns == 1, it means the huge pmd is established and cannot go away from under us. pmdp_splitting_flush_notify in __split_huge_page_splitting will have to wait for local_irq_enable before the IPI delivery can return. This means __split_huge_page_refcount can't be running from under us, and in turn when we run compound_head(page) we're not reading a dangling pointer from tailpage->first_page. Then after we get to stable head page, we are always safe to call compound_lock and after taking the compound lock on head page we can finally re-check if the page returned by gup-fast is still a tail page. in which case we're set and we didn't need to split the hugepage in order to take a futex on it. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92e6917..52075633373f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. + */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } -- cgit From 8dd60a3a65c1b057bf0031d28436d3447a3c545b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:34 -0800 Subject: thp: clear compound mapping Clear compound mapping for anonymous compound pages like it already happens for regular anonymous pages. But crash if mapping is set for any tail page, also the PageAnon check is meaningless for tail pages. This check only makes sense for the head page, for tail page it can only hide bugs and we definitely don't want to hide bugs. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a67c3bd403a..8be81422d4bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -651,13 +651,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0; i < (1 << order); i++) { - struct page *pg = page + i; - - if (PageAnon(pg)) - pg->mapping = NULL; - bad += free_pages_check(pg); - } + if (PageAnon(page)) + page->mapping = NULL; + for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page + i); if (bad) return false; -- cgit From 0a47de52db04c5eb346e99a8d038e693689bedca Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:35 -0800 Subject: thp: add native_set_pmd_at Used by paravirt and not paravirt set_pmd_at. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada823a13c7c..6d3dde327332 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -530,6 +530,12 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, native_set_pte(ptep, pte); } +static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp , pmd_t pmd) +{ + native_set_pmd(pmdp, pmd); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which -- cgit From 331127f799d1618e2ad978a0d220ed935a7b0ca8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:36 -0800 Subject: thp: add pmd paravirt ops Paravirt ops pmd_update/pmd_update_defer/pmd_set_at. Not all might be necessary (vmware needs pmd_update, Xen needs set_pmd_at, nobody needs pmd_update_defer), but this is to keep full simmetry with pte paravirt ops, which looks cleaner and simpler from a common code POV. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/paravirt.h | 25 +++++++++++++++++++++++++ arch/x86/include/asm/paravirt_types.h | 6 ++++++ arch/x86/kernel/paravirt.c | 3 +++ 3 files changed, 34 insertions(+) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7709c12431b8..2071a8b2b32f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } +static inline void pmd_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); +} static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); } +static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); +} + static inline pte_t __pte(pteval_t val) { pteval_t ret; @@ -543,6 +554,20 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#if PAGETABLE_LEVELS >= 3 + if (sizeof(pmdval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); + else + PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd); +#endif +} +#endif + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b82bac975250..82885099c869 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -265,10 +265,16 @@ struct pv_mmu_ops { void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pmd_update)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp); + void (*pmd_update_defer)(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c5b250011fd4..869e1aeeb71b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, + .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pmd_update = paravirt_nop, + .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -- cgit From 2609ae6d10af0531e826335bd1445d1ace17c847 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:37 -0800 Subject: thp: no paravirt version of pmd ops No paravirt version of set_pmd_at/pmd_update/pmd_update_defer. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6d3dde327332..e576cbd7a343 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #else /* !CONFIG_PARAVIRT */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) +#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) +#define pmd_update(mm, addr, ptep) do { } while (0) +#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) -- cgit From 14fd403f2146f740942d78af4e0ee59396ad8eab Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:37 -0800 Subject: thp: export maybe_mkwrite huge_memory.c needs it too when it fallbacks in copying hugepages into regular fragmented pages if hugepage allocation fails during COW. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++++ mm/memory.c | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a2718e1ed585..6bef67d74adf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -429,6 +429,19 @@ static inline void set_compound_order(struct page *page, unsigned long order) page[1].lru.prev = (void *)order; } +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/mm/memory.c b/mm/memory.c index 1bbe9a22429c..bdf19366b705 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2083,19 +2083,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* -- cgit From 59ff421631295cd54dbf75dcc53d27e84af6d9c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:38 -0800 Subject: thp: comment reminder in destroy_compound_page Warn destroy_compound_page that __split_huge_page_refcount is heavily dependent on its internal behavior. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8be81422d4bd..6e62d5f9d40b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) } } +/* update __split_huge_page_refcount if you change this function */ static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit From 4c76d9d1fb9b21fa10c9e4c1fab2875018a88aa1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:39 -0800 Subject: thp: CONFIG_TRANSPARENT_HUGEPAGE Add config option. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index c2c8a4a11898..3982be2d721a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -302,6 +302,20 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" if EMBEDDED + depends on X86_64 && MMU + default y + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + # # UP and nommu archs use km based percpu allocator # -- cgit From 5f6e8da70a289d403975907371ce5738c726ad3f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:40 -0800 Subject: thp: special pmd_trans_* functions These returns 0 at compile time when the config option is disabled, to allow gcc to eliminate the transparent hugepage function calls at compile time without additional #ifdefs (only the export of those functions have to be visible to gcc but they won't be required at link time and huge_memory.o can be not built at all). _PAGE_BIT_UNUSED1 is never used for pmd, only on pte. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 13 +++++++++++++ arch/x86/include/asm/pgtable_types.h | 2 ++ include/asm-generic/pgtable.h | 11 +++++++++++ 3 files changed, 26 insertions(+) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f86da20347f2..6dffd4c551cc 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -168,6 +168,19 @@ extern void cleanup_highmap(void); #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d1f4a760be23..a81a6bfc1437 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -22,6 +22,7 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ @@ -45,6 +46,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6f3c6ae4fe03..0ab2cd27c60f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -348,6 +348,17 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); #endif +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + return 0; +} +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return 0; +} +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ -- cgit From e2cda322648122dc400c85ada80eaddbc612ef6a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:40 -0800 Subject: thp: add pmd mangling generic functions Some are needed to build but not actually used on archs not supporting transparent hugepages. Others like pmdp_clear_flush are used by x86 too. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 214 ++++++++++++++++++++++++++++++------------ mm/Makefile | 2 +- mm/pgtable-generic.c | 123 ++++++++++++++++++++++++ 3 files changed, 278 insertions(+), 61 deletions(-) create mode 100644 mm/pgtable-generic.c diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 0ab2cd27c60f..f1eddf71dd0c 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,67 +5,108 @@ #ifdef CONFIG_MMU #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -/* - * Largely same as above, but only sets the access flags (dirty, - * accessed, and writable). Furthermore, we know it always gets set - * to a "more permissive" setting, which allows most architectures - * to optimize this. We return whether the PTE actually changed, which - * in turn instructs the caller to do things like update__mmu_cache. - * This used to be done in the caller, but sparc needs minor faults to - * force that call on sun4c so we changed this macro slightly - */ -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ -({ \ - int __changed = !pte_same(*(__ptep), __entry); \ - if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ - flush_tlb_page(__vma, __address); \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - int r = 1; \ - if (!pte_young(__pte)) \ - r = 0; \ - else \ - set_pte_at((__vma)->vm_mm, (__address), \ - (__ptep), pte_mkold(__pte)); \ - r; \ -}) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + int r = 1; + if (!pte_young(pte)) + r = 0; + else + set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); + return r; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + int r = 1; + if (!pmd_young(pmd)) + r = 0; + else + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); + return r; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young; \ - __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ - if (__young) \ - flush_tlb_page(__vma, __address); \ - __young; \ -}) +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define ptep_get_and_clear(__mm, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - pte_clear((__mm), (__address), (__ptep)); \ - __pte; \ +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + pte_clear(mm, address, ptep); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + pmd_clear(mm, address, pmdp); + return pmd; }) +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return __pmd(0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ - __pte; \ -}) +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + int full) +{ + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + return pte; +} #endif /* @@ -74,20 +115,25 @@ * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL -#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ -do { \ - pte_clear((__mm), (__address), (__ptep)); \ -} while (0) +static inline void pte_clear_not_present_full(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, + int full) +{ + pte_clear(mm, address, ptep); +} #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ - flush_tlb_page(__vma, __address); \ - __pte; \ -}) +extern pte_t ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t old_pmd = *pmdp; + set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (pte_val(A) == pte_val(B)) +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + return pte_val(pte_a) == pte_val(pte_b); +} +#endif + +#ifndef __HAVE_ARCH_PMD_SAME +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return pmd_val(pmd_a) == pmd_val(pmd_b); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PAGE_TEST_DIRTY @@ -357,6 +444,13 @@ static inline int pmd_trans_splitting(pmd_t pmd) { return 0; } +#ifndef __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + BUG(); + return 0; +} +#endif /* __HAVE_ARCH_PMD_WRITE */ #endif #endif /* !__ASSEMBLY__ */ diff --git a/mm/Makefile b/mm/Makefile index f73f75a29f82..380772a9ccb8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o + vmalloc.o pagewalk.o pgtable-generic.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 000000000000..d030548047e2 --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,123 @@ +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in asm-generic/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include +#include + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_page(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif -- cgit From db3eb96f4e6281b84dd33c8980dacc27f2efe177 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:41 -0800 Subject: thp: add pmd mangling functions to x86 Add needed pmd mangling functions with symmetry with their pte counterparts. pmdp_splitting_flush() is the only new addition on the pmd_ methods and it's needed to serialize the VM against split_huge_page. It simply atomically sets the splitting bit in a similar way pmdp_clear_flush_young atomically clears the accessed bit. pmdp_splitting_flush() also has to flush the tlb to make it effective against gup_fast, but it wouldn't really require to flush the tlb too. Just the tlb flush is the simplest operation we can invoke to serialize pmdp_splitting_flush() against gup_fast. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable_64.h | 119 +++++++++++++++++++++++++++++++++++--- arch/x86/mm/pgtable.c | 66 +++++++++++++++++++++ 3 files changed, 179 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e576cbd7a343..3278038e9706 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -353,7 +353,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6dffd4c551cc..1fb61a74b2e1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} + +static inline void native_pmd_clear(pmd_t *pmd) +{ + native_set_pmd(pmd, native_make_pmd(0)); +} + static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP @@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #endif } -static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) -{ - *pmdp = pmd; -} - -static inline void native_pmd_clear(pmd_t *pmd) +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { - native_set_pmd(pmd, native_make_pmd(0)); +#ifdef CONFIG_SMP + return native_make_pmd(xchg(&xp->pmd, 0)); +#else + /* native_local_pmdp_get_and_clear, + but duplicated because of cyclic dependency */ + pmd_t ret = *xp; + native_pmd_clear(xp); + return ret; +#endif } static inline void native_set_pud(pud_t *pudp, pud_t pud) @@ -181,6 +194,98 @@ static inline int pmd_trans_huge(pmd_t pmd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)&pmdp->pmd); + pmd_update(mm, addr, pmdp); +} + +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8be8c7d7bc89..65e92d58f942 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + *pmdp = entry; + pmd_update_defer(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + + return changed; +} +#endif + int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &pmdp->pmd); + + if (ret) + pmd_update(vma->vm_mm, addr, pmdp); + + return ret; +} +#endif + int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { @@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int set; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set = !test_and_set_bit(_PAGE_BIT_SPLITTING, + (unsigned long *)&pmdp->pmd); + if (set) { + pmd_update(vma->vm_mm, address, pmdp); + /* need tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve -- cgit From 64cc6ae001d70bc59e5f854e6b5678f59110df16 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:42 -0800 Subject: thp: bail out gup_fast on splitting pmd Force gup_fast to take the slow path and block if the pmd is splitting, not only if it's none. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 06f56fcf9a77..269aa53932e0 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -160,7 +160,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) -- cgit From 8ac1f8320a0073f28cf9e0491af4cd98f504f92a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:43 -0800 Subject: thp: pte alloc trans splitting pte alloc routines must wait for split_huge_page if the pmd is not present and not null (i.e. pmd_trans_splitting). The additional branches are optimized away at compile time by pmd_trans_splitting if the config option is off. However we must pass the vma down in order to know the anon_vma lock to wait for. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/ia64/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/generic_32.c | 2 +- arch/sparc/mm/generic_64.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/um/kernel/skas/mmu.c | 2 +- arch/x86/kernel/tboot.c | 2 +- include/linux/mm.h | 15 +++++++++------ mm/memory.c | 19 +++++++++++++------ mm/mremap.c | 8 +++++--- 11 files changed, 35 insertions(+), 23 deletions(-) diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 93292a18cf77..709244c66fa3 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 1841ee7e65f9..5ca674b74737 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, NULL, pmd, taddr); } return pte; } diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 9163db3e8d15..d7762349ea48 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } } diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c index 5edcac184eaf..e6067b75f11c 100644 --- a/arch/sparc/mm/generic_32.c +++ b/arch/sparc/mm/generic_32.c @@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c index 04f2bf4cd571..3cb00dfd4bd6 100644 --- a/arch/sparc/mm/generic_64.c +++ b/arch/sparc/mm/generic_64.c @@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5fdddf134caa..f4e97646ce23 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } return pte; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3d099f974785..1aee587e9c5d 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, pmd, proc); + pte = pte_alloc_map(mm, NULL, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26141e2..998e972f3b1a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bef67d74adf..14ddd98b063f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,7 +1131,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1200,16 +1201,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); diff --git a/mm/memory.c b/mm/memory.c index bdf19366b705..567bca80ea53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -3253,7 +3260,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map(mm, vma, pmd, address); if (!pte) return VM_FAULT_OOM; diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293a..b09eefaea0b8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -47,7 +47,8 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +63,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -147,7 +149,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; -- cgit From 91a4ee2670e0ee2b0630a0eb24fb9a87ea3acd0a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: add pmd mmu_notifier helpers Add mmu notifier helpers to handle pmd huge operations. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 43dcfbdc39de..cbfab1e9957d 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -243,6 +243,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __pte; \ }) +#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ +({ \ + pmd_t __pmd; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ + __pmd; \ +}) + +#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ +({ \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + pmdp_splitting_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ +}) + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -254,6 +280,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ @@ -305,7 +342,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) } #define ptep_clear_flush_young_notify ptep_clear_flush_young +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define pmdp_clear_flush_notify pmdp_clear_flush +#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit From 4e6af67e970a2ac287739a4c526c857b5bda27ec Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: clear page compound split_huge_page must transform a compound page to a regular page and needs ClearPageCompound. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Reviewed-by: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5a743cc87238..4835cae71047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -347,7 +347,7 @@ static inline void set_page_writeback(struct page *page) * tests can be used in performance sensitive paths. PageCompound is * generally not used in hot code paths. */ -__PAGEFLAG(Head, head) +__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) __PAGEFLAG(Tail, tail) static inline int PageCompound(struct page *page) @@ -355,6 +355,13 @@ static inline int PageCompound(struct page *page) return page->flags & ((1L << PG_head) | (1L << PG_tail)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON(!PageHead(page)); + ClearPageHead(page); +} +#endif #else /* * Reduce page flag use as much as possible by overlapping @@ -392,6 +399,14 @@ static inline void __ClearPageTail(struct page *page) page->flags &= ~PG_head_tail_mask; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); + clear_bit(PG_compound, &page->flags); +} +#endif + #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_MMU -- cgit From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increase the size of the mm struct a bit but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page to avoid decrasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if something we need it to learn handling pmd_trans_huge natively rather being capable of rollback). When split_huge_page runs a pte is needed to succeed the split, to map the newly splitted regular pages with a regular pte. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one liner. The memory waste of those preallocated ptes is negligible and so it is worth it. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 3 +++ kernel/fork.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a782fd..26bc4e2cd275 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -309,6 +309,9 @@ struct mm_struct { #endif #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; diff --git a/kernel/fork.c b/kernel/fork.c index 1499607e4da2..f78f50ba6cb2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit From bae9c19bf12bb2a914a8e530270f41d36cc87c63 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:46 -0800 Subject: thp: split_huge_page_mm/vma split_huge_page_pmd compat code. Each one of those would need to be expanded to hundred of lines of complex code without a fully reliable split_huge_page_pmd design. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 1 + mm/mempolicy.c | 1 + mm/mincore.c | 1 + mm/mprotect.c | 1 + mm/mremap.c | 1 + mm/pagewalk.c | 1 + 6 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 61fb98519622..863f8753ab0a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e6d351265aed..83b7df309fc4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b6..9959bb41570e 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,6 +154,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c5133873097..bd27db6b992b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -88,6 +88,7 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); diff --git a/mm/mremap.c b/mm/mremap.c index b09eefaea0b8..9925b6391b80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) return NULL; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b8b2b0..7cfa6ae02303 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(walk->mm, pmd); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); -- cgit From 3f04f62f90d46a82dd73027c5fd7a15daed5c33d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: split_huge_page paging Paging logic that splits the page before it is unmapped and added to swap to ensure backwards compatibility with the legacy swap code. Eventually swap should natively pageout the hugepages to increase performance and decrease seeking and fragmentation of swap space. swapoff can just skip over huge pmd as they cannot be part of swap yet. In add_to_swap be careful to split the page only if we got a valid swap entry so we don't split hugepages with a full swap. In theory we could split pages before isolating them during the lru scan, but for khugepaged to be safe, I'm relying on either mmap_sem write mode, or PG_lock taken, so split_huge_page has to run either with mmap_sem read/write mode or PG_lock taken. Calling it from isolate_lru_page would make locking more complicated, in addition to that split_huge_page would deadlock if called by __isolate_lru_page because it has to take the lru lock to add the tail pages. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 ++ mm/rmap.c | 1 + mm/swap_state.c | 6 ++++++ mm/swapfile.c | 2 ++ 4 files changed, 11 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2323a8039a98..6a283cc9317c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; + if (unlikely(split_huge_page(page))) + return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ diff --git a/mm/rmap.c b/mm/rmap.c index c95d2ba27a0b..a3197a8a295b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1400,6 +1400,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); + BUG_ON(PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167f..5c8cfabbc9bc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -157,6 +157,12 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) { + swapcache_free(entry, NULL); + return 0; + } + /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC diff --git a/mm/swapfile.c b/mm/swapfile.c index b6adcfbf6f48..07a458d72fa8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (unlikely(pmd_trans_huge(*pmd))) + continue; if (pmd_none_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); -- cgit From 47ad8475c000141eacb3ecda5e5ce4b43a9cd04d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: clear_copy_huge_page Move the copy/clear_huge_page functions to common code to share between hugetlb.c and huge_memory.c. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++ mm/hugetlb.c | 70 +++-------------------------------------------------- mm/memory.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 67 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 14ddd98b063f..cc6ab1038f6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1589,5 +1589,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85855240933d..7bf223d6677b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -2454,7 +2389,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2558,7 +2494,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/memory.c b/mm/memory.c index 567bca80ea53..60e1c68d8218 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3645,3 +3645,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -- cgit From 936a5fe6e6148c0b3ea0d792b903847d9b9931a1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:48 -0800 Subject: thp: kvm mmu transparent hugepage support This should work for both hugetlbfs and transparent hugepages. [akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability] Signed-off-by: Andrea Arcangeli Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 91 ++++++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/paging_tmpl.h | 9 ++++- include/linux/page-flags.h | 12 ++++++ virt/kvm/kvm_main.c | 32 +++++++++++++++- 4 files changed, 125 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cafbb499813..47b2c3288b6b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level, level, max_level; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return PT_PAGE_TABLE_LEVEL; + return true; + return false; +} + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + int host_level, level, max_level; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) return 1; } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t *gfnp, pfn_t *pfnp, int *levelp) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + level == PT_PAGE_TABLE_LEVEL && + PageTransCompound(pfn_to_page(pfn)) && + !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + gfn &= ~mask; + *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + if (!get_page_unless_zero(pfn_to_page(pfn))) + BUG(); + *pfnp = pfn; + } + } +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, { int r; int level; + int force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable; - level = mapping_level(vcpu, gfn); - - /* - * This path builds a PAE pagetable - so we can map 2mb pages at - * maximum. Therefore check if the level is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; + int force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - level = mapping_level(vcpu, gfn); - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 53210f1e94c2..6bccc24c4181 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; + int force_pt_level; unsigned long mmu_seq; bool map_writable; @@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } - if (walker.level >= PT_DIRECTORY_LEVEL) { + if (walker.level >= PT_DIRECTORY_LEVEL) + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + else + force_pt_level = 1; + if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } @@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn, map_writable, prefault); (void)sptep; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4835cae71047..907f1605926b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int PageTransCompound(struct page *page) +{ + return PageCompound(page); +} +#else +static inline int PageTransCompound(struct page *page) +{ + return 0; +} +#endif + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f686251f711..85ab7db0d366 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,8 +104,36 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *page = compound_head(pfn_to_page(pfn)); - return PageReserved(page); + struct page *head; + struct page *tail = pfn_to_page(pfn); + head = compound_head(tail); + if (head != tail) { + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail + * before overwriting first_page, so if + * PageTail is still there it means the head + * pointer isn't dangling. + */ + if (PageTail(tail)) { + /* + * the "head" is not a dangling + * pointer but the hugepage may have + * been splitted from under us (and we + * may not hold a reference count on + * the head page so it can be reused + * before we run PageReferenced), so + * we've to recheck PageTail before + * returning what we just read. + */ + int reserved = PageReserved(head); + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + } + return PageReserved(tail); } return true; -- cgit From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: _GFP_NO_KSWAPD Transparent hugepage allocations must be allowed not to invoke kswapd or any other kind of indirect reclaim (especially when the defrag sysfs is control disabled). It's unacceptable to swap out anonymous pages (potentially anonymous transparent hugepages) in order to create new transparent hugepages. This is true for the MADV_HUGEPAGE areas too (swapping out a kvm virtual machine and so having it suffer an unbearable slowdown, so another one with guest physical memory marked MADV_HUGEPAGE can run 30% faster if it is running memory intensive workloads, makes no sense). If a transparent hugepage allocation fails the slowdown is minor and there is total fallback, so kswapd should never be asked to swapout memory to allow the high order allocation to succeed. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- mm/page_alloc.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfcbec9c..49d2356bb82d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. @@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e62d5f9d40b..bbd0423f2820 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2027,7 +2027,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* -- cgit From 5c3240d92e29ae7bfb9cb58a9b37e80ab40894ff Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: don't alloc harder for gfp nomemalloc even if nowait Not worth throwing away the precious reserved free memory pool for allocations that can fail gracefully (either through mempool or because they're transhuge allocations later falling back to 4k allocations). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bbd0423f2820..e7664b9f706c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1971,7 +1971,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { - alloc_flags |= ALLOC_HARDER; + /* + * Not worth trying to allocate harder for + * __GFP_NOMEMALLOC even if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. -- cgit From 71e3aac0724ffe8918992d76acfe3aad7d8724a5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core Lately I've been working to make KVM use hugepages transparently without the usual restrictions of hugetlbfs. Some of the restrictions I'd like to see removed: 1) hugepages have to be swappable or the guest physical memory remains locked in RAM and can't be paged out to swap 2) if a hugepage allocation fails, regular pages should be allocated instead and mixed in the same vma without any failure and without userland noticing 3) if some task quits and more hugepages become available in the buddy, guest physical memory backed by regular pages should be relocated on hugepages automatically in regions under madvise(MADV_HUGEPAGE) (ideally event driven by waking up the kernel deamon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes not null) 4) avoidance of reservation and maximization of use of hugepages whenever possible. Reservation (needed to avoid runtime fatal faliures) may be ok for 1 machine with 1 database with 1 database cache with 1 database cache size known at boot time. It's definitely not feasible with a virtualization hypervisor usage like RHEV-H that runs an unknown number of virtual machines with an unknown size of each virtual machine with an unknown amount of pagecache that could be potentially useful in the host for guest not using O_DIRECT (aka cache=off). hugepages in the virtualization hypervisor (and also in the guest!) are much more important than in a regular host not using virtualization, becasue with NPT/EPT they decrease the tlb-miss cacheline accesses from 24 to 19 in case only the hypervisor uses transparent hugepages, and they decrease the tlb-miss cacheline accesses from 19 to 15 in case both the linux hypervisor and the linux guest both uses this patch (though the guest will limit the addition speedup to anonymous regions only for now...). Even more important is that the tlb miss handler is much slower on a NPT/EPT guest than for a regular shadow paging or no-virtualization scenario. So maximizing the amount of virtual memory cached by the TLB pays off significantly more with NPT/EPT than without (even if there would be no significant speedup in the tlb-miss runtime). The first (and more tedious) part of this work requires allowing the VM to handle anonymous hugepages mixed with regular pages transparently on regular anonymous vmas. This is what this patch tries to achieve in the least intrusive possible way. We want hugepages and hugetlb to be used in a way so that all applications can benefit without changes (as usual we leverage the KVM virtualization design: by improving the Linux VM at large, KVM gets the performance boost too). The most important design choice is: always fallback to 4k allocation if the hugepage allocation fails! This is the _very_ opposite of some large pagecache patches that failed with -EIO back then if a 64k (or similar) allocation failed... Second important decision (to reduce the impact of the feature on the existing pagetable handling code) is that at any time we can split an hugepage into 512 regular pages and it has to be done with an operation that can't fail. This way the reliability of the swapping isn't decreased (no need to allocate memory when we are short on memory to swap) and it's trivial to plug a split_huge_page* one-liner where needed without polluting the VM. Over time we can teach mprotect, mremap and friends to handle pmd_trans_huge natively without calling split_huge_page*. The fact it can't fail isn't just for swap: if split_huge_page would return -ENOMEM (instead of the current void) we'd need to rollback the mprotect from the middle of it (ideally including undoing the split_vma) which would be a big change and in the very wrong direction (it'd likely be simpler not to call split_huge_page at all and to teach mprotect and friends to handle hugepages instead of rolling them back from the middle). In short the very value of split_huge_page is that it can't fail. The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and incremental and it'll just be an "harmless" addition later if this initial part is agreed upon. It also should be noted that locking-wise replacing regular pages with hugepages is going to be very easy if compared to what I'm doing below in split_huge_page, as it will only happen when page_count(page) matches page_mapcount(page) if we can take the PG_lock and mmap_sem in write mode. collapse_huge_page will be a "best effort" that (unlike split_huge_page) can fail at the minimal sign of trouble and we can try again later. collapse_huge_page will be similar to how KSM works and the madvise(MADV_HUGEPAGE) will work similar to madvise(MADV_MERGEABLE). The default I like is that transparent hugepages are used at page fault time. This can be changed with /sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set to three values "always", "madvise", "never" which mean respectively that hugepages are always used, or only inside madvise(MADV_HUGEPAGE) regions, or never used. /sys/kernel/mm/transparent_hugepage/defrag instead controls if the hugepage allocation should defrag memory aggressively "always", only inside "madvise" regions, or "never". The pmd_trans_splitting/pmd_trans_huge locking is very solid. The put_page (from get_user_page users that can't use mmu notifier like O_DIRECT) that runs against a __split_huge_page_refcount instead was a pain to serialize in a way that would result always in a coherent page count for both tail and head. I think my locking solution with a compound_lock taken only after the page_first is valid and is still a PageHead should be safe but it surely needs review from SMP race point of view. In short there is no current existing way to serialize the O_DIRECT final put_page against split_huge_page_refcount so I had to invent a new one (O_DIRECT loses knowledge on the mapping status by the time gup_fast returns so...). And I didn't want to impact all gup/gup_fast users for now, maybe if we change the gup interface substantially we can avoid this locking, I admit I didn't think too much about it because changing the gup unpinning interface would be invasive. If we ignored O_DIRECT we could stick to the existing compound refcounting code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM (and any other mmu notifier user) would call it without FOLL_GET (and if FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the current task mmu notifier list yet). But O_DIRECT is fundamental for decent performance of virtualized I/O on fast storage so we can't avoid it to solve the race of put_page against split_huge_page_refcount to achieve a complete hugepage feature for KVM. Swap and oom works fine (well just like with regular pages ;). MMU notifier is handled transparently too, with the exception of the young bit on the pmd, that didn't have a range check but I think KVM will be fine because the whole point of hugepages is that EPT/NPT will also use a huge pmd when they notice gup returns pages with PageCompound set, so they won't care of a range and there's just the pmd young bit to check in that case. NOTE: in some cases if the L2 cache is small, this may slowdown and waste memory during COWs because 4M of memory are accessed in a single fault instead of 8k (the payoff is that after COW the program can run faster). So we might want to switch the copy_huge_page (and clear_huge_page too) to not temporal stores. I also extensively researched ways to avoid this cache trashing with a full prefault logic that would cow in 8k/16k/32k/64k up to 1M (I can send those patches that fully implemented prefault) but I concluded they're not worth it and they add an huge additional complexity and they remove all tlb benefits until the full hugepage has been faulted in, to save a little bit of memory and some cache during app startup, but they still don't improve substantially the cache-trashing during startup if the prefault happens in >4k chunks. One reason is that those 4k pte entries copied are still mapped on a perfectly cache-colored hugepage, so the trashing is the worst one can generate in those copies (cow of 4k page copies aren't so well colored so they trashes less, but again this results in software running faster after the page fault). Those prefault patches allowed things like a pte where post-cow pages were local 4k regular anon pages and the not-yet-cowed pte entries were pointing in the middle of some hugepage mapped read-only. If it doesn't payoff substantially with todays hardware it will payoff even less in the future with larger l2 caches, and the prefault logic would blot the VM a lot. If one is emebdded transparent_hugepage can be disabled during boot with sysfs or with the boot commandline parameter transparent_hugepage=0 (or transparent_hugepage=2 to restrict hugepages inside madvise regions) that will ensure not a single hugepage is allocated at boot time. It is simple enough to just disable transparent hugepage globally and let transparent hugepages be allocated selectively by applications in the MADV_HUGEPAGE region (both at page fault time, and if enabled with the collapse_huge_page too through the kernel daemon). This patch supports only hugepages mapped in the pmd, archs that have smaller hugepages will not fit in this patch alone. Also some archs like power have certain tlb limits that prevents mixing different page size in the same regions so they will not fit in this framework that requires "graceful fallback" to basic PAGE_SIZE in case of physical memory fragmentation. hugetlbfs remains a perfect fit for those because its software limits happen to match the hardware limits. hugetlbfs also remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped to be found not fragmented after a certain system uptime and that would be very expensive to defragment with relocation, so requiring reservation. hugetlbfs is the "reservation way", the point of transparent hugepages is not to have any reservation at all and maximizing the use of cache and hugepages at all times automatically. Some performance result: vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largep ages3 memset page fault 1566023 memset tlb miss 453854 memset second tlb miss 453321 random access tlb miss 41635 random access second tlb miss 41658 vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3 memset page fault 1566471 memset tlb miss 453375 memset second tlb miss 453320 random access tlb miss 41636 random access second tlb miss 41637 vmx andrea # ./largepages3 memset page fault 1566642 memset tlb miss 453417 memset second tlb miss 453313 random access tlb miss 41630 random access second tlb miss 41647 vmx andrea # ./largepages3 memset page fault 1566872 memset tlb miss 453418 memset second tlb miss 453315 random access tlb miss 41618 random access second tlb miss 41659 vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage vmx andrea # ./largepages3 memset page fault 2182476 memset tlb miss 460305 memset second tlb miss 460179 random access tlb miss 44483 random access second tlb miss 44186 vmx andrea # ./largepages3 memset page fault 2182791 memset tlb miss 460742 memset second tlb miss 459962 random access tlb miss 43981 random access second tlb miss 43988 ============ #include #include #include #include #define SIZE (3UL*1024*1024*1024) int main() { char *p = malloc(SIZE), *p2; struct timeval before, after; gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset page fault %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); return 0; } ============ Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 5 + include/linux/gfp.h | 3 + include/linux/huge_mm.h | 118 +++++ include/linux/mm.h | 4 + include/linux/mm_inline.h | 11 +- include/linux/page-flags.h | 21 + include/linux/rmap.h | 2 + include/linux/swap.h | 2 + mm/Makefile | 1 + mm/huge_memory.c | 901 ++++++++++++++++++++++++++++++++++++++ mm/internal.h | 4 + mm/memory.c | 84 +++- mm/rmap.c | 62 ++- mm/swap.c | 37 ++ 14 files changed, 1220 insertions(+), 35 deletions(-) create mode 100644 include/linux/huge_mm.h create mode 100644 mm/huge_memory.c diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1fb61a74b2e1..b2df039a4119 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -286,6 +286,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 49d2356bb82d..d95082cc6f4a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -109,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 000000000000..9301824c7491 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,118 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. \ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index cc6ab1038f6f..78adec4ba9f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -111,6 +111,9 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if BITS_PER_LONG > 32 +#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ +#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -243,6 +246,7 @@ struct inode; * files which need it (119 of them) */ #include +#include /* * Methods to modify the page usage count. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b877b8db..650f31eabdb1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -20,13 +20,20 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); + list_add(&page->lru, head); __inc_zone_state(zone, NR_LRU_BASE + l); mem_cgroup_add_lru_list(page, l); } +static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 907f1605926b..4ca1241ef94e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -410,11 +410,32 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. + */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + static inline int PageTransCompound(struct page *page) { return PageCompound(page); } + #else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + static inline int PageTransCompound(struct page *page) { return 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0da2071..e9fd04ca1e51 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e71d2cc..4d559325d919 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); diff --git a/mm/Makefile b/mm/Makefile index 380772a9ccb8..2b1b575ae712 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 000000000000..0c1e8f939f7c --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,901 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned long transparent_hugepage_flags __read_mostly = + (1<page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i])) { + while (--i >= 0) + put_page(pages[i]); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_PMD_NR; i++) + put_page(pages[i]); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + put_page(new_page); + else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)); + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(zone, page, page_tail); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. + */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} diff --git a/mm/internal.h b/mm/internal.h index bd4f581f624a..69488205723d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { diff --git a/mm/memory.c b/mm/memory.c index 60e1c68d8218..c50a195041ec 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -726,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -802,6 +802,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1004,6 +1014,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1280,11 +1299,27 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -3179,9 +3214,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3260,9 +3295,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, vma, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } diff --git a/mm/rmap.c b/mm/rmap.c index a3197a8a295b..e41375a6b029 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -360,7 +360,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. */ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +435,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +491,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -526,9 +510,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -1202,7 +1216,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); diff --git a/mm/swap.c b/mm/swap.c index e0eeef940886..c02f93611a84 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -479,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. -- cgit From 8a07651ee8cdaa9e27cb4ae372aed347533770f5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core fixlet If you configure THP in addition to HUGETLB_PAGE on x86_32 without PAE, the p?d-folding works out that munlock_vma_pages_range() can crash to follow_page()'s pud_huge() BUG_ON(flags & FOLL_GET): it needs the same VM_HUGETLB check already there on the pmd_huge() line. Conveniently, openSUSE provides a "blogd" which tests this out at startup! Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index c50a195041ec..840ce9d98f8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1288,7 +1288,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; - if (pud_huge(*pud)) { + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; -- cgit From 05759d380a9d7f131a475186c07fce58ceaa8902 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:53 -0800 Subject: thp: split_huge_page anon_vma ordering dependency This documents how split_huge_page is safe vs new vma inserctions into the anon_vma that may have already released the anon_vma->lock but not established pmds yet when split_huge_page starts. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 16 ++++++++++++++++ mm/rmap.c | 4 ++++ 2 files changed, 20 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0c1e8f939f7c..763507932898 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -841,6 +841,19 @@ static void __split_huge_page(struct page *page, continue; mapcount += __split_huge_page_splitting(page, vma, addr); } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. This guarantes that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) + printk(KERN_ERR "mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); BUG_ON(mapcount != page_mapcount(page)); __split_huge_page_refcount(page); @@ -854,6 +867,9 @@ static void __split_huge_page(struct page *page, continue; mapcount2 += __split_huge_page_map(page, vma, addr); } + if (mapcount != mapcount2) + printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); BUG_ON(mapcount != mapcount2); } diff --git a/mm/rmap.c b/mm/rmap.c index e41375a6b029..92e14dcfe737 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_lock(anon_vma); + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). + */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); anon_vma_unlock(anon_vma); } -- cgit From f66055ab6fb9731dbfce320c5202ef4441b5d77f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:54 -0800 Subject: thp: verify pmd_trans_huge isn't leaking pte_trans_huge must not leak in certain vmas like the mmio special pfn or filebacked mappings. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 840ce9d98f8b..c1a80e00458d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1451,6 +1451,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); @@ -1675,8 +1676,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); + } } return NULL; } @@ -1895,6 +1898,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, @@ -3471,6 +3475,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, goto out; pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; -- cgit From 0af4e98b6b095c74588af04872f83d333c958c32 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: madvise(MADV_HUGEPAGE) Add madvise MADV_HUGEPAGE to mark regions that are important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++++ mm/huge_memory.c | 16 ++++++++++++++++ mm/madvise.c | 8 ++++++++ 3 files changed, 30 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9301824c7491..d9ab70d776e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -97,6 +97,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif +extern int hugepage_madvise(unsigned long *vm_flags); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -113,6 +114,11 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +static inline int hugepage_madvise(unsigned long *vm_flags) +{ + BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 763507932898..620891f4e54f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -896,6 +896,22 @@ out: return ret; } +int hugepage_madvise(unsigned long *vm_flags) +{ + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + + *vm_flags |= VM_HUGEPAGE; + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..ecde40a401c1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,11 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + error = hugepage_madvise(&new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -282,6 +287,9 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: #endif return 1; -- cgit From 500d65d471018d9a13b0d51b7e141ed2a3555c1d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: pmd_trans_huge migrate bugcheck No pmd_trans_huge should ever materialize in migration ptes areas, because we split the hugepage before migration ptes are instantiated. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 5 +++++ mm/migrate.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 78adec4ba9f4..2ec5138badab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index c1a80e00458d..12ee1ea237f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1305,6 +1305,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } spin_lock(&mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { @@ -1320,6 +1324,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); /* fall through */ } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; diff --git a/mm/migrate.c b/mm/migrate.c index 690d0de993af..1a531b760b3b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -632,6 +634,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -1063,7 +1068,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) -- cgit From ec1685109f1314a30919489ef2800ed626a38c1e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:56 -0800 Subject: thp: memcg compound Teach memcg to charge/uncharge compound pages. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 83 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bb8a64d028..356d4964fe95 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,6 +1027,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -1887,12 +1891,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1917,7 +1923,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1940,7 +1946,7 @@ again: rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1981,7 +1987,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -2002,8 +2008,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -2031,9 +2037,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -2089,8 +2096,9 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) */ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) @@ -2099,7 +2107,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } @@ -2173,7 +2181,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, pc, false); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2234,13 +2242,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, + PAGE_SIZE); if (ret || !parent) goto put_back; ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, PAGE_SIZE); put_back: putback_lru_page(page); put: @@ -2261,6 +2270,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2268,11 +2281,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2281,8 +2294,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2388,13 +2399,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2410,7 +2421,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2459,11 +2470,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2490,6 +2502,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. @@ -2503,9 +2518,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2519,6 +2534,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2526,6 +2542,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) + page_size <<= compound_order(page); + /* * Check if our page_cgroup is valid */ @@ -2579,7 +2598,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2774,6 +2793,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2823,7 +2843,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2850,7 +2870,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } @@ -4461,7 +4481,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4623,6 +4644,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4789,6 +4811,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); -- cgit From 152c9ccb75548c027fa3103efa4fa4e19a345449 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:46:56 -0800 Subject: thp: transhuge-memcg: commit tail pages at charge By this patch, when a transparent hugepage is charged, not only the head page but also all the tail pages are committed, IOW pc->mem_cgroup and pc->flags of tail pages are set. Without this patch: - Tail pages are not linked to any memcg's LRU at splitting. This causes many problems, for example, the charged memcg's directory can never be rmdir'ed because it doesn't have enough pages to scan to make the usage decrease to 0. - "rss" field in memory.stat would be incorrect. Moreover, usage_in_bytes in root cgroup is calculated by the stat not by res_counter(since 2.6.32), it would be incorrect too. Signed-off-by: Daisuke Nishimura Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 356d4964fe95..a1bb59d4c9d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2094,23 +2094,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. If already USED, uncharge and return. */ - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype, - int page_size) +static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype) { - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); - return; - } - pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2135,6 +2122,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, } mem_cgroup_charge_statistics(mem, pc, true); +} + +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) +{ + int i; + int count = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ + for (i = 0; i < count; i++) + ____mem_cgroup_commit_charge(mem, pc + i, ctype); unlock_page_cgroup(pc); /* @@ -2532,6 +2546,8 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int i; + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; int page_size = PAGE_SIZE; @@ -2545,6 +2561,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageTransHuge(page)) page_size <<= compound_order(page); + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2577,7 +2594,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + for (i = 0; i < count; i++) + mem_cgroup_charge_statistics(mem, pc + i, false); ClearPageCgroupUsed(pc); /* -- cgit From b9bbfbe30ae088cc88a4b2ba7732baeebd1a0162 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:57 -0800 Subject: thp: memcg huge memory Add memcg charge/uncharge to hugepage faults in huge_memory.c. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 620891f4e54f..a313403b3c5e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -233,6 +233,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { + mem_cgroup_uncharge_page(page); put_page(page); return VM_FAULT_OOM; } @@ -243,6 +244,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); if (unlikely(!pmd_none(*pmd))) { spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); } else { @@ -286,6 +288,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + goto out; + } return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); } @@ -402,9 +408,17 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (unlikely(!pages[i])) { - while (--i >= 0) + if (unlikely(!pages[i] || + mem_cgroup_newpage_charge(pages[i], mm, + GFP_KERNEL))) { + if (pages[i]) put_page(pages[i]); + mem_cgroup_uncharge_start(); + while (--i >= 0) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; @@ -455,8 +469,12 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_PMD_NR; i++) + mem_cgroup_uncharge_start(); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mem_cgroup_uncharge_page(pages[i]); put_page(pages[i]); + } + mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -501,14 +519,22 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + put_page(new_page); + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); put_page(page); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + if (unlikely(!pmd_same(*pmd, orig_pmd))) { + mem_cgroup_uncharge_page(new_page); put_page(new_page); - else { + } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); -- cgit From 79134171df238171daa4c024a42b77b401ccb00b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: transparent hugepage vmstat Add hugepage stat information to /proc/vmstat and /proc/meminfo. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 14 +++++++++++++- include/linux/mmzone.h | 1 + mm/huge_memory.c | 3 +++ mm/rmap.c | 20 ++++++++++++++++---- mm/vmstat.c | 1 + 5 files changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97e..ed257d141568 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -100,6 +100,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" #endif , K(i.totalram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -150,6 +158,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) #endif ); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dad3612a7e39..02ecb0189b1d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a313403b3c5e..7101112a5429 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -751,6 +751,9 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/rmap.c b/mm/rmap.c index 92e14dcfe737..3825ae4bc32f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -882,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -911,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -964,7 +972,11 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_update_file_mapped(page, -1); diff --git a/mm/vmstat.c b/mm/vmstat.c index 751a65e00aac..0c3b5048773e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -880,6 +880,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", -- cgit From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (this is indipendent of the defrag logic that will have to make new hugepages available) The fundamental reason why khugepaged is unavoidable, is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmd in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow, it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time) so it seems a good tradeoff. In addition to the hugepages being released by other process releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselfs to be long lived mappings (khugepaged scan is slow so short lived mappings have low probability to run into khugepaged if compared to long lived mappings). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 66 +++ include/linux/sched.h | 1 + kernel/fork.c | 5 + mm/huge_memory.c | 1073 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 1136 insertions(+), 10 deletions(-) create mode 100644 include/linux/khugepaged.h diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d9ab70d776e2..43a694ef8904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,6 +25,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 000000000000..552f3184756c --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if (khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f23b5bb6f52e..d747f948b34e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c index f78f50ba6cb2..25e429152ddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); if (retval) goto out; @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7101112a5429..ae2bf08b1099 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,14 +12,111 @@ #include #include #include +#include +#include +#include #include #include #include "internal.h" +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ unsigned long transparent_hugepage_flags __read_mostly = - (1< 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + return ret; } static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); @@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = { static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, - .name = "transparent_hugepage", +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static int __init hugepage_init(void) { -#ifdef CONFIG_SYSFS int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; - err = sysfs_create_group(mm_kobj, &hugepage_attr_group); - if (err) - printk(KERN_ERR "hugepage: register sysfs failed\n"); + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } #endif - return 0; + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + start_khugepaged(); + +out: + return err; } module_init(hugepage_init) @@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; @@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags) return 0; } +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(!*hpage); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + + *hpage = NULL; + khugepaged_pages_collapsed++; +out: + up_write(&mm->mmap_sem); +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + up_read(&mm->mmap_sem); + collapse_huge_page(mm, address, hpage); + } +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if (!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} + +static void khugepaged_loop(void) +{ + struct page *hpage; + + while (likely(khugepaged_enabled())) { + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; + + khugepaged_do_scan(&hpage); + if (hpage) + put_page(hpage); + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_interruptible(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; -- cgit From b15d00b6af617251cc70a908df983e9aff57e169 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:59 -0800 Subject: thp: khugepaged vma merge register in khugepaged if the vma grows. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index 50a4aa0255a0..753f44d17047 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -815,6 +816,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(prev); return prev; } @@ -833,6 +835,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(area); return area; } @@ -1761,6 +1764,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1808,6 +1812,7 @@ static int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } -- cgit From 21ae5b01750f14140809508a478a4413792e0261 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: skip transhuge pages in ksm for now Skip transhuge pages in ksm for now. Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index b5b907cb0f90..5e7d5d35ea82 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -430,7 +430,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page)) { + if (PageAnon(page) && !PageTransCompound(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -1279,7 +1279,19 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + if (IS_ERR_OR_NULL(*page)) { + ksm_scan.address += PAGE_SIZE; + cond_resched(); + continue; + } + if (PageTransCompound(*page)) { + put_page(*page); + ksm_scan.address &= HPAGE_PMD_MASK; + ksm_scan.address += HPAGE_PMD_SIZE; + cond_resched(); + continue; + } + if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1293,8 +1305,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (!IS_ERR_OR_NULL(*page)) - put_page(*page); + put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } -- cgit From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: remove PG_buddy PG_buddy can be converted to _mapcount == -2. So the PG_compound_lock can be added to page->flags without overflowing (because of the sparse section bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y. This also has to move the memory hotplug code from _mapcount to lru.next to avoid any risk of clashes. We can't use lru.next for PG_buddy removal, but memory hotplug can use lru.next even more easily than the mapcount instead. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 14 ++++++++------ include/linux/memory_hotplug.h | 14 +++++++++----- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/page-flags.h | 7 +------ mm/memory_hotplug.c | 14 ++++++++------ mm/page_alloc.c | 7 +++---- mm/sparse.c | 4 ++-- 7 files changed, 52 insertions(+), 29 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index b06c674624e6..6d8e6a9e93ab 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a00c48..24376fe7ee68 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. */ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ec5138badab..7ab7d2b60041 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,6 +397,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4ca1241ef94e..0db8037e2725 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... * - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -233,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -461,7 +456,7 @@ static inline int PageTransCompound(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ __PG_COMPOUND_LOCK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2832c092509..e92f04749fcb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e7664b9f706c..9dfe49bceff4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -449,8 +449,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -483,7 +483,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -5574,7 +5574,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..93250207c5cf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); -- cgit From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:01 -0800 Subject: thp: add x86 32bit support Add support for transparent hugepages to x86 32bit. Share the same VM_ bitflag for VM_MAPPED_COPY. mm/nommu.c will never support transparent hugepages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 9 +++ arch/x86/include/asm/pgtable-3level.h | 23 +++++++ arch/x86/include/asm/pgtable.h | 117 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 109 ------------------------------- arch/x86/mm/pgtable.c | 4 +- include/linux/mm.h | 7 +- mm/Kconfig | 2 +- 7 files changed, 156 insertions(+), 115 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982b339e..98391db840c6 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ + return __pmd(xchg((pmdval_t *)xp, 0)); +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range: diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b0165ea01..94b979d1b58d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3278038e9706..001a3831567a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -97,6 +97,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -145,6 +150,18 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); @@ -219,6 +236,55 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -527,6 +593,14 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) return res; } +static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + native_pmd_clear(pmdp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -616,6 +690,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b2df039a4119..975f709e09ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -182,115 +182,6 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - -static inline int pmd_trans_huge(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_PSE; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - -#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS -extern int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty); - -#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); - - -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMD_WRITE -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - clear_bit(_PAGE_BIT_RW, (unsigned long *)&pmdp->pmd); - pmd_update(mm, addr, pmdp); -} - -static inline int pmd_young(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_ACCESSED; -} - -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - -static inline pmd_t pmd_mkold(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mkdirty(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_DIRTY); -} - -static inline pmd_t pmd_mkhuge(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_PSE); -} - -static inline pmd_t pmd_mkyoung(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_mkwrite(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT); -} - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 65e92d58f942..500242d3c96d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -362,7 +362,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - (unsigned long *) &pmdp->pmd); + (unsigned long *)pmdp); if (ret) pmd_update(vma->vm_mm, addr, pmdp); @@ -404,7 +404,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, int set; VM_BUG_ON(address & ~HPAGE_PMD_MASK); set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)&pmdp->pmd); + (unsigned long *)pmdp); if (set) { pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ab7d2b60041..9c2695beab86 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -111,9 +115,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ -#if BITS_PER_LONG > 32 -#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ -#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/mm/Kconfig b/mm/Kconfig index 3982be2d721a..d774f77538ce 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -304,7 +304,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" if EMBEDDED - depends on X86_64 && MMU + depends on X86 && MMU default y help Transparent Hugepages allows the kernel to use huge pages and -- cgit From 0ca1634d4143c3579273ca53b993df19f5c98e92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:02 -0800 Subject: thp: mincore transparent hugepage support Handle transparent huge page pmd entries natively instead of splitting them into subpages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 25 +++++++++++++++++++++++++ mm/mincore.c | 8 +++++++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 43a694ef8904..25125fb6acf7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae2bf08b1099..37e89a32a0b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -923,6 +923,31 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mincore.c b/mm/mincore.c index 9959bb41570e..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,7 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else -- cgit From c489f1257b8cacd4881a18da1e93659f934a8e98 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:02 -0800 Subject: thp: add pmd_modify Add pmd_modify() for use with mprotect() on huge pmds. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 10 ++++++++++ arch/x86/include/asm/pgtable_types.h | 1 + 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 001a3831567a..c48ba055f693 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -325,6 +325,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte(val); } +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmdval_t val = pmd_val(pmd); + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + + return __pmd(val); +} + /* mprotect needs to preserve PAT bits when updating vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a81a6bfc1437..7db7723d1f32 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -72,6 +72,7 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) -- cgit From b36f5b0710e9e3b92484de32920fddcb17278664 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:03 -0800 Subject: thp: mprotect: pass vma down to page table walkers Flushing the tlb for huge pmds requires the vma's anon_vma, so pass along the vma instead of the mm, we can always get the latter when we need it. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index bd27db6b992b..9402080d0d4a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(pte - 1, ptl); } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -88,14 +88,15 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -107,7 +108,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); } @@ -127,7 +129,8 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } -- cgit From cd7548ab360c462118568eebb8c6da3bc303b02e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: mprotect: transparent huge page support Natively handle huge pmds when changing page tables on behalf of mprotect(). I left out update_mmu_cache() because we do not need it on x86 anyway but more importantly the interface works on ptes, not pmds. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 27 +++++++++++++++++++++++++++ mm/mprotect.c | 8 +++++++- 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 25125fb6acf7..c590b08c6fa6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -22,6 +22,8 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 37e89a32a0b1..7b55fe0e998b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -948,6 +948,33 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, return ret; } +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mprotect.c b/mm/mprotect.c index 9402080d0d4a..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -88,7 +88,13 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; change_pte_range(vma->vm_mm, pmd, addr, next, newprot, -- cgit From f000565adb770b14cebbafde0a4f3e61a3342a63 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: set recommended min free kbytes If transparent hugepage is enabled initialize min_free_kbytes to an optimal value by default. This moves the hugeadm algorithm in kernel. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7b55fe0e998b..4ed97a2a115f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -85,6 +85,47 @@ struct khugepaged_scan { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; + +static int set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + extern int min_free_kbytes; + + if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) && + !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + return 0; + + for_each_populated_zone(zone) + nr_zones++; + + /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. + */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) + min_free_kbytes = recommended_min; + setup_per_zone_wmarks(); + return 0; +} +late_initcall(set_recommended_min_free_kbytes); + static int start_khugepaged(void) { int err = 0; @@ -108,6 +149,8 @@ static int start_khugepaged(void) mutex_unlock(&khugepaged_mutex); if (wakeup) wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); } else /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); @@ -177,6 +220,13 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } + if (ret > 0 && + (test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) || + test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags))) + set_recommended_min_free_kbytes(); + return ret; } static struct kobj_attribute enabled_attr = @@ -464,6 +514,8 @@ static int __init hugepage_init(void) start_khugepaged(); + set_recommended_min_free_kbytes(); + out: return err; } -- cgit From d39d33c332c611094f84cee39715866f4cbf79e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: enable direct defrag With memory compaction in, and lumpy-reclaim disabled, it seems safe enough to defrag memory during the (synchronous) transparent hugepage page faults (TRANSPARENT_HUGEPAGE_DEFRAG_FLAG) and not only during khugepaged (async) hugepage allocations that was already enabled even before memory compaction was in (TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG). Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4ed97a2a115f..0415a83afd66 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -28,6 +28,7 @@ */ unsigned long transparent_hugepage_flags __read_mostly = (1< Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: add numa awareness to hugepage allocations It's mostly a matter of replacing alloc_pages with alloc_pages_vma after introducing alloc_pages_vma. khugepaged needs special handling as the allocation has to happen inside collapse_huge_page where the vma is known and an error has to be returned to the outer loop to sleep alloc_sleep_millisecs in case of failure. But it retains the more efficient logic of handling allocation failures in khugepaged in case of CONFIG_NUMA=n. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++-- mm/huge_memory.c | 87 +++++++++++++++++++++++++++++++++++++++++++++-------- mm/mempolicy.c | 13 +++++--- 3 files changed, 87 insertions(+), 20 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d95082cc6f4a..a3b148a91874 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0415a83afd66..f6559e7711bd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT), + return alloc_pages(alloc_hugepage_gfpmask(defrag), HPAGE_PMD_ORDER); } +#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; - page = alloc_hugepage(transparent_hugepage_defrag(vma)); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); else new_page = NULL; @@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long hstart, hend; VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); +#else + VM_BUG_ON(*hpage); +#endif /* * Prevent all access to pagetables with the exception of @@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; +#ifndef CONFIG_NUMA new_page = *hpage; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) +#else + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + *hpage = ERR_PTR(-ENOMEM); goto out; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out_put_page; anon_vma_lock(vma->anon_vma); @@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out; + goto out_put_page; } /* @@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm, mm->nr_ptes--; spin_unlock(&mm->page_table_lock); +#ifndef CONFIG_NUMA *hpage = NULL; +#endif khugepaged_pages_collapsed++; out: up_write(&mm->mmap_sem); + return; + +out_put_page: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage) while (progress < pages) { cond_resched(); +#ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); if (unlikely(!*hpage)) break; } +#else + if (IS_ERR(*hpage)) + break; +#endif spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) @@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage) } } +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA static struct page *khugepaged_alloc_hugepage(void) { struct page *hpage; do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); - } + if (!hpage) + khugepaged_alloc_sleep(); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; } +#endif static void khugepaged_loop(void) { struct page *hpage; +#ifdef CONFIG_NUMA + hpage = NULL; +#endif while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); if (unlikely(!hpage)) break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA if (hpage) put_page(hpage); +#endif if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83b7df309fc4..368fc9d23610 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit From ce83d2174ea9c3d72d5821cf3ebc974e36391bf7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:06 -0800 Subject: thp: allocate memory in khugepaged outside of mmap_sem write mode This tries to be more friendly to filesystem in userland, with userland backends that allocate memory in the I/O paths and that could deadlock if khugepaged holds the mmap_sem write mode of the userland backend while allocating memory. Memory allocation may wait for writeback I/O completion from the daemon that may be blocked in the mmap_sem read mode if a page fault happens and the daemon wasn't using mlock for the memory required for the I/O submission and completion. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f6559e7711bd..bce6e12140e2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, - struct page **hpage) + struct page **hpage, + struct vm_area_struct *vma) { - struct vm_area_struct *vma; pgd_t *pgd; pud_t *pud; pmd_t *pmd, _pmd; @@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); + new_page = *hpage; #else VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } #endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); /* * Prevent all access to pagetables with the exception of @@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; -#ifndef CONFIG_NUMA - new_page = *hpage; -#else - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); - if (unlikely(!new_page)) { - *hpage = ERR_PTR(-ENOMEM); - goto out; - } -#endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) - goto out_put_page; - anon_vma_lock(vma->anon_vma); pte = pte_offset_map(pmd, address); @@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out_put_page; + goto out; } /* @@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = NULL; #endif khugepaged_pages_collapsed++; -out: +out_up_write: up_write(&mm->mmap_sem); return; -out_put_page: +out: #ifdef CONFIG_NUMA put_page(new_page); #endif - goto out; + goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - up_read(&mm->mmap_sem); - collapse_huge_page(mm, address, hpage); - } + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma); out: return ret; } -- cgit From 13ece886d99cd668483113f7238e419d5331af26 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:07 -0800 Subject: thp: transparent hugepage config choice Allow to choose between the always|madvise default for page faults and khugepaged at config time. madvise guarantees zero risk of higher memory footprint for applications (applications using madvise(MADV_HUGEPAGE) won't risk to use any more memory by backing their virtual regions with hugepages). Initially set the default to N and don't depend on EMBEDDED. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 27 +++++++++++++++++++++++++-- mm/huge_memory.c | 5 +++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index d774f77538ce..3e81687263b5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -303,9 +303,8 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. config TRANSPARENT_HUGEPAGE - bool "Transparent Hugepage Support" if EMBEDDED + bool "Transparent Hugepage Support" depends on X86 && MMU - default y help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -316,6 +315,30 @@ config TRANSPARENT_HUGEPAGE If memory constrained on embedded, you may want to say N. +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. +endchoice + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bce6e12140e2..30c3cec82023 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -27,7 +27,12 @@ * allocations. */ unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS (1< Date: Thu, 13 Jan 2011 15:47:07 -0800 Subject: thp: select CONFIG_COMPACTION if TRANSPARENT_HUGEPAGE enabled With transparent hugepage support we need compaction for the "defrag" sysfs controls to be effective. At the moment THP hangs the system if COMPACTION isn't selected, as without COMPACTION lumpy reclaim wouldn't be entirely disabled. So at the moment it's not orthogonal. When lumpy will be removed from the VM I can remove the select COMPACTION in theory, but then 99% of THP users would be still doing a mistake in disabling compaction, even if the mistake won't return in fatal runtime but just slightly degraded performance. So from a theoretical standpoing forcing the below select is not needed (the dependency isn't strict nor at compile time nor at runtime) but from a practical standpoint it is safer. If anybody really wants THP to run without compaction, it'd be such a weird setup that editing the Kconfig file to allow it will be surely not a problem. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 3e81687263b5..3ad483bdf505 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -305,6 +305,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on X86 && MMU + select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. -- cgit From bc835011afbea3957217ee716093d791fb2fe44f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: transhuge isolate_migratepages() It's not worth migrating transparent hugepages during compaction. Those hugepages don't create fragmentation. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index b0fbfdfad298..da25b5a2e476 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -295,10 +295,25 @@ static unsigned long isolate_migratepages(struct zone *zone, continue; } + if (!PageLRU(page)) + continue; + + /* + * PageLRU is set, and lru_lock excludes isolation, + * splitting and collapsing (collapsing has already + * happened if PageLRU is set). + */ + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; + VM_BUG_ON(PageTransCompound(page)); + /* Successfully isolated */ del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); -- cgit From 94fcc585fb85ad7b059c70872489b50044d401f3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: avoid breaking huge pmd invariants in case of vma_adjust failures An huge pmd can only be mapped if the corresponding 2M virtual range is fully contained in the vma. At times the VM calls split_vma twice, if the first split_vma succeeds and the second fail, the first split_vma remains in effect and it's not rolled back. For split_vma or vma_adjust to fail an allocation failure is needed so it's a very unlikely event (the out of memory killer would normally fire before any allocation failure is visible to kernel and userland and if an out of memory condition happens it's unlikely to happen exactly here). Nevertheless it's safer to ensure that no huge pmd can be left around if the vma is adjusted in a way that can't fit hugepages anymore at the new vm_start/vm_end address. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 19 ++++++++++++ mm/huge_memory.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- mm/mmap.c | 2 ++ 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c590b08c6fa6..827595228734 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,19 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #error "hugepages can't be allocated by the buddy allocator" #endif extern int hugepage_madvise(unsigned long *vm_flags); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -125,6 +138,12 @@ static inline int hugepage_madvise(unsigned long *vm_flags) BUG(); return 0; } +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 30c3cec82023..b6facc35e893 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1075,8 +1075,16 @@ pmd_t *page_check_address_pmd(struct page *page, goto out; if (pmd_page(*pmd) != page) goto out; - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)); + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); @@ -2196,3 +2204,71 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) put_page(page); BUG_ON(pmd_trans_huge(*pmd)); } + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/mmap.c b/mm/mmap.c index 753f44d17047..73cc648873d6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -589,6 +589,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma -- cgit From 4b7167b9ff9b7f3f528cbc4c7d02ebd275b9b10c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:09 -0800 Subject: thp: don't allow transparent hugepage support without PSE Archs implementing Transparent Hugepage Support must implement a function called has_transparent_hugepage to be sure the virtual or physical CPU supports Transparent Hugepages. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 5 +++++ mm/huge_memory.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c48ba055f693..18601c86fab1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -160,6 +160,11 @@ static inline int pmd_trans_huge(pmd_t pmd) { return pmd_val(pmd) & _PAGE_PSE; } + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6facc35e893..915809b16edf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -487,7 +487,15 @@ static int __init hugepage_init(void) int err; #ifdef CONFIG_SYSFS static struct kobject *hugepage_kobj; +#endif + err = -EINVAL; + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + goto out; + } + +#ifdef CONFIG_SYSFS err = -ENOMEM; hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!hugepage_kobj)) { -- cgit From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: mmu_notifier_test_young For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow paging EPT minor fault we relay on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is full backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either probably but I thought it was worth it and this makes it reliable. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/gup.c | 3 +++ include/linux/mmu_notifier.h | 26 ++++++++++++++++++++++++++ mm/huge_memory.c | 6 ++++-- mm/mmu_notifier.c | 20 ++++++++++++++++++++ virt/kvm/kvm_main.c | 17 +++++++++++++++++ 7 files changed, 105 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa75f21a9fba..ffd7f8d29187 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -822,6 +822,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47b2c3288b6b..f02b8edc3d44 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure. + */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 269aa53932e0..dbe34b931374 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); } static inline void get_huge_page_tail(struct page *page) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cbfab1e9957d..cc2e7dfea9d7 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -61,6 +61,16 @@ struct mmu_notifier_ops { struct mm_struct *mm, unsigned long address); + /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. @@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 915809b16edf..39d7df40c067 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1632,7 +1632,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON(PageLRU(page)); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (unlikely(!referenced)) @@ -1892,7 +1893,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* cannot use mapcount: can't collapse if there's a gup pin */ if (page_count(page) != 1) goto out_unmap; - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (referenced) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 85ab7db0d366..4286d4766510 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -380,6 +380,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -396,6 +412,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; -- cgit From 878aee7d6b5504e01b9caffce080e792b6b8d090 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: freeze khugepaged and ksmd It's unclear why schedule friendly kernel threads can't be taken away by the CPU through the scheduler itself. It's safer to stop them as they can trigger memory allocation, if kswapd also freezes itself to avoid generating I/O they have too. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 ++++++++++++-- mm/ksm.c | 8 ++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 39d7df40c067..45b6d53bcfbc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -2085,6 +2086,9 @@ static void khugepaged_do_scan(struct page **hpage) break; #endif + if (unlikely(kthread_should_stop() || freezing(current))) + break; + spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; @@ -2147,6 +2151,9 @@ static void khugepaged_loop(void) if (hpage) put_page(hpage); #endif + try_to_freeze(); + if (unlikely(kthread_should_stop())) + break; if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) @@ -2157,8 +2164,8 @@ static void khugepaged_loop(void) khugepaged_scan_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } else if (khugepaged_enabled()) - wait_event_interruptible(khugepaged_wait, - khugepaged_wait_event()); + wait_event_freezable(khugepaged_wait, + khugepaged_wait_event()); } } @@ -2166,6 +2173,7 @@ static int khugepaged(void *none) { struct mm_slot *mm_slot; + set_freezable(); set_user_nice(current, 19); /* serialize with start_khugepaged() */ @@ -2180,6 +2188,8 @@ static int khugepaged(void *none) mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) break; + if (unlikely(kthread_should_stop())) + break; } spin_lock(&khugepaged_mm_lock); diff --git a/mm/ksm.c b/mm/ksm.c index 5e7d5d35ea82..e2b0afd0a031 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1365,7 +1366,7 @@ static void ksm_do_scan(unsigned int scan_npages) struct rmap_item *rmap_item; struct page *uninitialized_var(page); - while (scan_npages--) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -1383,6 +1384,7 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { @@ -1391,11 +1393,13 @@ static int ksm_scan_thread(void *nothing) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); + try_to_freeze(); + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { - wait_event_interruptible(ksm_thread_wait, + wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } -- cgit From 5a03b051ed87e72b959f32a86054e1142ac4cf55 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:11 -0800 Subject: thp: use compaction in kswapd for GFP_ATOMIC order > 0 This takes advantage of memory compaction to properly generate pages of order > 0 if regular page reclaim fails and priority level becomes more severe and we don't reach the proper watermarks. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 11 ++++++++--- mm/compaction.c | 31 ++++++++++++++++++++++++++----- mm/vmscan.c | 30 ++++++++++++++++++++---------- 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 72cba4034785..dfa2ed4c0d26 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -25,7 +28,8 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync); + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -70,9 +74,10 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync) + gfp_t gfp_mask, bool sync, + int compact_mode) { - return 0; + return COMPACT_CONTINUE; } static inline void defer_compaction(struct zone *zone) diff --git a/mm/compaction.c b/mm/compaction.c index da25b5a2e476..60d52a7298d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -382,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -395,12 +397,27 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if (cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -514,8 +531,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync) + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -524,6 +542,7 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -569,7 +588,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -600,6 +620,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/vmscan.c b/mm/vmscan.c index cfdef0bcc7ab..f5b762ae23a2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2382,6 +2383,7 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2411,9 +2413,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2424,15 +2443,6 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - /* - * Compact the zone for higher orders to reduce - * latencies for higher-order allocations that - * would ordinarily call try_to_compact_pages() - */ - if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask, - false); - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit From c5a73c3d55be1faadba35b41a862e036a3b12ddb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:11 -0800 Subject: thp: use compaction for all allocation orders It makes no sense not to enable compaction for small order pages as we don't want to end up with bad order 2 allocations and good and graceful order 9 allocations. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 60d52a7298d5..6d592a021072 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -578,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * made because an assumption is made that the page allocator can satisfy * the "cheaper" orders without taking special steps */ - if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); -- cgit From 97562cd243298acf573620c764a1037bd545c9bc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:12 -0800 Subject: thp: disable transparent hugepages by default on small systems On small systems, the extra memory used by the anti-fragmentation memory reserve and simply because huge pages are smaller than large pages can easily outweigh the benefits of less TLB misses. A less obvious concern is if run on a NUMA machine with asymmetric node sizes and one of them is very small. The reserve could make the node unusable. In case of the crashdump kernel, OOMs have been observed due to the anti-fragmentation memory reserve taking up a large fraction of the crashdump image. This patch disables transparent hugepages on systems with less than 1GB of RAM, but the hugepage subsystem is fully initialized so administrators can enable THP through /sys if desired. Signed-off-by: Rik van Riel Acked-by: Avi Kiviti Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 45b6d53bcfbc..892d8a17a7e5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,6 +527,14 @@ static int __init hugepage_init(void) goto out; } + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. + */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + transparent_hugepage_flags = 0; + start_khugepaged(); set_recommended_min_free_kbytes(); -- cgit From 2c888cfbc1b45508a44763d85ba2e8ac43faff5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: fix anon memory statistics with transparent hugepages Count each transparent hugepage as HPAGE_PMD_NR pages in the LRU statistics, so the Active(anon) and Inactive(anon) statistics in /proc/meminfo are correct. Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++++++++ include/linux/mm_inline.h | 8 +++++--- mm/huge_memory.c | 10 ++++++++++ mm/memcontrol.c | 2 +- mm/vmscan.c | 11 ++++++----- 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 827595228734..9b48c24df260 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,11 +117,19 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define hpage_nr_pages(x) 1 + #define transparent_hugepage_enabled(__vma) 0 #define transparent_hugepage_flags 0UL diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 650f31eabdb1..8f7d24712dc1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test @@ -24,7 +26,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { list_add(&page->lru, head); - __inc_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -38,7 +40,7 @@ static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -73,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 892d8a17a7e5..f4f6041176a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,6 +1143,7 @@ static void __split_huge_page_refcount(struct page *page) int i; unsigned long head_index = page->index; struct zone *zone = page_zone(page); + int zonestat; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1207,6 +1208,15 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. + */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1bb59d4c9d0..f4ea3410fb4d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1091,7 +1091,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f5b762ae23a2..0882014d2ce5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1103,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1158,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1483,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); -- cgit From 9992af102974f3f8a02a1f2729c3461881539e26 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: scale nr_rotated to balance memory pressure Make sure we scale up nr_rotated when we encounter a referenced transparent huge page. This ensures pageout scanning balance is not distorted when there are huge pages on the LRU. Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0882014d2ce5..47a50962ce81 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1276,7 +1276,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1552,7 +1553,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - nr_rotated++; + nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So -- cgit From 05b258e99725112c4febeab4fad23ea2c8908a3a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:47:14 -0800 Subject: thp: transparent hugepage sysfs meminfo Add hugepage statistics to per-node sysfs meminfo Reviewed-by: Rik van Riel Signed-off-by: David Rientjes Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ce012a9c6201..36b43052001d 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -117,12 +117,21 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d WritebackTmp: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" - "Node %d SUnreclaim: %8lu kB\n", + "Node %d SUnreclaim: %8lu kB\n" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "Node %d AnonHugePages: %8lu kB\n" +#endif + , nid, K(node_page_state(nid, NR_FILE_DIRTY)), nid, K(node_page_state(nid, NR_WRITEBACK)), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), - nid, K(node_page_state(nid, NR_ANON_PAGES)), + nid, K(node_page_state(nid, NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), nid, K(node_page_state(nid, NR_SHMEM)), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, @@ -133,7 +142,13 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), - nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + , nid, + K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif + ); n += hugetlb_report_node_meminfo(nid, buf + n); return n; } -- cgit From 14d1a55cd26f1860f837f37ae42520c7c13b1347 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:15 -0800 Subject: thp: add debug checks for mapcount related invariants Add debug checks for invariants that if broken could lead to mapcount vs page_mapcount debug checks to trigger later in split_huge_page. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 12ee1ea237f5..31250faff390 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -804,6 +804,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd)) { int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1015,9 +1016,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) + if (next-addr != HPAGE_PMD_SIZE) { + VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - else if (zap_huge_pmd(tlb, vma, pmd)) { + } else if (zap_huge_pmd(tlb, vma, pmd)) { (*zap_work)--; continue; } -- cgit From 91600e9e592e48736e630851c83da2ad6bf0e91f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: fix memory-failure hugetlbfs vs THP collision hugetlbfs was changed to allow memory failure to migrate the hugetlbfs pages and that broke THP as split_huge_page was then called on hugetlbfs pages too. compound_head/order was also run unsafe on THP pages that can be splitted at any time. All compound_head() invocations in memory-failure.c that are run on pages that aren't pinned and that can be freed and reused from under us (while compound_head is running) are buggy because compound_head can return a dangling pointer, but I'm not fixing this as this is a generic memory-failure bug not specific to THP but it applies to hugetlbfs too, so I can fix it later after THP is merged upstream. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- mm/rmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a283cc9317c..1b43d0ffff65 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,7 +386,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (unlikely(split_huge_page(page))) + if (!PageHuge(page) && unlikely(split_huge_page(page))) return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); diff --git a/mm/rmap.c b/mm/rmap.c index 3825ae4bc32f..c30f33854f97 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1430,7 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); - BUG_ON(PageTransHuge(page)); + VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); -- cgit From 37c2ac7872a9387542616f658d20ac25f5bdb32e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: compound_trans_order Read compound_trans_order safe. Noop for CONFIG_TRANSPARENT_HUGEPAGE=n. Signed-off-by: Andrea Arcangeli Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ mm/memcontrol.c | 12 ++++++------ mm/memory-failure.c | 12 ++++++------ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2695beab86..ce97a2bb0b19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,6 +450,20 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ea3410fb4d..741206ffdace 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,10 +1027,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; - int page_size = PAGE_SIZE; - - if (PageTransHuge(page)) - page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2286,8 +2282,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, int ret; int page_size = PAGE_SIZE; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2558,8 +2556,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } count = page_size >> PAGE_SHIFT; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1b43d0ffff65..548fbd70f026 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -930,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -938,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -968,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1166,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1304,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ -- cgit From 1ddd6db43a08cba56c7ee920800980862086f1c3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: mm: define MADV_NOHUGEPAGE Define MADV_NOHUGEPAGE. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/mman.h | 1 + arch/mips/include/asm/mman.h | 1 + arch/parisc/include/asm/mman.h | 1 + arch/xtensa/include/asm/mman.h | 1 + include/asm-generic/mman-common.h | 1 + 5 files changed, 5 insertions(+) diff --git a/arch/alpha/include/asm/mman.h b/arch/alpha/include/asm/mman.h index f7465418a8f3..72db984f8781 100644 --- a/arch/alpha/include/asm/mman.h +++ b/arch/alpha/include/asm/mman.h @@ -54,6 +54,7 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/asm/mman.h b/arch/mips/include/asm/mman.h index 9d8184c527ff..785b4ea4ec3f 100644 --- a/arch/mips/include/asm/mman.h +++ b/arch/mips/include/asm/mman.h @@ -78,6 +78,7 @@ #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h index 533c5dc173c1..f5b7bf5fba68 100644 --- a/arch/parisc/include/asm/mman.h +++ b/arch/parisc/include/asm/mman.h @@ -60,6 +60,7 @@ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ #define MADV_HUGEPAGE 67 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/xtensa/include/asm/mman.h b/arch/xtensa/include/asm/mman.h index 41be9a1ee08f..30789010733d 100644 --- a/arch/xtensa/include/asm/mman.h +++ b/arch/xtensa/include/asm/mman.h @@ -84,6 +84,7 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index e91392fde4c6..787abbb6d867 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -46,6 +46,7 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ /* compatibility flags */ #define MAP_FILE 0 -- cgit From a664b2d8555c659127bf8fe049a58449d394a707 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: madvise(MADV_NOHUGEPAGE) Add madvise MADV_NOHUGEPAGE to mark regions that are not important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 14 ++++++++------ include/linux/khugepaged.h | 7 ++++--- include/linux/mm.h | 1 + mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++----------- mm/madvise.c | 4 +++- 5 files changed, 46 insertions(+), 21 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9b48c24df260..a8b7e42d19ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -52,10 +52,12 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_SIZE HPAGE_SIZE #define transparent_hugepage_enabled(__vma) \ - (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1< MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags); +extern int hugepage_madvise(unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -141,7 +143,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags) +static inline int hugepage_madvise(unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 552f3184756c..6b394f0b5148 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -38,9 +38,10 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline int khugepaged_enter(struct vm_area_struct *vma) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if (khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index ce97a2bb0b19..956a35532f47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -83,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4f6041176a4..fce667c0281d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1388,18 +1389,36 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags) +int hugepage_madvise(unsigned long *vm_flags, int advice) { - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) - return -EINVAL; - - *vm_flags |= VM_HUGEPAGE; + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + break; + } return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index ecde40a401c1..bbac126e03ed 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -72,7 +72,8 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; break; case MADV_HUGEPAGE: - error = hugepage_madvise(&new_flags); + case MADV_NOHUGEPAGE: + error = hugepage_madvise(&new_flags, behavior); if (error) goto out; break; @@ -290,6 +291,7 @@ madvise_behavior_valid(int behavior) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: #endif return 1; -- cgit From 60ab3244ec85c44276c585a2a20d3750402e1cf4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:18 -0800 Subject: thp: khugepaged: make khugepaged aware about madvise MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after mmap and before touching the memory. While this is enough for most usages, it's little effort to make madvise more dynamic at runtime on an existing mapping by making khugepaged aware about madvise. MADV_HUGEPAGE: register in khugepaged immediately without waiting a page fault (that may not ever happen if all pages are already mapped and the "enabled" knob was set to madvise during the initial page faults). MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged to stop collapsing pages where not needed. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Andrea Arcangeli Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++-- mm/huge_memory.c | 23 +++++++++++++++++++---- mm/madvise.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8b7e42d19ec..bddfba1d7b85 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -105,7 +105,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags, int advice); +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -143,7 +144,8 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags, int advice) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fce667c0281d..004c9c2aac78 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1389,7 +1389,8 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags, int advice) +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -1404,6 +1405,13 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; break; case MADV_NOHUGEPAGE: /* @@ -1417,6 +1425,11 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ break; } @@ -1784,7 +1797,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) goto out; /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ @@ -2007,8 +2021,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, break; } - if (!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) { + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { progress++; continue; } diff --git a/mm/madvise.c b/mm/madvise.c index bbac126e03ed..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -73,7 +73,7 @@ static long madvise_behavior(struct vm_area_struct * vma, break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: - error = hugepage_madvise(&new_flags, behavior); + error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; -- cgit From 29ad768cfc08611a4c1070d0f13f82eeea2bac7b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:19 -0800 Subject: thp: KSM on THP This makes KSM full operational with THP pages. Subpages are scanned while the hugepage is still in place and delivering max cpu performance, and only if there's a match and we're going to deduplicate memory, the single hugepages with the subpage match is split. There will be no false sharing between ksmd and khugepaged. khugepaged won't collapse 2m virtual regions with KSM pages inside. ksmd also should only split pages when the checksum matches and we're likely to split an hugepage for some long living ksm page (usual ksm heuristic to avoid sharing pages that get de-cowed). Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index e2b0afd0a031..4d5a681923bb 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -412,6 +412,29 @@ out: up_read(&mm->mmap_sem); } +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head; + head = compound_head(page); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail + * before overwriting first_page, so if + * PageTail is still there it means the head + * pointer isn't dangling. + */ + if (head != page) { + smp_rmb(); + if (!PageTransCompound(page)) + return NULL; + } + if (PageAnon(head)) + return head; + } + return NULL; +} + static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -431,7 +454,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page) && !PageTransCompound(page)) { + if (PageAnon(page) || page_trans_compound_anon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -709,6 +732,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; + BUG_ON(PageTransCompound(page)); ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) goto out; @@ -784,6 +808,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out; pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); if (!pmd_present(*pmd)) goto out; @@ -811,6 +836,33 @@ out: return err; } +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -831,6 +883,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_MERGEABLE)) goto out; + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1285,14 +1340,8 @@ next_mm: cond_resched(); continue; } - if (PageTransCompound(*page)) { - put_page(*page); - ksm_scan.address &= HPAGE_PMD_MASK; - ksm_scan.address += HPAGE_PMD_SIZE; - cond_resched(); - continue; - } - if (PageAnon(*page)) { + if (PageAnon(*page) || + page_trans_compound_anon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, -- cgit From 22e5c47ee238abe636655c3862ed28d6eb084ad4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:20 -0800 Subject: thp: add compound_trans_head() helper Cleanup some code with common compound_trans_head helper. Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Johannes Weiner Cc: Marcelo Tosatti Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 18 ++++++++++++++++++ mm/ksm.c | 15 +++------------ virt/kvm/kvm_main.c | 38 ++++++++++++++------------------------ 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bddfba1d7b85..8e6c8c42bc3c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,23 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -144,6 +161,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +#define compound_trans_head(page) compound_head(page) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/mm/ksm.c b/mm/ksm.c index 4d5a681923bb..33781de0b6bf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -415,20 +415,11 @@ out: static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head; - head = compound_head(page); + struct page *head = compound_trans_head(page); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * head may actually be splitted and freed from under + * us but it's ok here. */ - if (head != page) { - smp_rmb(); - if (!PageTransCompound(page)) - return NULL; - } if (PageAnon(head)) return head; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4286d4766510..f29abeb6a912 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,34 +104,24 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *head; + int reserved; struct page *tail = pfn_to_page(pfn); - head = compound_head(tail); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); if (head != tail) { - smp_rmb(); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. */ - if (PageTail(tail)) { - /* - * the "head" is not a dangling - * pointer but the hugepage may have - * been splitted from under us (and we - * may not hold a reference count on - * the head page so it can be reused - * before we run PageReferenced), so - * we've to recheck PageTail before - * returning what we just read. - */ - int reserved = PageReserved(head); - smp_rmb(); - if (PageTail(tail)) - return reserved; - } + smp_rmb(); + if (PageTail(tail)) + return reserved; } return PageReserved(tail); } -- cgit From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:47:21 -0800 Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for anonymous pages, as introduced by git commit 989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page migraton"). The point of the RCU protection there is part of getting a stable reference to anon_vma and is only held for anon pages as file pages are locked which is sufficient protection against freeing. However, while a file page's mapping is being migrated, the radix tree is double checked to ensure it is the expected page. This uses radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held triggering the following warning. [ 173.674290] =================================================== [ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ] [ 173.676016] --------------------------------------------------- [ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection! [ 173.676016] [ 173.676016] other info that might help us debug this: [ 173.676016] [ 173.676016] [ 173.676016] rcu_scheduler_active = 1, debug_locks = 0 [ 173.676016] 1 lock held by hugeadm/2899: [ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [] migrate_page_move_mapping+0x40/0x1ab [ 173.676016] [ 173.676016] stack backtrace: [ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild [ 173.676016] Call Trace: [ 173.676016] [] ? printk+0x14/0x1b [ 173.676016] [] lockdep_rcu_dereference+0x7d/0x86 [ 173.676016] [] migrate_page_move_mapping+0xca/0x1ab [ 173.676016] [] migrate_page+0x23/0x39 [ 173.676016] [] buffer_migrate_page+0x22/0x107 [ 173.676016] [] ? buffer_migrate_page+0x0/0x107 [ 173.676016] [] move_to_new_page+0x9a/0x1ae [ 173.676016] [] migrate_pages+0x1e7/0x2fa This patch introduces radix_tree_deref_slot_protected() which calls rcu_dereference_protected(). Users of it must pass in the mapping->tree_lock that is protecting this dereference. Holding the tree lock protects against parallel updaters of the radix tree meaning that rcu_dereference_protected is allowable. [akpm@linux-foundation.org: remove unneeded casts] Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Milton Miller Cc: Nick Piggin Cc: Wu Fengguang Cc: [2.6.37.early] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ mm/migrate.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5c4884..23241c2fecce 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -145,6 +145,22 @@ static inline void *radix_tree_deref_slot(void **pslot) return rcu_dereference(*pslot); } +/** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot diff --git a/mm/migrate.c b/mm/migrate.c index 1a531b760b3b..89a6bc8cd307 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -320,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } -- cgit From 32d6feadf4e17ea9b98071be9bbf402a74a4f818 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 13 Jan 2011 15:47:22 -0800 Subject: mm/hugetlb.c: fix error-path memory leak in nr_hugepages_store_common() The NODEMASK_ALLOC macro may dynamically allocate memory for its second argument ('nodes_allowed' in this context). In nr_hugepages_store_common() we may abort early if strict_strtoul() fails, but in that case we do not free the memory already allocated to 'nodes_allowed', causing a memory leak. This patch closes the leak by freeing the memory in the error path. [akpm@linux-foundation.org: use NODEMASK_FREE, per Minchan Kim] Signed-off-by: Jesper Juhl Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7bf223d6677b..8e31cda6fc22 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1374,8 +1374,10 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); err = strict_strtoul(buf, 10, &count); - if (err) + if (err) { + NODEMASK_FREE(nodes_allowed); return 0; + } h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) { -- cgit From 5520e89485252c759ee60d313e9422447659947b Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 13 Jan 2011 15:47:23 -0800 Subject: brk: fix min_brk lower bound computation for COMPAT_BRK Even if CONFIG_COMPAT_BRK is set in the kernel configuration, it can still be overriden by randomize_va_space sysctl. If this is the case, the min_brk computation in sys_brk() implementation is wrong, as it solely takes into account COMPAT_BRK setting, assuming that brk start is not randomized. But that might not be the case if randomize_va_space sysctl has been set to '2' at the time the binary has been loaded from disk. In such case, the check has to be done in a same way as in !CONFIG_COMPAT_BRK case. In addition to that, the check for the COMPAT_BRK case introduced back in a5b4592c ("brk: make sys_brk() honor COMPAT_BRK when computing lower bound") is slightly wrong -- the lower bound shouldn't be mm->end_code, but mm->end_data instead, as that's where the legacy applications expect brk section to start (i.e. immediately after last global variable). [akpm@linux-foundation.org: fix comment] Signed-off-by: Jiri Kosina Cc: Geert Uytterhoeven Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 73cc648873d6..2ec8eb5a9cdd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -254,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) down_write(&mm->mmap_sem); #ifdef CONFIG_COMPAT_BRK - min_brk = mm->end_code; + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif -- cgit From 43506fad21ca3d8dc59e768ff458f7c5e5c01086 Mon Sep 17 00:00:00 2001 From: KyongHo Cho Date: Thu, 13 Jan 2011 15:47:24 -0800 Subject: mm/page_alloc.c: simplify calculation of combined index of adjacent buddy lists The previous approach of calucation of combined index was page_idx & ~(1 << order)) but we have same result with page_idx & buddy_idx This reduces instructions slightly as well as enhances readability. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix used-unintialised warning] Signed-off-by: KyongHo Cho Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dfe49bceff4..bda1db301d44 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -427,18 +427,10 @@ static inline void rmv_page_order(struct page *page) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ - unsigned long buddy_idx = page_idx ^ (1 << order); - - return page + (buddy_idx - page_idx); -} - static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order) { - return (page_idx & ~(1 << order)); + return page_idx ^ (1 << order); } /* @@ -500,6 +492,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); struct page *buddy; if (unlikely(PageCompound(page))) @@ -514,7 +507,8 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - buddy = __page_find_buddy(page, page_idx, order); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; @@ -522,7 +516,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); - combined_idx = __find_combined_index(page_idx, order); + combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -539,9 +533,10 @@ static inline void __free_one_page(struct page *page, */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = __find_combined_index(page_idx, order); - higher_page = page + combined_idx - page_idx; - higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); -- cgit From 84bc227d7fde049a568cd58a5610613feedc0dff Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Thu, 13 Jan 2011 15:47:24 -0800 Subject: mm/dmapool.c: take lock only once in dma_pool_free() dma_pool_free() scans for the page to free in the pool list holding the pool lock. Then it releases the lock basically to acquire it immediately again. Modify the code to only take the lock once. This will do some additional loops and computations with the lock held in if memory debugging is activated. If it is not activated the only new operations with this lock is one if and one substraction. Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 4df2de77e069..a2f6295b4df4 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) { - unsigned long flags; struct dma_page *page; - spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; if (dma < (page->dma + pool->allocation)) - goto done; + return page; } - page = NULL; - done: - spin_unlock_irqrestore(&pool->lock, flags); - return page; + return NULL; } /** @@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) unsigned long flags; unsigned int offset; + spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n", @@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) offset = vaddr - page->vaddr; #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n", @@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) chain = *(int *)(page->vaddr + chain); continue; } + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, dma %Lx " "already free\n", pool->name, @@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) memset(vaddr, POOL_POISON_FREED, pool->size); #endif - spin_lock_irqsave(&pool->lock, flags); page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; -- cgit From 684265d4a30f133162f06ddb2e5010608e60e4bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 13 Jan 2011 15:47:25 -0800 Subject: mm/dmapool.c: use TASK_UNINTERRUPTIBLE in dma_pool_alloc() As it stands this code will degenerate into a busy-wait if the calling task has signal_pending(). Cc: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index a2f6295b4df4..03bf3bb4519a 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (mem_flags & __GFP_WAIT) { DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); __add_wait_queue(&pool->waitq, &wait); spin_unlock_irqrestore(&pool->lock, flags); -- cgit From cb9ef8d5e394f70db64bda79c20d3569a20d2574 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 13 Jan 2011 15:47:26 -0800 Subject: fs/fs-writeback.c: fix sync_inodes_sb() return value kernel-doc The sync_inodes_sb() function does not have a return value. Remove the outdated documentation comment. Signed-off-by: Stefan Hajnoczi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05aab263e9aa..59c6e4956786 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1225,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this - * super_block. The number of pages synced is returned. + * super_block. */ void sync_inodes_sb(struct super_block *sb) { -- cgit From 08d4a24659f1284f33e574211435aa12ce968477 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 13 Jan 2011 15:47:26 -0800 Subject: hugetlb: check the return value of string conversion in sysctl handler proc_doulongvec_minmax may fail if the given buffer doesn't represent a valid number. If we provide something invalid we will initialize the resulting value (nr_overcommit_huge_pages in this case) to a random value from the stack. The issue was introduced by a3d0c6aa when the default handler has been replaced by the helper function where we do not check the return value. Reproducer: echo "" > /proc/sys/vm/nr_overcommit_hugepages [akpm@linux-foundation.org: correctly propagate proc_doulongvec_minmax return code] Signed-off-by: Michal Hocko Cc: CAI Qian Cc: Nishanth Aravamudan Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8e31cda6fc22..363c4d22602a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1859,13 +1859,16 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->max_huge_pages; table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { NODEMASK_ALLOC(nodemask_t, nodes_allowed, @@ -1880,8 +1883,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (nodes_allowed != &node_states[N_HIGH_MEMORY]) NODEMASK_FREE(nodes_allowed); } - - return 0; +out: + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -1919,21 +1922,24 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->nr_overcommit_huge_pages; table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock(&hugetlb_lock); } - - return 0; +out: + return ret; } #endif /* CONFIG_SYSCTL */ -- cgit From adbe8726dc2a3805630d517270db17e3af86e526 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 13 Jan 2011 15:47:27 -0800 Subject: hugetlb: do not allow pagesize >= MAX_ORDER pool adjustment Huge pages with order >= MAX_ORDER must be allocated at boot via the kernel command line, they cannot be allocated or freed once the kernel is up and running. Currently we allow values to be written to the sysfs and sysctl files controling pool size for these huge page sizes. This patch makes the store functions for nr_hugepages and nr_overcommit_hugepages return -EINVAL when the pool for a page size >= MAX_ORDER is changed. [akpm@linux-foundation.org: avoid multiple return paths in nr_hugepages_store_common()] [caiqian@redhat.com: add checking in hugetlb_overcommit_handler()] Signed-off-by: Eric B Munson Reported-by: CAI Qian Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 363c4d22602a..ce8e5bb6f031 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1363,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } + static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) @@ -1375,11 +1376,16 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, err = strict_strtoul(buf, 10, &count); if (err) { - NODEMASK_FREE(nodes_allowed); - return 0; + err = 0; /* This seems wrong */ + goto out; } h = kobj_to_hstate(kobj, &nid); + if (h->order >= MAX_ORDER) { + err = -EINVAL; + goto out; + } + if (nid == NUMA_NO_NODE) { /* * global hstate attribute @@ -1405,6 +1411,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_FREE(nodes_allowed); return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; } static ssize_t nr_hugepages_show(struct kobject *kobj, @@ -1447,6 +1456,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } + static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -1454,6 +1464,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); + if (h->order >= MAX_ORDER) + return -EINVAL; + err = strict_strtoul(buf, 10, &input); if (err) return 0; @@ -1864,6 +1877,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!write) tmp = h->max_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); @@ -1927,6 +1943,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, if (!write) tmp = h->nr_overcommit_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); -- cgit From 73ae31e5986a4c0ee84bfd13ccd9b57a98956f6f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 13 Jan 2011 15:47:28 -0800 Subject: hugetlb: fix handling of parse errors in sysfs When parsing changes to the huge page pool sizes made from userspace via the sysfs interface, bogus input values are being covered up by nr_hugepages_store_common and nr_overcommit_hugepages_store returning 0 when strict_strtoul returns an error. This can cause an infinite loop in the nr_hugepages_store code. This patch changes the return value for these functions to -EINVAL when strict_strtoul returns an error. Signed-off-by: Eric B Munson Reported-by: CAI Qian Cc: Andrea Arcangeli Cc: Eric B Munson Cc: Michal Hocko Cc: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ce8e5bb6f031..bb0b7c128015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1375,10 +1375,8 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); err = strict_strtoul(buf, 10, &count); - if (err) { - err = 0; /* This seems wrong */ + if (err) goto out; - } h = kobj_to_hstate(kobj, &nid); if (h->order >= MAX_ORDER) { @@ -1469,7 +1467,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, err = strict_strtoul(buf, 10, &input); if (err) - return 0; + return err; spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = input; -- cgit From 2919bfd0758257c469abef8c26c3e516bbebb851 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:47:29 -0800 Subject: ksm: drain pagevecs to lru It was hard to explain the page counts which were causing new LTP tests of KSM to fail: we need to drain the per-cpu pagevecs to LRU occasionally. Signed-off-by: Hugh Dickins Reported-by: CAI Qian Cc:Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index 33781de0b6bf..c2b2a94f9d67 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1296,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); -- cgit From 1ce82b69e96c838d007f316b8347b911fdfa9842 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:47:30 -0800 Subject: mm: fix migration hangs on anon_vma lock Increased usage of page migration in mmotm reveals that the anon_vma locking in unmap_and_move() has been deficient since 2.6.36 (or even earlier). Review at the time of f18194275c39835cb84563500995e0d503a32d9a ("mm: fix hang on anon_vma->root->lock") missed the issue here: the anon_vma to which we get a reference may already have been freed back to its slab (it is in use when we check page_mapped, but that can change), and so its anon_vma->root may be switched at any moment by reuse in anon_vma_prepare. Perhaps we could fix that with a get_anon_vma_unless_zero(), but let's not: just rely on page_lock_anon_vma() to do all the hard thinking for us, then we don't need any rcu read locking over here. In removing the rcu_unlock label: since PageAnon is a bit in page->mapping, it's impossible for a !page->mapping page to be anon; but insert VM_BUG_ON in case the implementation ever changes. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hugh Dickins Reviewed-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Naoya Horiguchi Cc: "Jun'ichi Nomura" Cc: Andi Kleen Cc: [2.6.37, 2.6.36] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 89a6bc8cd307..a20cf12edede 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -622,7 +622,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int *result = NULL; struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; - int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; struct anon_vma *anon_vma = NULL; @@ -694,20 +693,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. - * This rcu_read_lock() delays freeing anon_vma pointer until the end + * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - - /* Determine how to safely use anon_vma */ - if (!page_mapped(page)) { - if (!PageSwapCache(page)) - goto rcu_unlock; - + /* + * Only page_lock_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + */ + anon_vma = page_lock_anon_vma(page); + if (anon_vma) { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped * swapcache page is safe to use because we don't @@ -722,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, */ remap_swapcache = 0; } else { - /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later - */ - anon_vma = page_anon_vma(page); - get_anon_vma(anon_vma); + goto uncharge; } } @@ -745,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && page_has_private(page)) { - /* - * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. - */ + VM_BUG_ON(PageAnon(page)); + if (page_has_private(page)) { try_to_free_buffers(page); - goto rcu_unlock; + goto uncharge; } goto skip_unmap; } @@ -768,14 +761,11 @@ skip_unmap: if (rc && remap_swapcache) remove_migration_ptes(page, page); -rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) drop_anon_vma(anon_vma); - if (rcu_locked) - rcu_read_unlock(); uncharge: if (!charge) mem_cgroup_end_migration(mem, page, newpage); -- cgit From fd4a4663db293bfd5dc20fb4113977f62895e550 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:47:31 -0800 Subject: mm: fix hugepage migration 2.6.37 added an unmap_and_move_huge_page() for memory failure recovery, but its anon_vma handling was still based around the 2.6.35 conventions. Update it to use page_lock_anon_vma, get_anon_vma, page_unlock_anon_vma, drop_anon_vma in the same way as we're now changing unmap_and_move(). I don't particularly like to propose this for stable when I've not seen its problems in practice nor tested the solution: but it's clearly out of synch at present. Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: "Jun'ichi Nomura" Cc: Andi Kleen Cc: [2.6.37, 2.6.36] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index a20cf12edede..5b7d1fd29621 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -827,7 +827,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, int rc = 0; int *result = NULL; struct page *new_hpage = get_new_page(hpage, private, &result); - int rcu_locked = 0; struct anon_vma *anon_vma = NULL; if (!new_hpage) @@ -842,12 +841,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (PageAnon(hpage)) { - rcu_read_lock(); - rcu_locked = 1; - - if (page_mapped(hpage)) { - anon_vma = page_anon_vma(hpage); - atomic_inc(&anon_vma->external_refcount); + anon_vma = page_lock_anon_vma(hpage); + if (anon_vma) { + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); } } @@ -859,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc) remove_migration_ptes(hpage, hpage); - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, - &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } - - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + drop_anon_vma(anon_vma); out: unlock_page(hpage); -- cgit From c06b1fca18c3ad868bfcaca230146e3038583422 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 13 Jan 2011 15:47:32 -0800 Subject: mm/page_alloc.c: don't cache `current' in a local It's old-fashioned and unneeded. akpm:/usr/src/25> size mm/page_alloc.o text data bss dec hex filename 39884 1241317 18808 1300009 13d629 mm/page_alloc.o (before) 39838 1241317 18808 1299963 13d5fb mm/page_alloc.o (after) Acked-by: David Rientjes Acked-by: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bda1db301d44..90c1439549fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,15 +1809,14 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool sync_migration) { struct page *page; - struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; - tsk->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); - tsk->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -1869,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, { struct page *page = NULL; struct reclaim_state reclaim_state; - struct task_struct *p = current; bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - p->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + current->reclaim_state = &reclaim_state; *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); - p->reclaim_state = NULL; + current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1950,7 +1948,6 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { - struct task_struct *p = current; int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1977,12 +1974,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (!in_interrupt() && - ((p->flags & PF_MEMALLOC) || + ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -2001,7 +1998,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - struct task_struct *p = current; bool sync_migration = false; /* @@ -2060,7 +2056,7 @@ rebalance: goto nopage; /* Avoid recursion of direct reclaim */ - if (p->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ @@ -2153,7 +2149,7 @@ nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure." " order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); + current->comm, order, gfp_mask); dump_stack(); show_mem(); } -- cgit From d8505dee1a87b8d41b9c4ee1325cd72258226fbc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Jan 2011 15:47:33 -0800 Subject: mm: simplify code of swap.c Clean up code and remove duplicate code. Next patch will use pagevec_lru_move_fn introduced here too. Signed-off-by: Shaohua Li Cc: Andi Kleen Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 101 +++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index c02f93611a84..ab498ea04ae3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -178,15 +178,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -194,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); - release_pages(pvec->pages, pvec->nr, pvec->cold); + spin_unlock_irqrestore(&zone->lru_lock, flags); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -516,44 +534,33 @@ void lru_add_page_tail(struct zone* zone, } } +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); -- cgit From 744ed1442757767ffede5008bb13e0805085902e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Jan 2011 15:47:34 -0800 Subject: mm: batch activate_page() to reduce lock contention The zone->lru_lock is heavily contented in workload where activate_page() is frequently used. We could do batch activate_page() to reduce the lock contention. The batched pages will be added into zone list when the pool is full or page reclaim is trying to drain them. For example, in a 4 socket 64 CPU system, create a sparse file and 64 processes, processes shared map to the file. Each process read access the whole file and then exit. The process exit will do unmap_vmas() and cause a lot of activate_page() call. In such workload, we saw about 58% total time reduction with below patch. Other workloads with a lot of activate_page also benefits a lot too. I tested some microbenchmarks: case-anon-cow-rand-mt 0.58% case-anon-cow-rand -3.30% case-anon-cow-seq-mt -0.51% case-anon-cow-seq -5.68% case-anon-r-rand-mt 0.23% case-anon-r-rand 0.81% case-anon-r-seq-mt -0.71% case-anon-r-seq -1.99% case-anon-rx-rand-mt 2.11% case-anon-rx-seq-mt 3.46% case-anon-w-rand-mt -0.03% case-anon-w-rand -0.50% case-anon-w-seq-mt -1.08% case-anon-w-seq -0.12% case-anon-wx-rand-mt -5.02% case-anon-wx-seq-mt -1.43% case-fork 1.65% case-fork-sleep -0.07% case-fork-withmem 1.39% case-hugetlb -0.59% case-lru-file-mmap-read-mt -0.54% case-lru-file-mmap-read 0.61% case-lru-file-mmap-read-rand -2.24% case-lru-file-readonce -0.64% case-lru-file-readtwice -11.69% case-lru-memcg -1.35% case-mmap-pread-rand-mt 1.88% case-mmap-pread-rand -15.26% case-mmap-pread-seq-mt 0.89% case-mmap-pread-seq -69.72% case-mmap-xread-rand-mt 0.71% case-mmap-xread-seq-mt 0.38% The most significent are: case-lru-file-readtwice -11.69% case-mmap-pread-rand -15.26% case-mmap-pread-seq -69.72% which use activate_page a lot. others are basically variations because each run has slightly difference. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Andi Kleen Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 9 ++++++ mm/swap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- mm/vmscan.c | 6 ++-- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 69488205723d..4c98630f0f77 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) extern unsigned long highest_memmap_pfn; +#ifdef CONFIG_SMP +extern int putback_active_lru_page(struct zone *zone, struct page *page); +#else +static inline int putback_active_lru_page(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + /* * in mm/vmscan.c: */ diff --git a/mm/swap.c b/mm/swap.c index ab498ea04ae3..bbc1ce9f9460 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -271,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, } /* - * FIXME: speed this up? + * A page will go to active list either by activate_page or putback_lru_page. + * In the activate_page case, the page hasn't active bit set. The page might + * not in LRU list because it's isolated before it gets a chance to be moved to + * active list. The window is small because pagevec just stores several pages. + * For such case, we do nothing for such page. + * In the putback_lru_page case, the page isn't in lru list but has active + * bit set */ -void activate_page(struct page *page) +static void __activate_page(struct page *page, void *arg) { struct zone *zone = page_zone(page); + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + bool putback = !PageLRU(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + /* The page is isolated before it's moved to active list */ + if (!PageLRU(page) && !PageActive(page)) + return; + if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) + return; + + if (!putback) del_page_from_lru_list(zone, page, lru); + else + SetPageLRU(page); - SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGACTIVATE); + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); + + if (putback) + return; + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(zone, page, file, 1); +} + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + +static void activate_page_drain(int cpu) +{ + struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); +} + +void activate_page(struct page *page) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_cpu_var(activate_page_pvecs); + } +} - update_page_reclaim_stat(zone, page, file, 1); +/* Caller should hold zone->lru_lock */ +int putback_active_lru_page(struct zone *zone, struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + if (!pagevec_add(pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + pagevec_lru_move_fn(pvec, __activate_page, NULL); + spin_lock_irq(&zone->lru_lock); } + put_cpu_var(activate_page_pvecs); + return 1; +} + +#else +static inline void activate_page_drain(int cpu) +{ +} + +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) + __activate_page(page, NULL); spin_unlock_irq(&zone->lru_lock); } +#endif /* * Mark a page as having seen activity. @@ -390,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + activate_page_drain(cpu); } void lru_add_drain(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 47a50962ce81..99999a9b2b0b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1271,14 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, spin_lock_irq(&zone->lru_lock); continue; } - SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; + if (putback_active_lru_page(zone, page)) + continue; } + SetPageLRU(page); + add_page_to_lru_list(zone, page, lru); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); -- cgit From db16d5ec1f87f17511599bc77857dd1662b5a22f Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:35 -0800 Subject: memcg: add page_cgroup flags for dirty page tracking This patchset provides the ability for each cgroup to have independent dirty page limits. Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) page cache used by a cgroup. So, in case of multiple cgroup writers, they will not be able to consume more than their designated share of dirty pages and will be forced to perform write-out if they cross that limit. The patches are based on a series proposed by Andrea Righi in Mar 2010. Overview: - Add page_cgroup flags to record when pages are dirty, in writeback, or nfs unstable. - Extend mem_cgroup to record the total number of pages in each of the interesting dirty states (dirty, writeback, unstable_nfs). - Add dirty parameters similar to the system-wide /proc/sys/vm/dirty_* limits to mem_cgroup. The mem_cgroup dirty parameters are accessible via cgroupfs control files. - Consider both system and per-memcg dirty limits in page writeback when deciding to queue background writeback or block for foreground writeback. Known shortcomings: - When a cgroup dirty limit is exceeded, then bdi writeback is employed to writeback dirty inodes. Bdi writeback considers inodes from any cgroup, not just inodes contributing dirty pages to the cgroup exceeding its limit. - When memory.use_hierarchy is set, then dirty limits are disabled. This is a implementation detail. An enhanced implementation is needed to check the chain of parents to ensure that no dirty limit is exceeded. Performance data: - A page fault microbenchmark workload was used to measure performance, which can be called in read or write mode: f = open(foo. $cpu) truncate(f, 4096) alarm(60) while (1) { p = mmap(f, 4096) if (write) *p = 1 else x = *p munmap(p) } - The workload was called for several points in the patch series in different modes: - s_read is a single threaded reader - s_write is a single threaded writer - p_read is a 16 thread reader, each operating on a different file - p_write is a 16 thread writer, each operating on a different file - Measurements were collected on a 16 core non-numa system using "perf stat --repeat 3". The -a option was used for parallel (p_*) runs. - All numbers are page fault rate (M/sec). Higher is better. - To compare the performance of a kernel without non-memcg compare the first and last rows, neither has memcg configured. The first row does not include any of these memcg patches. - To compare the performance of using memcg dirty limits, compare the baseline (2nd row titled "w/ memcg") with the the code and memcg enabled (2nd to last row titled "all patches"). root_cgroup child_cgroup s_read s_write p_read p_write s_read s_write p_read p_write mmotm w/o memcg 0.428 0.390 0.429 0.388 mmotm w/ memcg 0.411 0.378 0.391 0.362 0.412 0.377 0.385 0.363 all patches 0.384 0.360 0.370 0.348 0.381 0.363 0.368 0.347 all patches 0.431 0.402 0.427 0.395 w/o memcg This patch: Add additional flags to page_cgroup to track dirty pages within a mem_cgroup. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrea Righi Signed-off-by: Greg Thelen Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b02195dfc1b0..fdb5a92b5ac7 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -40,6 +40,9 @@ enum { PCG_USED, /* this object is in use. */ PCG_ACCT_LRU, /* page has been accounted for */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_FILE_DIRTY, /* page is dirty */ + PCG_FILE_WRITEBACK, /* page is under writeback */ + PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ PCG_MIGRATION, /* under page migration */ }; @@ -59,6 +62,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } +#define TESTSETPCGFLAG(uname, lname) \ +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ + { return test_and_set_bit(PCG_##lname, &pc->flags); } + /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -78,6 +85,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(FileDirty, FILE_DIRTY) +CLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTPCGFLAG(FileDirty, FILE_DIRTY) +TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTSETPCGFLAG(FileDirty, FILE_DIRTY) + +SETPCGFLAG(FileWriteback, FILE_WRITEBACK) +CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) +TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) + +SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) + SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit From ece72400c2a27a3d726cb0854449f991d9fcd2da Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:36 -0800 Subject: memcg: document cgroup dirty memory interfaces Document cgroup dirty memory interfaces and statistics. [akpm@linux-foundation.org: fix use_hierarchy description] Signed-off-by: Andrea Righi Signed-off-by: Greg Thelen Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 74 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 7781857dc940..bac328c232f5 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -385,6 +385,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem) pgpgin - # of pages paged in (equivalent to # of charging events). pgpgout - # of pages paged out (equivalent to # of uncharging events). swap - # of bytes of swap usage +dirty - # of bytes that are waiting to get written back to the disk. +writeback - # of bytes that are actively being written back to the disk. +nfs_unstable - # of bytes sent to the NFS server, but not yet committed to + the actual storage. inactive_anon - # of bytes of anonymous memory and swap cache memory on LRU list. active_anon - # of bytes of anonymous and swap cache memory on active @@ -406,6 +410,9 @@ total_mapped_file - sum of all children's "cache" total_pgpgin - sum of all children's "pgpgin" total_pgpgout - sum of all children's "pgpgout" total_swap - sum of all children's "swap" +total_dirty - sum of all children's "dirty" +total_writeback - sum of all children's "writeback" +total_nfs_unstable - sum of all children's "nfs_unstable" total_inactive_anon - sum of all children's "inactive_anon" total_active_anon - sum of all children's "active_anon" total_inactive_file - sum of all children's "inactive_file" @@ -453,6 +460,73 @@ memory under it will be reclaimed. You can reset failcnt by writing 0 to failcnt file. # echo 0 > .../memory.failcnt +5.5 dirty memory + +Control the maximum amount of dirty pages a cgroup can have at any given time. + +Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) +page cache used by a cgroup. So, in case of multiple cgroup writers, they will +not be able to consume more than their designated share of dirty pages and will +be forced to perform write-out if they cross that limit. + +The interface is equivalent to the procfs interface: /proc/sys/vm/dirty_*. It +is possible to configure a limit to trigger both a direct writeback or a +background writeback performed by per-bdi flusher threads. The root cgroup +memory.dirty_* control files are read-only and match the contents of +the /proc/sys/vm/dirty_* files. + +Per-cgroup dirty limits can be set using the following files in the cgroupfs: + +- memory.dirty_ratio: the amount of dirty memory (expressed as a percentage of + cgroup memory) at which a process generating dirty pages will itself start + writing out dirty data. + +- memory.dirty_limit_in_bytes: the amount of dirty memory (expressed in bytes) + in the cgroup at which a process generating dirty pages will start itself + writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to indicate + that value is kilo, mega or gigabytes. + + Note: memory.dirty_limit_in_bytes is the counterpart of memory.dirty_ratio. + Only one of them may be specified at a time. When one is written it is + immediately taken into account to evaluate the dirty memory limits and the + other appears as 0 when read. + +- memory.dirty_background_ratio: the amount of dirty memory of the cgroup + (expressed as a percentage of cgroup memory) at which background writeback + kernel threads will start writing out dirty data. + +- memory.dirty_background_limit_in_bytes: the amount of dirty memory (expressed + in bytes) in the cgroup at which background writeback kernel threads will + start writing out dirty data. Suffix (k, K, m, M, g, or G) can be used to + indicate that value is kilo, mega or gigabytes. + + Note: memory.dirty_background_limit_in_bytes is the counterpart of + memory.dirty_background_ratio. Only one of them may be specified at a time. + When one is written it is immediately taken into account to evaluate the dirty + memory limits and the other appears as 0 when read. + +A cgroup may contain more dirty memory than its dirty limit. This is possible +because of the principle that the first cgroup to touch a page is charged for +it. Subsequent page counting events (dirty, writeback, nfs_unstable) are also +counted to the originally charged cgroup. + +Example: If page is allocated by a cgroup A task, then the page is charged to +cgroup A. If the page is later dirtied by a task in cgroup B, then the cgroup A +dirty count will be incremented. If cgroup A is over its dirty limit but cgroup +B is not, then dirtying a cgroup A page from a cgroup B task may push cgroup A +over its dirty limit without throttling the dirtying cgroup B task. + +When use_hierarchy=0, each cgroup has dirty memory usage and limits. +System-wide dirty limits are also consulted. Dirty memory consumption is +checked against both system-wide and per-cgroup dirty limits. + +The current implementation does not enforce per-cgroup dirty limits when +use_hierarchy=1. System-wide dirty limits are used for processes in such +cgroups. Attempts to read memory.dirty_* files return the system-wide +values. Writes to the memory.dirty_* files return error. An enhanced +implementation is needed to check the chain of parents to ensure that no +dirty limit is exceeded. + 6. Hierarchy support The memory controller supports a deep hierarchy and hierarchical accounting. -- cgit From 2a7106f2cb0768d00fe8c1eb42a754a7d8518f08 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:37 -0800 Subject: memcg: create extensible page stat update routines Replace usage of the mem_cgroup_update_file_mapped() memcg statistic update routine with two new routines: * mem_cgroup_inc_page_stat() * mem_cgroup_dec_page_stat() As before, only the file_mapped statistic is managed. However, these more general interfaces allow for new statistics to be more easily added. New statistics are added with memcg dirty page accounting. Signed-off-by: Greg Thelen Signed-off-by: Andrea Righi Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 16 +++++++--------- mm/rmap.c | 4 ++-- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762aeaf..067115ce6b3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. */ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 741206ffdace..3d8a0c79dece 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1600,7 +1600,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1623,30 +1624,27 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) unlock_page_cgroup(pc); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. diff --git a/mm/rmap.c b/mm/rmap.c index c30f33854f97..f21f4a1d6a1c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -937,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -979,7 +979,7 @@ void page_remove_rmap(struct page *page) NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, -- cgit From dbd4ea78f002df283c95d9774837041735fa1bf9 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 13 Jan 2011 15:47:38 -0800 Subject: memcg: add lock to synchronize page accounting and migration Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize the page accounting and migration code. This reworks the locking scheme of _update_stat() and _move_account() by adding new lock bit PCG_MOVE_LOCK, which is always taken under IRQ disable. 1. If pages are being migrated from a memcg, then updates to that memcg page statistics are protected by grabbing PCG_MOVE_LOCK using move_lock_page_cgroup(). In an upcoming commit, memcg dirty page accounting will be updating memcg page accounting (specifically: num writeback pages) from IRQ context (softirq). Avoid a deadlocking nested spin lock attempt by disabling irq on the local processor when grabbing the PCG_MOVE_LOCK. 2. lock for update_page_stat is used only for avoiding race with move_account(). So, IRQ awareness of lock_page_cgroup() itself is not a problem. The problem is between mem_cgroup_update_page_stat() and mem_cgroup_move_account_page(). Trade-off: * Changing lock_page_cgroup() to always disable IRQ (or local_bh) has some impacts on performance and I think it's bad to disable IRQ when it's not necessary. * adding a new lock makes move_account() slower. Score is here. Performance Impact: moving a 8G anon process. Before: real 0m0.792s user 0m0.000s sys 0m0.780s After: real 0m0.854s user 0m0.000s sys 0m0.842s This score is bad but planned patches for optimization can reduce this impact. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Greg Thelen Reviewed-by: Minchan Kim Acked-by: Daisuke Nishimura Cc: Andrea Righi Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 9 +++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index fdb5a92b5ac7..5b0c971d7cae 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_FILE_DIRTY, /* page is dirty */ PCG_FILE_WRITEBACK, /* page is under writeback */ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ - PCG_MIGRATION, /* under page migration */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -117,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -130,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d8a0c79dece..d888956a2cfc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1606,6 +1606,7 @@ void mem_cgroup_update_page_stat(struct page *page, struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1617,7 +1618,7 @@ void mem_cgroup_update_page_stat(struct page *page, /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) @@ -1640,7 +1641,7 @@ void mem_cgroup_update_page_stat(struct page *page, out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } @@ -2211,9 +2212,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); -- cgit From f3e8eb70b1807d1b30aa6972af0cf30077c40112 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:39 -0800 Subject: memcg: fix unit mismatch in memcg oom limit calculation Adding the number of swap pages to the byte limit of a memory control group makes no sense. Convert the pages to bytes before adding them. The only user of this code is the OOM killer, and the way it is used means that the error results in a higher OOM badness value. Since the cgroup limit is the same for all tasks in the cgroup, the error should have no practical impact at the moment. But let's not wait for future or changing users to trip over it. Signed-off-by: Johannes Weiner Cc: Greg Thelen Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d888956a2cfc..9f8ad77f091b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1312,8 +1312,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available -- cgit From 043d18b1e5bdfc4870b8a19d00f0d5c636a5c231 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Jan 2011 15:47:40 -0800 Subject: memcg: remove unnecessary return from void-returning mem_cgroup_del_lru_list() Signed-off-by: Minchan Kim Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f8ad77f091b..1b44ad64f281 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -821,7 +821,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) -- cgit From dfe076b0971a783469bc2066e85d46e23c8acb1c Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:41 -0800 Subject: memcg: fix deadlock between cpuset and memcg Commit b1dd693e ("memcg: avoid deadlock between move charge and try_charge()") can cause another deadlock about mmap_sem on task migration if cpuset and memcg are mounted onto the same mount point. After the commit, cgroup_attach_task() has sequence like: cgroup_attach_task() ss->can_attach() cpuset_can_attach() mem_cgroup_can_attach() down_read(&mmap_sem) (1) ss->attach() cpuset_attach() mpol_rebind_mm() down_write(&mmap_sem) (2) up_write(&mmap_sem) cpuset_migrate_mm() do_migrate_pages() down_read(&mmap_sem) up_read(&mmap_sem) mem_cgroup_move_task() mem_cgroup_clear_mc() up_read(&mmap_sem) We can cause deadlock at (2) because we've already aquire the mmap_sem at (1). But the commit itself is necessary to fix deadlocks which have existed before the commit like: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() | -> move charge should wake it up Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() | -> move charge should wake it up This patch fixes all of these problems by: 1. revert the commit. 2. To fix the Ex.1, we set mc.moving_task after mem_cgroup_count_precharge() has released the mmap_sem. 3. To fix the Ex.2, we use down_read_trylock() instead of down_read() in mem_cgroup_move_charge() and, if it has failed to aquire the lock, cancel all extra charges, wake up all waiters, and retry trylock. Signed-off-by: Daisuke Nishimura Reported-by: Ben Blum Cc: Miao Xie Cc: David Rientjes Cc: Paul Menage Cc: Hiroyuki Kamezawa Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 84 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b44ad64f281..c339d7431bda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,7 +292,6 @@ static struct move_charge_struct { unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ - struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4681,7 +4680,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - /* We've already held the mmap_sem */ + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4693,6 +4692,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } + up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4702,10 +4702,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4740,23 +4745,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } - if (mc.mm) { - up_read(&mc.mm->mmap_sem); - mmput(mc.mm); - } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mc.moving_task = NULL; - mc.mm = NULL; mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4778,38 +4788,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { - /* - * We do all the move charge works under one mmap_sem to - * avoid deadlock with down_write(&mmap_sem) - * -> try_charge() -> if (mc.moving_task) -> sleep. - */ - down_read(&mm->mmap_sem); - VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); - VM_BUG_ON(mc.mm); - mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; spin_unlock(&mc.lock); - mc.moving_task = current; - mc.mm = mm; + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - /* We call up_read() and mmput() in clear_mc(). */ - } else - mmput(mm); + } + mmput(mm); } return ret; } @@ -4898,7 +4893,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - /* We've already held the mmap_sem */ +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4917,6 +4924,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } + up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4925,11 +4933,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - if (!mc.mm) + struct mm_struct *mm; + + if (!mc.to) /* no need to move charge */ return; - mem_cgroup_move_charge(mc.mm); + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- cgit From 17295c88a160c6eea3fcf46cec9d08a0fcb02db9 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 13 Jan 2011 15:47:42 -0800 Subject: memcg: use [kv]zalloc[_node] rather than [kv]malloc+memset In mem_cgroup_alloc() we currently do either kmalloc() or vmalloc() then followed by memset() to zero the memory. This can be more efficiently achieved by using kzalloc() and vzalloc(). There's also one situation where we can use kzalloc_node() - this is what's new in this version of the patch. Signed-off-by: Jesper Juhl Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Wu Fengguang Cc: Balbir Singh Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c339d7431bda..6424ba0fce83 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4216,13 +4216,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4246,14 +4244,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; -- cgit From 50de1dd967d4ba3b8a90ebe7a4f5feca24191317 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:43 -0800 Subject: memcg: fix memory migration of shmem swapcache In the current implementation mem_cgroup_end_migration() decides whether the page migration has succeeded or not by checking "oldpage->mapping". But if we are tring to migrate a shmem swapcache, the page->mapping of it is NULL from the begining, so the check would be invalid. As a result, mem_cgroup_end_migration() assumes the migration has succeeded even if it's not, so "newpage" would be freed while it's not uncharged. This patch fixes it by passing mem_cgroup_end_migration() the result of the page migration. Signed-off-by: Daisuke Nishimura Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Minchan Kim Reviewed-by: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/memcontrol.c | 5 ++--- mm/migrate.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 067115ce6b3e..6a576f989437 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. @@ -251,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6424ba0fce83..8ab841031436 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2896,7 +2896,7 @@ int mem_cgroup_prepare_migration(struct page *page, /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2905,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 5b7d1fd29621..46fe8cc13d67 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -768,7 +768,7 @@ skip_unmap: uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); -- cgit From bba958783b1b4cb0a9420f4e11082467132a334c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 14 Jan 2011 15:57:47 +0900 Subject: mmc: sh_mmcif: Convert to __raw_xxx() I/O accessors. When using the I/O accessors in raw mode from the boot stubs we don't want to bother with any of the complexity associated with readl/writel and friends. Furthermore, utilization within the context of the host driver itself is all performed on an ioremapped window, so using the __raw variants there doesn't pose any problem either. If and when barriers need to be added in the future, these will need to be explicitly written out, but this is so far not a concern for any of the affected CPUs in question. This fixes up the link error introduced by the ARM tree via its barrier refactoring: arch/arm/boot/compressed/mmcif-sh7372.o: In function `mmcif_loader': mmcif-sh7372.c:(.text+0x9e8): undefined reference to `outer_cache Following the change in: http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=6275/1 Reported-by: Simon Horman Signed-off-by: Paul Mundt --- include/linux/mmc/sh_mmcif.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index bf173502d744..38d393092812 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -94,12 +94,12 @@ struct sh_mmcif_plat_data { static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) { - return readl(addr + reg); + return __raw_readl(addr + reg); } static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) { - writel(val, addr + reg); + __raw_writel(val, addr + reg); } #define SH_MMCIF_BBS 512 /* boot block size */ -- cgit