From 90c5029e471636f21221bf66b9a46ada2ab79a22 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@muc.de>
Date: Wed, 27 Jul 2005 11:43:50 -0700
Subject: [PATCH] Undo mempolicy shared policy rbtree microoptimization

All mempolicy changes must be inside the spinlock and readding the rb_erase
prevents a crash while doing:

> echo "1" > /tmp/numatest
> numactl --length=0x4000 --shm /tmp/numatest --localalloc
> numactl --length=0x2000 --offset=0 --shm /tmp/numatest --membind=0
> numactl --length=0x2000 --offset=0x2000 --shm /tmp/numatest --membind=1
> ipcs
> ipcrm -M "the_key_value_of_this_shm_area"

Based on a patch by John Blackwood

Cc: <john.blackwood@ccur.com>
Cc: <andrea@suse.de>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cb41c31e7c87..1694845526be 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1138,11 +1138,11 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
+		rb_erase(&n->nd, &p->root);
 		mpol_free(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
 	spin_unlock(&p->lock);
-	p->root = RB_ROOT;
 }
 
 /* assumes fs == KERNEL_DS */
-- 
cgit 


From 1aaf18ff9de1f37bf674236fc0779c3aaa65b998 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 27 Jul 2005 11:43:54 -0700
Subject: [PATCH] check_user_page_readable() deadlock fix

Fix bug identifued by Richard Purdie <rpurdie@rpsys.net>.

oprofile calls check_user_page_readable() from interrupt context, so we
deadlock over various VFS locks.

But check_user_page_readable() doesn't imply either a read or a write of the
page's contents.  Change __follow_page() so that check_user_page_readable()
can tell __follow_page() that we're not accessing the page's contents, and use
that info to avoid the troublesome lock-takings.

Also, make follow_page() inline for the single callsite in memory.c to save a
bit of stack space.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index beabdefa6254..6fe77acbc1cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *
-__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+			int read, int write, int accessed)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) && !PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
+			if (accessed) {
+				if (write && !pte_dirty(pte) &&!PageDirty(page))
+					set_page_dirty(page);
+				mark_page_accessed(page);
+			}
 			return page;
 		}
 	}
@@ -829,16 +831,19 @@ out:
 	return NULL;
 }
 
-struct page *
+inline struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
-	return __follow_page(mm, address, /*read*/0, write);
+	return __follow_page(mm, address, 0, write, 1);
 }
 
-int
-check_user_page_readable(struct mm_struct *mm, unsigned long address)
+/*
+ * check_user_page_readable() can be called frm niterrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
+ */
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
 {
-	return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
+	return __follow_page(mm, address, 1, 0, 0) != NULL;
 }
 EXPORT_SYMBOL(check_user_page_readable);
 
-- 
cgit 


From 165cd40235732644b1856a5ed5e158c9b93f6010 Mon Sep 17 00:00:00 2001
From: suzuki <suzuki@in.ibm.com>
Date: Wed, 27 Jul 2005 11:43:59 -0700
Subject: [PATCH] madvise() does not always return -EBADF on non-file mapped
 area

The madvise() system call returns -EBADF for areas which does not map to
files, only for *behaviour* request MADV_WILLNEED.

According to man pages, madvise returns :

EBADF - the map exists, but the area maps something that isn't a file.

Fixes bug 2995.

Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/madvise.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 73180a22877e..c8c01a12fea4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -83,9 +83,6 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
-	if (!file)
-		return -EBADF;
-
 	if (file->f_mapping->a_ops->get_xip_page) {
 		/* no bad return value, but ignore advice */
 		return 0;
@@ -140,11 +137,16 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 	return 0;
 }
 
-static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-			unsigned long start, unsigned long end, int behavior)
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+		unsigned long start, unsigned long end, int behavior)
 {
+	struct file *filp = vma->vm_file;
 	long error = -EBADF;
 
+	if (!filp)
+		goto  out;
+
 	switch (behavior) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
@@ -165,6 +167,7 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev
 		break;
 	}
 		
+out:
 	return error;
 }
 
-- 
cgit 


From 12b1c5f382194d3f656e78fb5c9c8f2bfbe8ed8a Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Wed, 27 Jul 2005 11:44:02 -0700
Subject: [PATCH] Remove bogus warning in page_alloc.c

Originally __free_pages_bulk used the relative page number within a zone to
define its buddies.  This meant that to maintain the "maximally aligned"
requirements (that an allocation of size N will be aligned at least to N
physically) zones had to also be aligned to 1<<MAX_ORDER pages.  When
__free_pages_bulk was updated to use the relative page frame numbers of the
free'd pages to pair buddies this released the alignment constraint on the
'left' edge of the zone.  This allows _either_ edge of the zone to contain
partial MAX_ORDER sized buddies.  These simply never will have matching
buddies and thus will never make it to the 'top' of the pyramid.

The patch below removes a now redundant check ensuring that the mem_map was
aligned to MAX_ORDER.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d6ba6a4b594..42bccfb8464d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1861,7 +1861,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	unsigned long i, j;
-	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 	int cpu, nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
@@ -1934,9 +1933,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
 		zone->zone_start_pfn = zone_start_pfn;
 
-		if ((zone_start_pfn) & (zone_required_alignment-1))
-			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
 		memmap_init(size, nid, j, zone_start_pfn);
 
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
-- 
cgit