Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c     14
-rw-r--r--  mm/madvise.c     21
-rw-r--r--  mm/mempolicy.c    2
-rw-r--r--  mm/page_alloc.c  32
-rw-r--r--  mm/slab.c       186
-rw-r--r--  mm/slob.c         2
-rw-r--r--  mm/swap.c        32
-rw-r--r--  mm/vmscan.c     106
8 files changed, 252 insertions, 143 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b21d78c941b5..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
BUG_ON(page_count(page));
INIT_LIST_HEAD(&page->lru);
- page[1].mapping = NULL;
+ page[1].lru.next = NULL; /* reset dtor */
spin_lock(&hugetlb_lock);
enqueue_huge_page(page);
@@ -105,9 +105,9 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
}
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
- page[1].mapping = (void *)free_huge_page;
+ page[1].lru.next = (void *)free_huge_page; /* set dtor */
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ clear_user_highpage(&page[i], addr);
return page;
}
@@ -391,12 +391,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page) {
page_cache_release(old_page);
-
- /* Logically this is OOM, not a SIGBUS, but an OOM
- * could cause the kernel to go killing other
- * processes which won't help the hugepage situation
- * at all (?) */
- return VM_FAULT_SIGBUS;
+ return VM_FAULT_OOM;
}
spin_unlock(&mm->page_table_lock);
@@ -444,6 +439,7 @@ retry:
page = alloc_huge_page(vma, address);
if (!page) {
hugetlb_put_quota(mapping);
+ ret = VM_FAULT_OOM;
goto out;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
struct mm_struct * mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
- int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+ int new_flags = vma->vm_flags;
switch (behavior) {
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
case MADV_SEQUENTIAL:
- new_flags |= VM_SEQ_READ;
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
- new_flags |= VM_RAND_READ;
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
- default:
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ new_flags &= ~VM_DONTCOPY;
break;
}
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
long error;
switch (behavior) {
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ break;
+ }
+ case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
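
The behaviour cases above reduce to plain bitmask arithmetic on vm_flags: MADV_NORMAL clears both read hints, MADV_SEQUENTIAL and MADV_RANDOM each set their own hint while clearing the other's, and MADV_DONTFORK/MADV_DOFORK set or clear VM_DONTCOPY. A minimal userspace sketch of that masking logic, with illustrative bit values standing in for the kernel's VM_* and MADV_* constants:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's vm_flags bits (values are made up). */
#define VM_SEQ_READ  0x1
#define VM_RAND_READ 0x2
#define VM_DONTCOPY  0x4

/* Illustrative behaviour codes; the real MADV_* values differ. */
enum { MADV_NORMAL, MADV_SEQUENTIAL, MADV_RANDOM, MADV_DONTFORK, MADV_DOFORK };

/* Mirrors the switch in madvise_behavior(): compute the new flag word. */
static unsigned long new_vm_flags(unsigned long flags, int behavior)
{
	switch (behavior) {
	case MADV_NORMAL:
		return flags & ~VM_RAND_READ & ~VM_SEQ_READ;
	case MADV_SEQUENTIAL:
		return (flags & ~VM_RAND_READ) | VM_SEQ_READ;
	case MADV_RANDOM:
		return (flags & ~VM_SEQ_READ) | VM_RAND_READ;
	case MADV_DONTFORK:
		return flags | VM_DONTCOPY;
	case MADV_DOFORK:
		return flags & ~VM_DONTCOPY;
	}
	return flags;
}

int main(void)
{
	unsigned long f = VM_RAND_READ;

	f = new_vm_flags(f, MADV_SEQUENTIAL);	/* read hint flips to sequential */
	printf("flags after MADV_SEQUENTIAL: %#lx\n", f);
	f = new_vm_flags(f, MADV_DONTFORK);	/* VM_DONTCOPY added on top */
	printf("flags after MADV_DONTFORK:  %#lx\n", f);
	return 0;
}
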
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27da6d5c77ba..3bd7fb7e4b75 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1159,6 +1159,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
+#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
@@ -1172,6 +1173,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
+#endif
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44b4eb4202d9..62c122528587 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
int percpu_pagelist_fraction;
static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
* All pages have PG_compound set. All pages have their ->private pointing at
* the head page (even the head page has this).
*
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present. This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function. Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
*/
+
+static void free_compound_page(struct page *page)
+{
+ __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
static void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
- page[1].mapping = NULL;
- page[1].index = order;
+ page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- if (unlikely(page[1].index != order))
+ if (unlikely((unsigned long)page[1].lru.prev != order))
bad_page(page);
for (i = 0; i < nr_pages; i++) {
@@ -1213,18 +1217,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
int cpu = 0;
- memset(ret, 0, sizeof(*ret));
+ memset(ret, 0, nr * sizeof(unsigned long));
cpus_and(*cpumask, *cpumask, cpu_online_map);
cpu = first_cpu(*cpumask);
while (cpu < NR_CPUS) {
unsigned long *in, *out, off;
+ if (!cpu_isset(cpu, *cpumask))
+ continue;
+
in = (unsigned long *)&per_cpu(page_states, cpu);
cpu = next_cpu(cpu, *cpumask);
- if (cpu < NR_CPUS)
+ if (likely(cpu < NR_CPUS))
prefetch(&per_cpu(page_states, cpu));
out = (unsigned long *)ret;
@@ -1886,8 +1893,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static struct per_cpu_pageset
- boot_pageset[NR_CPUS];
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
/*
* Dynamically allocate memory for the
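
The comment block above describes the whole compound-page convention: the destructor is stashed as a function pointer in the first tail page's ->lru.next and the allocation order in its ->lru.prev, both cast through pointers, so free_compound_page() can recover the order at free time. A rough userspace model of that bookkeeping, using a hypothetical fake_page struct rather than the real struct page:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct page: only the two fields the trick uses. */
struct fake_page {
	void *lru_next;		/* first tail page: destructor pointer */
	void *lru_prev;		/* first tail page: allocation order */
};

static void free_compound(struct fake_page *head)
{
	unsigned long order = (unsigned long)head[1].lru_prev;

	printf("freeing compound allocation of order %lu\n", order);
	free(head);
}

/* Mirrors prep_compound_page(): record dtor and order in the first tail page. */
static struct fake_page *prep_compound(unsigned long order)
{
	unsigned long nr = 1UL << order;
	struct fake_page *head = calloc(nr, sizeof(*head));

	if (!head)
		return NULL;
	head[1].lru_next = (void *)free_compound;	/* set dtor */
	head[1].lru_prev = (void *)order;		/* remember order */
	return head;
}

int main(void)
{
	struct fake_page *p = prep_compound(2);	/* a 4-"page" compound */

	if (!p)
		return 1;
	/* Later, the stored destructor is recovered from the tail page and run. */
	((void (*)(struct fake_page *))p[1].lru_next)(p);
	return 0;
}
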
diff --git a/mm/slab.c b/mm/slab.c
index 71370256a7eb..add05d808a4a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -294,6 +294,7 @@ struct kmem_list3 {
unsigned long next_reap;
int free_touched;
unsigned int free_limit;
+ unsigned int colour_next; /* Per-node cache coloring */
spinlock_t list_lock;
struct array_cache *shared; /* shared per node */
struct array_cache **alien; /* on other nodes */
@@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
INIT_LIST_HEAD(&parent->slabs_free);
parent->shared = NULL;
parent->alien = NULL;
+ parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
@@ -390,7 +392,6 @@ struct kmem_cache {
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
- unsigned int colour_next; /* cache colouring */
struct kmem_cache *slabp_cache;
unsigned int slab_size;
unsigned int dflags; /* dynamic flags */
@@ -883,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
}
}
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
{
int i = 0;
struct array_cache *ac;
unsigned long flags;
for_each_online_node(i) {
- ac = l3->alien[i];
+ ac = alien[i];
if (ac) {
spin_lock_irqsave(&ac->lock, flags);
__drain_alien_cache(cachep, ac, i);
@@ -899,9 +900,18 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
}
}
#else
-#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+ return (struct array_cache **) 0x01020304ul;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
#endif
static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -935,6 +945,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
cachep->nodelists[node] = l3;
}
@@ -949,26 +964,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
& array cache's */
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount);
if (!nc)
goto bad;
+ shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount,
+ 0xbaadf00d);
+ if (!shared)
+ goto bad;
+
+ alien = alloc_alien_cache(node, cachep->limit);
+ if (!alien)
+ goto bad;
cachep->array[cpu] = nc;
l3 = cachep->nodelists[node];
BUG_ON(!l3);
- if (!l3->shared) {
- if (!(nc = alloc_arraycache(node,
- cachep->shared *
- cachep->batchcount,
- 0xbaadf00d)))
- goto bad;
- /* we are serialised from CPU_DEAD or
- CPU_UP_CANCELLED by the cpucontrol lock */
- l3->shared = nc;
+ spin_lock_irq(&l3->list_lock);
+ if (!l3->shared) {
+ /*
+ * We are serialised from CPU_DEAD or
+ * CPU_UP_CANCELLED by the cpucontrol lock
+ */
+ l3->shared = shared;
+ shared = NULL;
}
+#ifdef CONFIG_NUMA
+ if (!l3->alien) {
+ l3->alien = alien;
+ alien = NULL;
+ }
+#endif
+ spin_unlock_irq(&l3->list_lock);
+
+ kfree(shared);
+ free_alien_cache(alien);
}
mutex_unlock(&cache_chain_mutex);
break;
@@ -977,25 +1012,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ /*
+ * Even if all the cpus of a node are down, we don't free the
+ * kmem_list3 of any cache. This to avoid a race between
+ * cpu_down, and a kmalloc allocation from another cpu for
+ * memory from the node of the cpu going down. The list3
+ * structure is usually allocated from kmem_cache_create() and
+ * gets destroyed at kmem_cache_destroy().
+ */
/* fall thru */
case CPU_UP_CANCELED:
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
cpumask_t mask;
mask = node_to_cpumask(node);
- spin_lock_irq(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
l3 = cachep->nodelists[node];
if (!l3)
- goto unlock_cache;
+ goto free_array_cache;
- spin_lock(&l3->list_lock);
+ spin_lock_irq(&l3->list_lock);
/* Free limit for this kmem_list3 */
l3->free_limit -= cachep->batchcount;
@@ -1003,34 +1047,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
free_block(cachep, nc->entry, nc->avail, node);
if (!cpus_empty(mask)) {
- spin_unlock(&l3->list_lock);
- goto unlock_cache;
+ spin_unlock_irq(&l3->list_lock);
+ goto free_array_cache;
}
- if (l3->shared) {
+ shared = l3->shared;
+ if (shared) {
free_block(cachep, l3->shared->entry,
l3->shared->avail, node);
- kfree(l3->shared);
l3->shared = NULL;
}
- if (l3->alien) {
- drain_alien_cache(cachep, l3);
- free_alien_cache(l3->alien);
- l3->alien = NULL;
- }
- /* free slabs belonging to this node */
- if (__node_shrink(cachep, node)) {
- cachep->nodelists[node] = NULL;
- spin_unlock(&l3->list_lock);
- kfree(l3);
- } else {
- spin_unlock(&l3->list_lock);
+ alien = l3->alien;
+ l3->alien = NULL;
+
+ spin_unlock_irq(&l3->list_lock);
+
+ kfree(shared);
+ if (alien) {
+ drain_alien_cache(cachep, alien);
+ free_alien_cache(alien);
}
- unlock_cache:
- spin_unlock_irq(&cachep->spinlock);
+free_array_cache:
kfree(nc);
}
+ /*
+ * In the previous loop, all the objects were freed to
+ * the respective cache's slabs, now we can go ahead and
+ * shrink each nodelist to its limit.
+ */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+ spin_lock_irq(&l3->list_lock);
+ /* free slabs belonging to this node */
+ __node_shrink(cachep, node);
+ spin_unlock_irq(&l3->list_lock);
+ }
mutex_unlock(&cache_chain_mutex);
break;
#endif
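
The reworked CPU_DEAD path above follows a detach-then-free pattern: under l3->list_lock it only unhooks the shared and alien caches (setting the pointers to NULL), then drops the lock before doing the kfree() and draining, so the expensive work happens outside the irq-disabled critical section. A small pthread sketch of the same pattern, with made-up names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node_cache {
	pthread_mutex_t lock;
	int *shared;		/* stand-in for l3->shared */
};

/* Detach the shared array under the lock; free it after the lock is dropped. */
static void teardown(struct node_cache *nc)
{
	int *shared;

	pthread_mutex_lock(&nc->lock);
	shared = nc->shared;	/* grab the pointer... */
	nc->shared = NULL;	/* ...and unpublish it while still locked */
	pthread_mutex_unlock(&nc->lock);

	free(shared);		/* the slow work runs outside the critical section */
}

int main(void)
{
	struct node_cache nc = { PTHREAD_MUTEX_INITIALIZER, malloc(64) };

	teardown(&nc);
	printf("shared cache detached and freed\n");
	return 0;
}
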
@@ -1119,7 +1173,6 @@ void __init kmem_cache_init(void)
BUG();
cache_cache.colour = left_over / cache_cache.colour_off;
- cache_cache.colour_next = 0;
cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
@@ -1664,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
BUG();
}
+ /*
+ * Prevent CPUs from coming and going.
+ * lock_cpu_hotplug() nests outside cache_chain_mutex
+ */
+ lock_cpu_hotplug();
+
mutex_lock(&cache_chain_mutex);
list_for_each(p, &cache_chain) {
@@ -1865,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->dtor = dtor;
cachep->name = name;
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
if (g_cpucache_up == FULL) {
enable_cpucache(cachep);
@@ -1925,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
- unlock_cpu_hotplug();
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
+ unlock_cpu_hotplug();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2011,18 +2068,16 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
- spin_lock_irq(&cachep->spinlock);
for_each_online_node(node) {
l3 = cachep->nodelists[node];
if (l3) {
- spin_lock(&l3->list_lock);
+ spin_lock_irq(&l3->list_lock);
drain_array_locked(cachep, l3->shared, 1, node);
- spin_unlock(&l3->list_lock);
+ spin_unlock_irq(&l3->list_lock);
if (l3->alien)
- drain_alien_cache(cachep, l3);
+ drain_alien_cache(cachep, l3->alien);
}
}
- spin_unlock_irq(&cachep->spinlock);
}
static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -2324,20 +2379,20 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
*/
ctor_flags |= SLAB_CTOR_ATOMIC;
- /* About to mess with non-constant members - lock. */
+ /* Take the l3 list lock to change the colour_next on this node */
check_irq_off();
- spin_lock(&cachep->spinlock);
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);
/* Get colour for the slab, and cal the next value. */
- offset = cachep->colour_next;
- cachep->colour_next++;
- if (cachep->colour_next >= cachep->colour)
- cachep->colour_next = 0;
- offset *= cachep->colour_off;
+ offset = l3->colour_next;
+ l3->colour_next++;
+ if (l3->colour_next >= cachep->colour)
+ l3->colour_next = 0;
+ spin_unlock(&l3->list_lock);
- spin_unlock(&cachep->spinlock);
+ offset *= cachep->colour_off;
- check_irq_off();
if (local_flags & __GFP_WAIT)
local_irq_enable();
@@ -2367,7 +2422,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- l3 = cachep->nodelists[nodeid];
spin_lock(&l3->list_lock);
/* Make slab active. */
@@ -2725,6 +2779,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
BUG_ON(!l3);
retry:
+ check_irq_off();
spin_lock(&l3->list_lock);
entry = l3->slabs_partial.next;
if (entry == &l3->slabs_partial) {
@@ -3304,11 +3359,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
check_irq_on();
- spin_lock_irq(&cachep->spinlock);
+ spin_lock(&cachep->spinlock);
cachep->batchcount = batchcount;
cachep->limit = limit;
cachep->shared = shared;
- spin_unlock_irq(&cachep->spinlock);
+ spin_unlock(&cachep->spinlock);
for_each_online_cpu(i) {
struct array_cache *ccold = new.new[i];
@@ -3440,7 +3495,7 @@ static void cache_reap(void *unused)
l3 = searchp->nodelists[numa_node_id()];
if (l3->alien)
- drain_alien_cache(searchp, l3);
+ drain_alien_cache(searchp, l3->alien);
spin_lock_irq(&l3->list_lock);
drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3564,8 +3619,7 @@ static int s_show(struct seq_file *m, void *p)
int node;
struct kmem_list3 *l3;
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
+ spin_lock(&cachep->spinlock);
active_objs = 0;
num_slabs = 0;
for_each_online_node(node) {
@@ -3573,7 +3627,8 @@ static int s_show(struct seq_file *m, void *p)
if (!l3)
continue;
- spin_lock(&l3->list_lock);
+ check_irq_on();
+ spin_lock_irq(&l3->list_lock);
list_for_each(q, &l3->slabs_full) {
slabp = list_entry(q, struct slab, list);
@@ -3598,9 +3653,10 @@ static int s_show(struct seq_file *m, void *p)
num_slabs++;
}
free_objects += l3->free_objects;
- shared_avail += l3->shared->avail;
+ if (l3->shared)
+ shared_avail += l3->shared->avail;
- spin_unlock(&l3->list_lock);
+ spin_unlock_irq(&l3->list_lock);
}
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
@@ -3644,7 +3700,7 @@ static int s_show(struct seq_file *m, void *p)
}
#endif
seq_putc(m, '\n');
- spin_unlock_irq(&cachep->spinlock);
+ spin_unlock(&cachep->spinlock);
return 0;
}
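
Moving colour_next from struct kmem_cache into the per-node kmem_list3 lets each node advance its own colour cursor under its own list_lock instead of serialising every node on cachep->spinlock. The colour computation itself is just a wrapping counter scaled by the colour offset; a standalone sketch with illustrative field names:

#include <stdio.h>

/* Per-node state, loosely modelling kmem_list3. */
struct node_state {
	unsigned int colour_next;	/* next colour to hand out on this node */
};

/* Pick the slab colour offset for a new slab on the given node. */
static unsigned long next_colour_offset(struct node_state *l3,
					unsigned int colour_range,
					unsigned int colour_off)
{
	unsigned long offset = l3->colour_next;

	l3->colour_next++;
	if (l3->colour_next >= colour_range)	/* wrap around the range */
		l3->colour_next = 0;
	return offset * colour_off;		/* scale by the per-colour step */
}

int main(void)
{
	struct node_state node = { 0 };
	int i;

	/* With a colour range of 3 and a 64-byte step: 0, 64, 128, 0, 64, ... */
	for (i = 0; i < 5; i++)
		printf("slab %d starts at colour offset %lu\n",
		       i, next_colour_offset(&node, 3, 64));
	return 0;
}
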
diff --git a/mm/slob.c b/mm/slob.c
index 1c240c4b71d9..a1f42bdc0245 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(slab_reclaim_pages);
#ifdef CONFIG_SMP
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
{
int i;
struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
diff --git a/mm/swap.c b/mm/swap.c
index bc2442a7b0ee..cce3dda59c59 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,19 +34,22 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-void put_page(struct page *page)
+static void put_compound_page(struct page *page)
{
- if (unlikely(PageCompound(page))) {
- page = (struct page *)page_private(page);
- if (put_page_testzero(page)) {
- void (*dtor)(struct page *page);
+ page = (struct page *)page_private(page);
+ if (put_page_testzero(page)) {
+ void (*dtor)(struct page *page);
- dtor = (void (*)(struct page *))page[1].mapping;
- (*dtor)(page);
- }
- return;
+ dtor = (void (*)(struct page *))page[1].lru.next;
+ (*dtor)(page);
}
- if (put_page_testzero(page))
+}
+
+void put_page(struct page *page)
+{
+ if (unlikely(PageCompound(page)))
+ put_compound_page(page);
+ else if (put_page_testzero(page))
__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
@@ -244,6 +247,15 @@ void release_pages(struct page **pages, int nr, int cold)
struct page *page = pages[i];
struct zone *pagezone;
+ if (unlikely(PageCompound(page))) {
+ if (zone) {
+ spin_unlock_irq(&zone->lru_lock);
+ zone = NULL;
+ }
+ put_compound_page(page);
+ continue;
+ }
+
if (!put_page_testzero(page))
continue;
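
On the release side, put_page() now sends compound pages through put_compound_page(): it follows page_private() back to the head page, drops the reference there, and on the final put calls whatever destructor was stored in the first tail page's ->lru.next (free_huge_page for hugetlb pages, free_compound_page otherwise). A simplified refcount-and-dispatch model in plain C, again with a hypothetical fake_page layout:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical head/tail page pair; only the fields the dispatch needs. */
struct fake_page {
	int count;		/* reference count (meaningful on the head page) */
	void *dtor;		/* destructor pointer (first tail page) */
};

/* Drop one reference; on the last put, call the stored destructor. */
static void put_compound(struct fake_page *head)
{
	if (--head->count == 0) {
		void (*dtor)(struct fake_page *) =
			(void (*)(struct fake_page *))head[1].dtor;
		dtor(head);
	}
}

static void free_it(struct fake_page *head)
{
	printf("last reference dropped, destructor runs\n");
	free(head);
}

int main(void)
{
	struct fake_page *p = calloc(2, sizeof(*p));

	if (!p)
		return 1;
	p->count = 2;			/* two holders of the page */
	p[1].dtor = (void *)free_it;	/* destructor registered at prep time */

	put_compound(p);		/* first put: refcount drops to 1 */
	put_compound(p);		/* second put: destructor fires */
	return 0;
}
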
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd06..1838c15ca4fd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
BUG_ON(PageActive(page));
sc->nr_scanned++;
+
+ if (!sc->may_swap && page_mapped(page))
+ goto keep_locked;
+
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
struct address_space *mapping = page_mapping(page);
if (page_mapped(page) && mapping)
- if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+ if (try_to_unmap(page, 1) != SWAP_SUCCESS)
goto unlock_retry;
if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
* pages are swapped out.
*
* The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
* or no retryable pages exist anymore.
*
* Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
goto unlock_both;
if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
rc = mapping->a_ops->migratepage(newpage, page);
goto unlock_both;
}
/*
- * Trigger writeout if page is dirty
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
*/
if (PageDirty(page)) {
switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
; /* try to migrate the page below */
}
}
+
/*
- * If we have no buffer or can release the buffer
- * then do a simple migration.
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
*/
if (!page_has_buffers(page) ||
try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
* swap them out.
*/
if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
unlock_page(newpage);
newpage = NULL;
rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
+
+ if (unlikely(sc->may_swap)) {
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
+
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory
+ * is mapped.
+ */
+ mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+ }
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
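
The heuristic that is now computed only when sc->may_swap is set combines three numbers: distress (100 >> prev_priority), the percentage of memory that is mapped, and the vm_swappiness tunable; mapped pages become eligible for reclaim once mapped_ratio/2 + distress + vm_swappiness reaches 100. A tiny standalone calculation showing how the threshold behaves:

#include <stdio.h>

/* Returns 1 if mapped pages should be reclaimed, mirroring the heuristic above. */
static int want_reclaim_mapped(int prev_priority, long nr_mapped,
			       long total_memory, long vm_swappiness)
{
	long distress = 100 >> prev_priority;		/* reclaim trouble, 0..100 */
	long mapped_ratio = (nr_mapped * 100) / total_memory; /* % of memory mapped */
	long swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

	return swap_tendency >= 100;
}

int main(void)
{
	/* Easy reclaim (priority 12 => distress 0), 40% mapped, swappiness 60: no. */
	printf("%d\n", want_reclaim_mapped(12, 40, 100, 60));
	/* Heavy pressure (priority 0 => distress 100) tips the sum past 100: yes. */
	printf("%d\n", want_reclaim_mapped(0, 40, 100, 60));
	return 0;
}
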
@@ -1595,9 +1621,7 @@ scan:
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
- atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
- atomic_dec(&zone->reclaim_in_progress);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);