From 3d9a854c2dac3e888393b23ba7adafcce4d6d4b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 20 Feb 2010 01:03:43 +0100 Subject: Rename .data[.percpu][.XXX] to .data[..percpu][..XXX]. Signed-off-by: Denys Vlasenko Signed-off-by: Michal Marek --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee9..5daf0abd63c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -397,7 +397,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) { - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); + return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); } static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -- cgit v1.2.3-73-gaa49b From 54e88fad223c4e1d94289611a90c7fe3ebe5631b Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 25 May 2010 18:53:46 +0530 Subject: sched: Make sure timers have migrated before killing the migration_thread Problem: In a stress test where some heavy tests were running along with regular CPU offlining and onlining, a hang was observed. The system seems to be hung at a point where migration_call() tries to kill the migration_thread of the dying CPU, which just got moved to the current CPU. This migration thread does not get a chance to run (and die) since rt_throttled is set to 1 on current, and it doesn't get cleared as the hrtimer which is supposed to reset the rt bandwidth (sched_rt_period_timer) is tied to the CPU which we just marked dead! Solution: This patch pushes the killing of the migration thread to the "CPU_POST_DEAD" event. By then all the timers (including sched_rt_period_timer) should have been migrated (along with other callbacks). Signed-off-by: Amit Arora Signed-off-by: Gautham R Shenoy Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner LKML-Reference: <20100525132346.GA14986@amitarora.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4e7431e7c78..70f8d90331e9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: { struct cpu_stop_work *work; -- cgit v1.2.3-73-gaa49b From 293a7cfeedc2b2380a7c7274902323c3cf5f7575 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 31 May 2010 19:53:50 +0930 Subject: module: fix reference to mod->percpu after freeing module. Rafael sees a sometimes crash at percpu_modfree from kernel/module.c; it only occurred with another (since-reverted) patch, but that patch simply changed timing to uncover this bug, it was otherwise unrelated. The comment about the mod being freed is self-explanatory, but neither Tejun nor I read it. This bug was introduced in 259354deaa, after it had previously been fixed in 6e2b75740b. How embarrassing. Reported-by: "Rafael J. Wysocki" Signed-off-by: Rusty Russell Embarrassingly-Acked-by: Tejun Heo Cc: Masami Hiramatsu Tested-by: "Rafael J. 
Wysocki" Signed-off-by: Linus Torvalds --- kernel/module.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 333fbcc96978..d806e00e4450 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2014,6 +2014,7 @@ static noinline struct module *load_module(void __user *umod, long err = 0; void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; + void __percpu *percpu; mm_segment_t old_fs; @@ -2158,6 +2159,8 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; } + /* Keep this around for failure path. */ + percpu = mod_percpu(mod); /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2463,7 +2466,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - percpu_modfree(mod); + free_percpu(percpu); free_mod: kfree(args); kfree(strmap); -- cgit v1.2.3-73-gaa49b From e51fd5e22e12b39f49b1bb60b37b300b17378a43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 May 2010 12:37:30 +0200 Subject: sched: Fix wake_affine() vs RT tasks Mike reports that since e9e9250b (sched: Scale down cpu_power due to RT tasks), wake_affine() goes funny on RT tasks due to them still having a !0 weight and wake_affine() still subtracts that from the rq weight. Since nobody should be using se->weight for RT tasks, set the value to zero. Also, since we now use ->cpu_power to normalize rq weights to account for RT cpu usage, add that factor into the imbalance computation. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1275316109.27810.22969.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 ++++++------------------ kernel/sched_fair.c | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d48408142503..f8b8996228dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -544,6 +544,8 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; @@ -1499,24 +1501,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -7605,6 +7592,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched_fair.c 
b/kernel/sched_fair.c index 217e4a9393e4..eed35eded602 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1252,8 +1251,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1263,9 +1260,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; /* * If the currently running task will sleep within @@ -2298,6 +2307,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; + cpu_rq(cpu)->cpu_power = power; sdg->cpu_power = power; } -- cgit v1.2.3-73-gaa49b From 5c113fbeed7a5a192d8431a768965f8a45c16475 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 12:15:11 +0100 Subject: fix cpu_chain section mismatch... In commit e9fb7631ebcd ("cpu-hotplug: introduce cpu_notify(), __cpu_notify(), cpu_notify_nofail()") the new helper functions access cpu_chain. As a result, it shouldn't be marked __cpuinitdata (via section mismatch warning). Alternatively, the helper functions should be forced inline, or marked __ref or __cpuinit. In the meantime, this patch silences the warning the trivial way. Signed-off-by: Daniel J Blueman Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8b92539b4754..97d1b426a4ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,7 +34,7 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock -- cgit v1.2.3-73-gaa49b From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 14:54:39 +0200 Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface This changes the interface to be based on bytes instead. The API matches that of F_SETPIPE_SZ in that it rounds up the passed in size so that the resulting page array is a power-of-2 in size. The proc file is renamed to /proc/sys/fs/pipe-max-size to reflect this change. 
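To see the resulting byte-based rounding from userspace, here is a minimal sketch; this is a hypothetical test program rather than part of the patch, and it assumes 4 KiB pages plus a kernel and libc that expose F_SETPIPE_SZ/F_GETPIPE_SZ:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	int granted;

	if (pipe(fds) < 0) {
		perror("pipe");
		return 1;
	}

	/* 100000 bytes = 25 pages, rounded up to 32 pages = 131072 bytes. */
	granted = fcntl(fds[0], F_SETPIPE_SZ, 100000);
	if (granted < 0)
		perror("F_SETPIPE_SZ");
	else
		printf("granted %d bytes\n", granted);

	printf("F_GETPIPE_SZ now reports %d bytes\n",
	       fcntl(fds[0], F_GETPIPE_SZ));

	close(fds[0]);
	close(fds[1]);
	return 0;
}

An unprivileged request above the (now byte-valued) pipe-max-size limit fails with EPERM, which is exactly the knob this patch renames.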
Signed-off-by: Jens Axboe --- fs/pipe.c | 54 ++++++++++++++++++++++++++++++++++++----------- include/linux/pipe_fs_i.h | 4 +++- kernel/sysctl.c | 8 +++---- 3 files changed, 49 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/pipe.c b/fs/pipe.c index f98fae3e36b0..69c4c7c13ea9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,9 +26,14 @@ /* * The max size that a non-root user is allowed to grow the pipe. Can - * be set by root in /proc/sys/fs/pipe-max-pages + * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; /* * We use a start+len construction, which provides full use of the @@ -1156,6 +1161,35 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) return nr_pages * PAGE_SIZE; } +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; @@ -1169,23 +1203,19 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: { - unsigned long nr_pages; + unsigned int size, nr_pages; - /* - * Currently the array must be a power-of-2 size, so adjust - * upwards if needed. 
- */ - nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = roundup_pow_of_two(nr_pages); + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; - if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) { + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; - } else if (nr_pages < 1) { + } else if (nr_pages < PAGE_SIZE) { ret = -EINVAL; goto out; } - ret = pipe_set_size(pipe, arg); + ret = pipe_set_size(pipe, nr_pages); break; } case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 16de3933c45e..445796945ac9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_pages; +extern unsigned int pipe_max_size, pipe_min_size; +int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f00e0b..d24f761f4876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3-73-gaa49b From 485d527686850d68a0e9006dd9904f19f122485e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Jun 2010 14:14:58 -0700 Subject: sys_personality: change sys_personality() to accept "unsigned int" instead of u_long task_struct->personality is "unsigned int", but sys_personality() paths use "unsigned long personality". This means that every assignment or comparison is not right. In particular, if this argument does not fit into "unsigned int" __set_personality() changes the caller's personality and then sys_personality() returns -EINVAL. Turn this argument into "unsigned int" and avoid overflows. Obviously, this is a user-visible change, we just ignore the upper bits. But this can't break a sane application. There is another thing which can confuse poorly written applications. User-space thinks that this syscall returns int, not long. This means that the returned value can be negative and look like the error code. But note that libc won't be confused and thus errno won't be set, and with this patch the user-space can never get -1 unless sys_personality() really fails. And, most importantly, the negative RET != -1 is only possible if that app previously called personality(RET). Pointed-out-by: Wenming Zhang Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/personality.h | 2 +- include/linux/syscalls.h | 2 +- kernel/exec_domain.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/personality.h b/include/linux/personality.h index 126120819a0d..eec3bae164d4 100644 --- a/include/linux/personality.h +++ b/include/linux/personality.h @@ -12,7 +12,7 @@ struct pt_regs; extern int register_exec_domain(struct exec_domain *); extern int unregister_exec_domain(struct exec_domain *); -extern int __set_personality(unsigned long); +extern int __set_personality(unsigned int); #endif /* __KERNEL__ */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a1a86a53bc73..7f614ce274a9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -289,7 +289,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr); asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data); -asmlinkage long sys_personality(u_long personality); +asmlinkage long sys_personality(unsigned int personality); asmlinkage long sys_sigpending(old_sigset_t __user *set); asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set, diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452cadded..dd62f8e714ca 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -135,7 +135,7 @@ unregister: } int -__set_personality(u_long personality) +__set_personality(unsigned int personality) { struct exec_domain *ep, *oep; @@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) +SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; if (personality != 0xffffffff) { set_personality(personality); @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality) return -EINVAL; } - return (long)old; + return old; } -- cgit v1.2.3-73-gaa49b From 94b3dd0f7bb393d93e84a173b1df9b8b64c83ac4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 4 Jun 2010 14:15:03 -0700 Subject: cgroups: alloc_css_id() increments hierarchy depth Child groups should have a greater depth than their parents. Prior to this change, the parent would incorrectly report zero memory usage for child cgroups when use_hierarchy is enabled. 
test script: mount -t cgroup none /cgroups -o memory cd /cgroups mkdir cg1 echo 1 > cg1/memory.use_hierarchy mkdir cg1/cg11 echo $$ > cg1/cg11/tasks dd if=/dev/zero of=/tmp/foo bs=1M count=1 echo echo CHILD grep cache cg1/cg11/memory.stat echo echo PARENT grep cache cg1/memory.stat echo $$ > tasks rmdir cg1/cg11 cg1 cd / umount /cgroups With fae9c79 (a recent patch that changed the alloc_css_id() depth computation), the parent incorrectly reports zero usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0151844 s, 69.1 MB/s CHILD cache 1048576 total_cache 1048576 PARENT cache 0 total_cache 0 With this patch, the parent correctly includes child usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0136827 s, 76.6 MB/s CHILD cache 1052672 total_cache 1052672 PARENT cache 0 total_cache 1052672 Signed-off-by: Greg Thelen Acked-by: Paul Menage Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: [2.6.34.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 422cb19f156e..3ac6f5b0a64b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4598,7 +4598,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; parent_id = parent_css->id; - depth = parent_id->depth; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) -- cgit v1.2.3-73-gaa49b From 9e506f7adce8e6165a104d3d78fddd8ff0cdccf8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 4 Jun 2010 14:15:04 -0700 Subject: kernel/: fix BUG_ON checks for cpu notifier callbacks direct call The commit 80b5184cc537718122e036afe7e62d202b70d077 ("kernel/: convert cpu notifier to return encapsulate errno value") changed the return value of cpu notifier callbacks. Those callbacks don't return NOTIFY_BAD on failures anymore. But there are a few callbacks which are called directly at init time and check the return value. I forgot to change the BUG_ON checks in the direct callers in that commit. 
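For context on why the old assertion passes silently: a failing callback now returns a stop-marked value with the errno folded in, which is never equal to the NOTIFY_BAD constant. A standalone sketch; the constants and notifier_from_errno() are transcribed from include/linux/notifier.h of this period, while the demo around them is illustrative only:

#include <errno.h>
#include <stdio.h>

#define NOTIFY_DONE		0x0000
#define NOTIFY_OK		0x0001
#define NOTIFY_STOP_MASK	0x8000
#define NOTIFY_BAD		(NOTIFY_STOP_MASK | 0x0002)

static int notifier_from_errno(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
	return NOTIFY_OK;
}

int main(void)
{
	int ret = notifier_from_errno(-ENOMEM);	/* e.g. a failed CPU_UP_PREPARE */

	printf("callback returned 0x%x\n", ret);	/* 0x800d, not 0x8002 */
	printf("old check (ret == NOTIFY_BAD): %d\n", ret == NOTIFY_BAD);
	printf("new check (ret != NOTIFY_OK): %d\n", ret != NOTIFY_OK);
	return 0;
}

Hence the replacements below assert err != NOTIFY_OK rather than comparing against NOTIFY_BAD.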
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- kernel/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 825e1126008f..07b4f1b1a73a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/timer.c b/kernel/timer.c index 2454172a80d3..ee305c8d4e18 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1717,7 +1717,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3-73-gaa49b From 2c02dfe7fe3fba97a5665d329d039d2415ea5607 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 31 May 2010 12:19:37 -0700 Subject: module: Make the 'usage' lists be two-way When adding a module that depends on another one, we used to create a one-way list of "modules_which_use_me", so that module unloading could see who needs a module. It's actually quite simple to make that list go both ways: so that we not only can see "who uses me", but also see a list of modules that are "used by me". In fact, we always wanted that list in "module_unload_free()": when we unload a module, we want to also release all the other modules that are used by that module. But because we didn't have that list, we used to first iterate over all modules, and then iterate over each "used by me" list of that module. By making the list two-way, we simplify module_unload_free(), and it allows for some trivial fixes later too. Signed-off-by: Linus Torvalds Signed-off-by: Rusty Russell (cleaned & rebased) --- include/linux/module.h | 4 ++- kernel/module.c | 79 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 6914fcad4673..680db9e2ac36 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -359,7 +359,9 @@ struct module #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ - struct list_head modules_which_use_me; + struct list_head source_list; + /* What modules do I depend on? */ + struct list_head target_list; /* Who is waiting for us to be unloaded */ struct task_struct *waiter; diff --git a/kernel/module.c b/kernel/module.c index 0129769301e3..be18c3e34684 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -523,7 +523,8 @@ static void module_unload_init(struct module *mod) { int cpu; - INIT_LIST_HEAD(&mod->modules_which_use_me); + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); for_each_possible_cpu(cpu) { per_cpu_ptr(mod->refptr, cpu)->incs = 0; per_cpu_ptr(mod->refptr, cpu)->decs = 0; @@ -538,8 +539,9 @@ static void module_unload_init(struct module *mod) /* modules using other modules */ struct module_use { - struct list_head list; - struct module *module_which_uses; + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; }; /* Does a already use b? 
*/ @@ -547,8 +549,8 @@ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -557,6 +559,33 @@ static int already_uses(struct module *a, struct module *b) return 0; } +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) +{ + int no_warn; + struct module_use *use; + + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); + return 0; +} + /* Module a uses b */ int use_module(struct module *a, struct module *b) { @@ -578,17 +607,11 @@ int use_module(struct module *a, struct module *b) if (err) return 0; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); return 0; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } EXPORT_SYMBOL_GPL(use_module); @@ -596,22 +619,16 @@ EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) { - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; + struct module_use *use, *tmp; - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); + sysfs_remove_link(i->holders_dir, mod->name); } } @@ -735,7 +752,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -799,9 +816,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. 
*/ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { -- cgit v1.2.3-73-gaa49b From c8e21ced08b39ef8dfe7236fb2a923a95f645262 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:35 -0600 Subject: module: fix kdb's illicit use of struct module_use. Linus changed the structure, and luckily this didn't compile any more. Reported-by: Stephen Rothwell Signed-off-by: Rusty Russell Cc: Jason Wessel Cc: Martin Hicks --- include/linux/module.h | 7 +++++++ kernel/debug/kdb/kdb_main.c | 12 +++--------- kernel/module.c | 11 +---------- 3 files changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 680db9e2ac36..5d8fca5dcff5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -181,6 +181,13 @@ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ((typeof(&x))(__symbol_get(MODULE_SYMBOL_PREFIX #x))) +/* modules using other modules: kdb wants to see this. */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + #ifndef __GENKSYMS__ #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b724c791b6d4..184cd8209c36 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1857,12 +1857,6 @@ static int kdb_ef(int argc, const char **argv) } #if defined(CONFIG_MODULES) -/* modules using other modules */ -struct module_use { - struct list_head list; - struct module *module_which_uses; -}; - /* * kdb_lsmod - This function implements the 'lsmod' command. Lists * currently loaded kernel modules. @@ -1894,9 +1888,9 @@ static int kdb_lsmod(int argc, const char **argv) { struct module_use *use; kdb_printf(" [ "); - list_for_each_entry(use, &mod->modules_which_use_me, - list) - kdb_printf("%s ", use->module_which_uses->name); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); kdb_printf("]\n"); } #endif diff --git a/kernel/module.c b/kernel/module.c index be18c3e34684..bbb1d812c79c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -536,14 +536,6 @@ static void module_unload_init(struct module *mod) mod->waiter = current; } -/* modules using other modules */ -struct module_use -{ - struct list_head source_list; - struct list_head target_list; - struct module *source, *target; -}; - /* Does a already use b? */ static int already_uses(struct module *a, struct module *b) { @@ -589,8 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) /* Module a uses b */ int use_module(struct module *a, struct module *b) { - struct module_use *use; - int no_warn, err; + int err; if (b == NULL || already_uses(a, b)) return 1; -- cgit v1.2.3-73-gaa49b From 80a3d1bb410e000e176931a076cdf19a1e89a955 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move sysfs exposure to end of load_module This means a little extra work, but is more logical: we don't put anything in sysfs until we're about to put the module into the global list an parse its parameters. 
This also gives us a logical place to put duplicate module detection in the next patch. Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbb1d812c79c..c690d9885797 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -560,7 +560,6 @@ static int already_uses(struct module *a, struct module *b) */ static int add_module_usage(struct module *a, struct module *b) { - int no_warn; struct module_use *use; DEBUGP("Allocating new usage for %s.\n", a->name); @@ -574,7 +573,6 @@ static int add_module_usage(struct module *a, struct module *b) use->target = b; list_add(&use->source_list, &b->source_list); list_add(&use->target_list, &a->target_list); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 0; } @@ -619,7 +617,6 @@ static void module_unload_free(struct module *mod) list_del(&use->source_list); list_del(&use->target_list); kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); } } @@ -1303,6 +1300,29 @@ static inline void remove_notes_attrs(struct module *mod) #endif #ifdef CONFIG_SYSFS +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } +#endif +} + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); +#endif +} + int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; @@ -1385,6 +1405,10 @@ int mod_sysfs_setup(struct module *mod, { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1399,6 +1423,8 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1408,6 +1434,7 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } @@ -1422,10 +1449,15 @@ static void mod_sysfs_fini(struct module *mod) { } +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -2242,11 +2274,6 @@ static noinline struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); - /* add kobject, so we can reference it. 
*/ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2443,6 +2470,7 @@ static noinline struct module *load_module(void __user *umod, err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2461,9 +2489,6 @@ static noinline struct module *load_module(void __user *umod, module_arch_cleanup(mod); cleanup: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) free_percpu(mod->refptr); -- cgit v1.2.3-73-gaa49b From 6407ebb271fc34440b306f305e1efb7685eece26 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: Make module sysfs functions private. These were placed in the header in ef665c1a06 to get the various SYSFS/MODULE config combinations to compile. That may have been necessary then, but it's not now. These functions are all local to module.c. Signed-off-by: Rusty Russell Cc: Randy Dunlap --- include/linux/module.h | 33 --------------------------------- kernel/module.c | 29 +++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 5d8fca5dcff5..8a6b9fdc7ffa 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -672,43 +672,10 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) #endif /* CONFIG_MODULES */ -struct device_driver; #ifdef CONFIG_SYSFS -struct module; - extern struct kset *module_kset; extern struct kobj_type module_ktype; extern int module_sysfs_initialized; - -int mod_sysfs_init(struct module *mod); -int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params); -int module_add_modinfo_attrs(struct module *mod); -void module_remove_modinfo_attrs(struct module *mod); - -#else /* !CONFIG_SYSFS */ - -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} - -static inline int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) -{ } - #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) diff --git a/kernel/module.c b/kernel/module.c index c690d9885797..808aa18dd661 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1323,7 +1323,7 @@ static void del_usage_links(struct module *mod) #endif } -int module_add_modinfo_attrs(struct module *mod) +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1349,7 +1349,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1365,7 +1365,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1399,7 +1399,7 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static 
int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { @@ -1445,6 +1445,27 @@ static void mod_sysfs_fini(struct module *mod) #else /* CONFIG_SYSFS */ +static inline int mod_sysfs_init(struct module *mod) +{ + return 0; +} + +static inline int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline int module_add_modinfo_attrs(struct module *mod) +{ + return 0; +} + +static inline void module_remove_modinfo_attrs(struct module *mod) +{ +} + static void mod_sysfs_fini(struct module *mod) { } -- cgit v1.2.3-73-gaa49b From 75676500f8298f0ee89db12db97294883c4b768e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: make locking more fine-grained. Kay Sievers reports that we still have some contention over module loading which is slowing boot. Linus also disliked a previous "drop lock and regrab" patch to fix the bne2 "gave up waiting for init of module libcrc32c" message. This is more ambitious: we only grab the lock where we need it. Signed-off-by: Rusty Russell Cc: Brandon Philips Cc: Kay Sievers Cc: Linus Torvalds --- kernel/module.c | 65 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 808aa18dd661..d293c213c22c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -72,7 +72,11 @@ /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); @@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. 
*/ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -576,7 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) return 0; } -/* Module a uses b */ +/* Module a uses b: caller needs module_mutex() */ int use_module(struct module *a, struct module *b) { int err; @@ -610,6 +615,7 @@ static void module_unload_free(struct module *mod) { struct module_use *use, *tmp; + mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; DEBUGP("%s unusing %s\n", mod->name, i->name); @@ -618,6 +624,7 @@ static void module_unload_free(struct module *mod) list_del(&use->target_list); kfree(use); } + mutex_unlock(&module_mutex); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -784,13 +791,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); return ret; } @@ -1006,6 +1014,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -1048,8 +1058,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, @@ -1059,6 +1068,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, const struct kernel_symbol *sym; const unsigned long *crc; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); /* use_module can fail due to OOM, @@ -1068,6 +1078,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, || !use_module(mod, owner)) sym = NULL; } + mutex_unlock(&module_mutex); return sym; } @@ -1306,10 +1317,12 @@ static void add_usage_links(struct module *mod) struct module_use *use; int nowarn; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) { nowarn = sysfs_create_link(use->target->holders_dir, &mod->mkobj.kobj, mod->name); } + mutex_unlock(&module_mutex); #endif } @@ -1318,8 +1331,10 @@ static void del_usage_links(struct module *mod) #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); #endif } @@ -1497,13 +1512,15 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -1575,7 +1592,14 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + const struct kernel_symbol *sym; + + /* Stopping preemption makes find_symbol safe. */ + preempt_disable(); + sym = find_symbol(s->name, &owner, NULL, true, false); + preempt_enable(); + + if (sym) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2021,11 +2045,13 @@ static void *module_alloc_update_bounds(unsigned long size) void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. */ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } @@ -2482,7 +2508,9 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) @@ -2504,8 +2532,10 @@ static noinline struct module *load_module(void __user *umod, return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); cleanup: @@ -2556,19 +2586,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2585,9 +2606,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } -- cgit v1.2.3-73-gaa49b From 3bafeb6247042dcbb72b0141ec7c7107de9f0b99 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move find_module check to end I think Rusty may have made the lock a bit _too_ finegrained there, and didn't add it to some places that needed it. It looks, for example, like PATCH 1/2 actually drops the lock in places where it's needed ("find_module()" is documented to need it, but now load_module() didn't hold it at all when it did the find_module()). Rather than adding a new "module_loading" list, I think we should be able to just use the existing "modules" list, and just fix up the locking a bit. 
In fact, maybe we could just move the "look up existing module" a bit later - optimistically assuming that the module doesn't exist, and then just undoing the work if it turns out that we were wrong, just before adding ourselves to the list. Signed-off-by: Rusty Russell --- kernel/module.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d293c213c22c..28450047852a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2226,11 +2226,6 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; } - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - mod->state = MODULE_STATE_COMING; /* Allow arches to frob section contents and sizes. */ @@ -2509,6 +2504,12 @@ static noinline struct module *load_module(void __user *umod, * The mutex protects against concurrent writers. */ mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + /* This will also unlock the mutex */ + goto already_exists; + } + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2535,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + already_exists: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.2.3-73-gaa49b From be593f4ce4eb1bd40e38fdc403371f149f6f12eb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: verify_export_symbols under the lock It disabled preempt so it was "safe", but nothing stops another module slipping in before this module is added to the global list now we don't hold the lock the whole time. So we check this just after we check for duplicate modules, and just before we put the module in the global list. (find_symbol finds symbols in coming and going modules, too). Signed-off-by: Rusty Russell --- kernel/module.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 28450047852a..f99558e1945a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1571,6 +1571,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. */ static int verify_export_symbols(struct module *mod) { @@ -1592,14 +1594,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - const struct kernel_symbol *sym; - - /* Stopping preemption makes find_symbol safe. 
*/ - preempt_disable(); - sym = find_symbol(s->name, &owner, NULL, true, false); - preempt_enable(); - - if (sym) { + if (find_symbol(s->name, &owner, NULL, true, false)) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2440,11 +2435,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - /* Set up and sort exception table */ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -2506,10 +2496,14 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); if (find_module(mod->name)) { err = -EEXIST; - /* This will also unlock the mutex */ - goto already_exists; + goto unlock; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto unlock; + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2536,7 +2530,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); - already_exists: + unlock: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.2.3-73-gaa49b From 9bea7f23952d5948f8e5dfdff4de09bb9981fb5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: fix bne2 "gave up waiting for init of module libcrc32c" Problem: it's hard to avoid an init routine stumbling over a request_module these days. And it's not clear it's always a bad idea: for example, a module like kvm with dynamic dependencies on kvm-intel or kvm-amd would be neater if it could simply request_module the right one. In this particular case, it's libcrc32c: libcrc32c_mod_init crypto_alloc_shash crypto_alloc_tfm crypto_find_alg crypto_alg_mod_lookup crypto_larval_lookup request_module If another module is waiting inside resolve_symbol() for libcrc32c to finish initializing (ie. bne2 depends on libcrc32c) then it does so holding the module lock, and our request_module() can't make progress until that is released. Waiting inside resolve_symbol() without the lock isn't all that hard: we just need to pass the -EBUSY up the call chain so we can sleep where we don't hold the lock. Error reporting is a bit trickier: we need to copy the name of the unfinished module before releasing the lock. Other notes: 1) This also fixes a theoretical issue where a weak dependency would allow symbol version mismatches to be ignored. 2) We rename use_module to ref_module to make life easier for the only external user (the out-of-tree ksplice patches). Signed-off-by: Rusty Russell Cc: Linus Torvalds Cc: Tim Abbot Tested-by: Brandon Philips --- kernel/module.c | 91 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f99558e1945a..8c6b42840dd1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -582,33 +582,26 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { int err; - if (b == NULL || already_uses(a, b)) return 1; - - /* If we're interrupted or time out, we fail. 
*/ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; err = add_module_usage(a, b); if (err) { module_put(b); - return 0; + return err; } - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -893,11 +886,11 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); static inline void module_unload_init(struct module *mod) { @@ -1062,26 +1055,58 @@ static inline int same_magic(const char *amagic, const char *bmagic, static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, - struct module *mod) + struct module *mod, + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + const struct kernel_symbol *ksym; + char ownername[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, + mod, ownername)) || + PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, ownername); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet @@ -1638,21 +1663,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(sechdrs, versindex, + strtab + sym[i].st_name, + mod); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, strtab + sym[i].st_name, + PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: -- cgit v1.2.3-73-gaa49b
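One closing note on the idiom the reworked resolve_symbol() above leans on: NULL now means "symbol not found" (acceptable for weak symbols), while an ERR_PTR value carries the reason for failure, and PTR_ERR(ksym) ?: -ENOENT falls back to -ENOENT only when the pointer was plain NULL. A standalone restatement, with the helpers mirroring include/linux/err.h and the GNU "?:" shorthand compiled by gcc just as in the kernel:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline long IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	const void *busy = ERR_PTR(-EBUSY);	/* owner module still initializing */
	const void *missing = NULL;		/* symbol simply absent */
	long ret;

	printf("busy: IS_ERR=%ld PTR_ERR=%ld\n", IS_ERR(busy), PTR_ERR(busy));

	/* PTR_ERR(NULL) is 0, so the "?: -ENOENT" fallback applies: */
	ret = PTR_ERR(missing) ?: -ENOENT;
	printf("missing: ret=%ld\n", ret);
	return 0;
}

That distinction is what lets resolve_symbol_wait() retry only on -EBUSY while every other failure propagates immediately.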