From 3d9a854c2dac3e888393b23ba7adafcce4d6d4b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 20 Feb 2010 01:03:43 +0100 Subject: Rename .data[.percpu][.XXX] to .data[..percpu][..XXX]. Signed-off-by: Denys Vlasenko Signed-off-by: Michal Marek --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee9..5daf0abd63c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -397,7 +397,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) { - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); + return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); } static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -- cgit v1.2.3-73-gaa49b From 54e88fad223c4e1d94289611a90c7fe3ebe5631b Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 25 May 2010 18:53:46 +0530 Subject: sched: Make sure timers have migrated before killing the migration_thread Problem: In a stress test where some heavy tests were running along with regular CPU offlining and onlining, a hang was observed. The system seems to be hung at a point where migration_call() tries to kill the migration_thread of the dying CPU, which just got moved to the current CPU. This migration thread does not get a chance to run (and die) since rt_throttled is set to 1 on current, and it doesn't get cleared as the hrtimer which is supposed to reset the rt bandwidth (sched_rt_period_timer) is tied to the CPU which we just marked dead! Solution: This patch pushes the killing of the migration thread to the "CPU_POST_DEAD" event. By then all the timers (including sched_rt_period_timer) should have been migrated (along with other callbacks). Signed-off-by: Amit Arora Signed-off-by: Gautham R Shenoy Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner LKML-Reference: <20100525132346.GA14986@amitarora.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4e7431e7c78..70f8d90331e9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: { struct cpu_stop_work *work; -- cgit v1.2.3-73-gaa49b From 293a7cfeedc2b2380a7c7274902323c3cf5f7575 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 31 May 2010 19:53:50 +0930 Subject: module: fix reference to mod->percpu after freeing module. Rafael sees a sometimes crash at percpu_modfree from kernel/module.c; it only occurred with another (since-reverted) patch, but that patch simply changed timing to uncover this bug, it was otherwise unrelated. The comment about the mod being freed is self-explanatory, but neither Tejun nor I read it. This bug was introduced in 259354deaa, after it had previously been fixed in 6e2b75740b. How embarrassing. Reported-by: "Rafael J. Wysocki" Signed-off-by: Rusty Russell Embarrassingly-Acked-by: Tejun Heo Cc: Masami Hiramatsu Tested-by: "Rafael J. 
Wysocki" Signed-off-by: Linus Torvalds --- kernel/module.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 333fbcc96978..d806e00e4450 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2014,6 +2014,7 @@ static noinline struct module *load_module(void __user *umod, long err = 0; void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; + void __percpu *percpu; mm_segment_t old_fs; @@ -2158,6 +2159,8 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; } + /* Keep this around for failure path. */ + percpu = mod_percpu(mod); /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2463,7 +2466,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - percpu_modfree(mod); + free_percpu(percpu); free_mod: kfree(args); kfree(strmap); -- cgit v1.2.3-73-gaa49b From e51fd5e22e12b39f49b1bb60b37b300b17378a43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 May 2010 12:37:30 +0200 Subject: sched: Fix wake_affine() vs RT tasks Mike reports that since e9e9250b (sched: Scale down cpu_power due to RT tasks), wake_affine() goes funny on RT tasks due to them still having a !0 weight and wake_affine() still subtracts that from the rq weight. Since nobody should be using se->weight for RT tasks, set the value to zero. Also, since we now use ->cpu_power to normalize rq weights to account for RT cpu usage, add that factor into the imbalance computation. Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1275316109.27810.22969.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 ++++++------------------ kernel/sched_fair.c | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d48408142503..f8b8996228dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -544,6 +544,8 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; @@ -1499,24 +1501,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -7605,6 +7592,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched_fair.c 
b/kernel/sched_fair.c index 217e4a9393e4..eed35eded602 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1252,8 +1251,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1263,9 +1260,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; /* * If the currently running task will sleep within @@ -2298,6 +2307,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; + cpu_rq(cpu)->cpu_power = power; sdg->cpu_power = power; } -- cgit v1.2.3-73-gaa49b From 5c113fbeed7a5a192d8431a768965f8a45c16475 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 12:15:11 +0100 Subject: fix cpu_chain section mismatch... In commit e9fb7631ebcd ("cpu-hotplug: introduce cpu_notify(), __cpu_notify(), cpu_notify_nofail()") the new helper functions access cpu_chain. As a result, it shouldn't be marked __cpuinitdata (via section mismatch warning). Alternatively, the helper functions should be forced inline, or marked __ref or __cpuinit. In the meantime, this patch silences the warning the trivial way. Signed-off-by: Daniel J Blueman Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8b92539b4754..97d1b426a4ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,7 +34,7 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock -- cgit v1.2.3-73-gaa49b From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 14:54:39 +0200 Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface This changes the interface to be based on bytes instead. The API matches that of F_SETPIPE_SZ in that it rounds up the passed in size so that the resulting page array is a power-of-2 in size. The proc file is renamed to /proc/sys/fs/pipe-max-size to reflect this change. 
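To see the resulting byte-based rounding from userspace, here is a minimal sketch; this is a hypothetical test program rather than part of the patch, and it assumes 4 KiB pages plus a kernel and libc that expose F_SETPIPE_SZ/F_GETPIPE_SZ:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	int granted;

	if (pipe(fds) < 0) {
		perror("pipe");
		return 1;
	}

	/* 100000 bytes = 25 pages, rounded up to 32 pages = 131072 bytes. */
	granted = fcntl(fds[0], F_SETPIPE_SZ, 100000);
	if (granted < 0)
		perror("F_SETPIPE_SZ");
	else
		printf("granted %d bytes\n", granted);

	printf("F_GETPIPE_SZ now reports %d bytes\n",
	       fcntl(fds[0], F_GETPIPE_SZ));

	close(fds[0]);
	close(fds[1]);
	return 0;
}

An unprivileged request above the (now byte-valued) pipe-max-size limit fails with EPERM, which is exactly the knob this patch renames.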
Signed-off-by: Jens Axboe --- fs/pipe.c | 54 ++++++++++++++++++++++++++++++++++++----------- include/linux/pipe_fs_i.h | 4 +++- kernel/sysctl.c | 8 +++---- 3 files changed, 49 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/pipe.c b/fs/pipe.c index f98fae3e36b0..69c4c7c13ea9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,9 +26,14 @@ /* * The max size that a non-root user is allowed to grow the pipe. Can - * be set by root in /proc/sys/fs/pipe-max-pages + * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; /* * We use a start+len construction, which provides full use of the @@ -1156,6 +1161,35 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) return nr_pages * PAGE_SIZE; } +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; @@ -1169,23 +1203,19 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: { - unsigned long nr_pages; + unsigned int size, nr_pages; - /* - * Currently the array must be a power-of-2 size, so adjust - * upwards if needed. 
- */ - nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = roundup_pow_of_two(nr_pages); + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; - if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) { + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; - } else if (nr_pages < 1) { + } else if (nr_pages < PAGE_SIZE) { ret = -EINVAL; goto out; } - ret = pipe_set_size(pipe, arg); + ret = pipe_set_size(pipe, nr_pages); break; } case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 16de3933c45e..445796945ac9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_pages; +extern unsigned int pipe_max_size, pipe_min_size; +int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f00e0b..d24f761f4876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3-73-gaa49b From 485d527686850d68a0e9006dd9904f19f122485e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Jun 2010 14:14:58 -0700 Subject: sys_personality: change sys_personality() to accept "unsigned int" instead of u_long task_struct->personality is "unsigned int", but sys_personality() paths use "unsigned long personality". This means that every assignment or comparison is not right. In particular, if this argument does not fit into "unsigned int" __set_personality() changes the caller's personality and then sys_personality() returns -EINVAL. Turn this argument into "unsigned int" and avoid overflows. Obviously, this is a user-visible change, we just ignore the upper bits. But this can't break a sane application. There is another thing which can confuse poorly written applications. User-space thinks that this syscall returns int, not long. This means that the returned value can be negative and look like the error code. But note that libc won't be confused and thus errno won't be set, and with this patch the user-space can never get -1 unless sys_personality() really fails. And, most importantly, the negative RET != -1 is only possible if that app previously called personality(RET). Pointed-out-by: Wenming Zhang Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/personality.h | 2 +- include/linux/syscalls.h | 2 +- kernel/exec_domain.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/personality.h b/include/linux/personality.h index 126120819a0d..eec3bae164d4 100644 --- a/include/linux/personality.h +++ b/include/linux/personality.h @@ -12,7 +12,7 @@ struct pt_regs; extern int register_exec_domain(struct exec_domain *); extern int unregister_exec_domain(struct exec_domain *); -extern int __set_personality(unsigned long); +extern int __set_personality(unsigned int); #endif /* __KERNEL__ */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a1a86a53bc73..7f614ce274a9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -289,7 +289,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr); asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data); -asmlinkage long sys_personality(u_long personality); +asmlinkage long sys_personality(unsigned int personality); asmlinkage long sys_sigpending(old_sigset_t __user *set); asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set, diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452cadded..dd62f8e714ca 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -135,7 +135,7 @@ unregister: } int -__set_personality(u_long personality) +__set_personality(unsigned int personality) { struct exec_domain *ep, *oep; @@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) +SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; if (personality != 0xffffffff) { set_personality(personality); @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality) return -EINVAL; } - return (long)old; + return old; } -- cgit v1.2.3-73-gaa49b From 94b3dd0f7bb393d93e84a173b1df9b8b64c83ac4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 4 Jun 2010 14:15:03 -0700 Subject: cgroups: alloc_css_id() increments hierarchy depth Child groups should have a greater depth than their parents. Prior to this change, the parent would incorrectly report zero memory usage for child cgroups when use_hierarchy is enabled. 
test script: mount -t cgroup none /cgroups -o memory cd /cgroups mkdir cg1 echo 1 > cg1/memory.use_hierarchy mkdir cg1/cg11 echo $$ > cg1/cg11/tasks dd if=/dev/zero of=/tmp/foo bs=1M count=1 echo echo CHILD grep cache cg1/cg11/memory.stat echo echo PARENT grep cache cg1/memory.stat echo $$ > tasks rmdir cg1/cg11 cg1 cd / umount /cgroups With fae9c79 (a recent patch that changed the alloc_css_id() depth computation), the parent incorrectly reports zero usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0151844 s, 69.1 MB/s CHILD cache 1048576 total_cache 1048576 PARENT cache 0 total_cache 0 With this patch, the parent correctly includes child usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0136827 s, 76.6 MB/s CHILD cache 1052672 total_cache 1052672 PARENT cache 0 total_cache 1052672 Signed-off-by: Greg Thelen Acked-by: Paul Menage Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: [2.6.34.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 422cb19f156e..3ac6f5b0a64b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4598,7 +4598,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; parent_id = parent_css->id; - depth = parent_id->depth; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) -- cgit v1.2.3-73-gaa49b From 9e506f7adce8e6165a104d3d78fddd8ff0cdccf8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 4 Jun 2010 14:15:04 -0700 Subject: kernel/: fix BUG_ON checks for cpu notifier callbacks direct call The commit 80b5184cc537718122e036afe7e62d202b70d077 ("kernel/: convert cpu notifier to return encapsulate errno value") changed the return value of cpu notifier callbacks. Those callbacks don't return NOTIFY_BAD on failures anymore. But there are a few callbacks which are called directly at init time and check the return value. I forgot to change the BUG_ON checks in the direct callers in that commit. 
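For context on why the old assertion passes silently: a failing callback now returns a stop-marked value with the errno folded in, which is never equal to the NOTIFY_BAD constant. A standalone sketch; the constants and notifier_from_errno() are transcribed from include/linux/notifier.h of this period, while the demo around them is illustrative only:

#include <errno.h>
#include <stdio.h>

#define NOTIFY_DONE		0x0000
#define NOTIFY_OK		0x0001
#define NOTIFY_STOP_MASK	0x8000
#define NOTIFY_BAD		(NOTIFY_STOP_MASK | 0x0002)

static int notifier_from_errno(int err)
{
	if (err)
		return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
	return NOTIFY_OK;
}

int main(void)
{
	int ret = notifier_from_errno(-ENOMEM);	/* e.g. a failed CPU_UP_PREPARE */

	printf("callback returned 0x%x\n", ret);	/* 0x800d, not 0x8002 */
	printf("old check (ret == NOTIFY_BAD): %d\n", ret == NOTIFY_BAD);
	printf("new check (ret != NOTIFY_OK): %d\n", ret != NOTIFY_OK);
	return 0;
}

Hence the replacements below assert err != NOTIFY_OK rather than comparing against NOTIFY_BAD.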
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- kernel/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 825e1126008f..07b4f1b1a73a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/timer.c b/kernel/timer.c index 2454172a80d3..ee305c8d4e18 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1717,7 +1717,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3-73-gaa49b From 2c02dfe7fe3fba97a5665d329d039d2415ea5607 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 31 May 2010 12:19:37 -0700 Subject: module: Make the 'usage' lists be two-way When adding a module that depends on another one, we used to create a one-way list of "modules_which_use_me", so that module unloading could see who needs a module. It's actually quite simple to make that list go both ways: so that we not only can see "who uses me", but also see a list of modules that are "used by me". In fact, we always wanted that list in "module_unload_free()": when we unload a module, we want to also release all the other modules that are used by that module. But because we didn't have that list, we used to first iterate over all modules, and then iterate over each "used by me" list of that module. By making the list two-way, we simplify module_unload_free(), and it allows for some trivial fixes later too. Signed-off-by: Linus Torvalds Signed-off-by: Rusty Russell (cleaned & rebased) --- include/linux/module.h | 4 ++- kernel/module.c | 79 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 6914fcad4673..680db9e2ac36 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -359,7 +359,9 @@ struct module #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ - struct list_head modules_which_use_me; + struct list_head source_list; + /* What modules do I depend on? */ + struct list_head target_list; /* Who is waiting for us to be unloaded */ struct task_struct *waiter; diff --git a/kernel/module.c b/kernel/module.c index 0129769301e3..be18c3e34684 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -523,7 +523,8 @@ static void module_unload_init(struct module *mod) { int cpu; - INIT_LIST_HEAD(&mod->modules_which_use_me); + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); for_each_possible_cpu(cpu) { per_cpu_ptr(mod->refptr, cpu)->incs = 0; per_cpu_ptr(mod->refptr, cpu)->decs = 0; @@ -538,8 +539,9 @@ static void module_unload_init(struct module *mod) /* modules using other modules */ struct module_use { - struct list_head list; - struct module *module_which_uses; + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; }; /* Does a already use b? 
*/ @@ -547,8 +549,8 @@ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -557,6 +559,33 @@ static int already_uses(struct module *a, struct module *b) return 0; } +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) +{ + int no_warn; + struct module_use *use; + + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); + return 0; +} + /* Module a uses b */ int use_module(struct module *a, struct module *b) { @@ -578,17 +607,11 @@ int use_module(struct module *a, struct module *b) if (err) return 0; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); return 0; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } EXPORT_SYMBOL_GPL(use_module); @@ -596,22 +619,16 @@ EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) { - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; + struct module_use *use, *tmp; - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); + sysfs_remove_link(i->holders_dir, mod->name); } } @@ -735,7 +752,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -799,9 +816,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. 
*/ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { -- cgit v1.2.3-73-gaa49b From c8e21ced08b39ef8dfe7236fb2a923a95f645262 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:35 -0600 Subject: module: fix kdb's illicit use of struct module_use. Linus changed the structure, and luckily this didn't compile any more. Reported-by: Stephen Rothwell Signed-off-by: Rusty Russell Cc: Jason Wessel Cc: Martin Hicks --- include/linux/module.h | 7 +++++++ kernel/debug/kdb/kdb_main.c | 12 +++--------- kernel/module.c | 11 +---------- 3 files changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 680db9e2ac36..5d8fca5dcff5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -181,6 +181,13 @@ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ((typeof(&x))(__symbol_get(MODULE_SYMBOL_PREFIX #x))) +/* modules using other modules: kdb wants to see this. */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + #ifndef __GENKSYMS__ #ifdef CONFIG_MODVERSIONS /* Mark the CRC weak since genksyms apparently decides not to diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b724c791b6d4..184cd8209c36 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1857,12 +1857,6 @@ static int kdb_ef(int argc, const char **argv) } #if defined(CONFIG_MODULES) -/* modules using other modules */ -struct module_use { - struct list_head list; - struct module *module_which_uses; -}; - /* * kdb_lsmod - This function implements the 'lsmod' command. Lists * currently loaded kernel modules. @@ -1894,9 +1888,9 @@ static int kdb_lsmod(int argc, const char **argv) { struct module_use *use; kdb_printf(" [ "); - list_for_each_entry(use, &mod->modules_which_use_me, - list) - kdb_printf("%s ", use->module_which_uses->name); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); kdb_printf("]\n"); } #endif diff --git a/kernel/module.c b/kernel/module.c index be18c3e34684..bbb1d812c79c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -536,14 +536,6 @@ static void module_unload_init(struct module *mod) mod->waiter = current; } -/* modules using other modules */ -struct module_use -{ - struct list_head source_list; - struct list_head target_list; - struct module *source, *target; -}; - /* Does a already use b? */ static int already_uses(struct module *a, struct module *b) { @@ -589,8 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) /* Module a uses b */ int use_module(struct module *a, struct module *b) { - struct module_use *use; - int no_warn, err; + int err; if (b == NULL || already_uses(a, b)) return 1; -- cgit v1.2.3-73-gaa49b From 80a3d1bb410e000e176931a076cdf19a1e89a955 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move sysfs exposure to end of load_module This means a little extra work, but is more logical: we don't put anything in sysfs until we're about to put the module into the global list an parse its parameters. 
This also gives us a logical place to put duplicate module detection in the next patch. Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbb1d812c79c..c690d9885797 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -560,7 +560,6 @@ static int already_uses(struct module *a, struct module *b) */ static int add_module_usage(struct module *a, struct module *b) { - int no_warn; struct module_use *use; DEBUGP("Allocating new usage for %s.\n", a->name); @@ -574,7 +573,6 @@ static int add_module_usage(struct module *a, struct module *b) use->target = b; list_add(&use->source_list, &b->source_list); list_add(&use->target_list, &a->target_list); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 0; } @@ -619,7 +617,6 @@ static void module_unload_free(struct module *mod) list_del(&use->source_list); list_del(&use->target_list); kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); } } @@ -1303,6 +1300,29 @@ static inline void remove_notes_attrs(struct module *mod) #endif #ifdef CONFIG_SYSFS +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } +#endif +} + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); +#endif +} + int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; @@ -1385,6 +1405,10 @@ int mod_sysfs_setup(struct module *mod, { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1399,6 +1423,8 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1408,6 +1434,7 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } @@ -1422,10 +1449,15 @@ static void mod_sysfs_fini(struct module *mod) { } +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -2242,11 +2274,6 @@ static noinline struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); - /* add kobject, so we can reference it. 
*/ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2443,6 +2470,7 @@ static noinline struct module *load_module(void __user *umod, err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2461,9 +2489,6 @@ static noinline struct module *load_module(void __user *umod, module_arch_cleanup(mod); cleanup: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) free_percpu(mod->refptr); -- cgit v1.2.3-73-gaa49b From 6407ebb271fc34440b306f305e1efb7685eece26 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: Make module sysfs functions private. These were placed in the header in ef665c1a06 to get the various SYSFS/MODULE config combinations to compile. That may have been necessary then, but it's not now. These functions are all local to module.c. Signed-off-by: Rusty Russell Cc: Randy Dunlap --- include/linux/module.h | 33 --------------------------------- kernel/module.c | 29 +++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 5d8fca5dcff5..8a6b9fdc7ffa 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -672,43 +672,10 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) #endif /* CONFIG_MODULES */ -struct device_driver; #ifdef CONFIG_SYSFS -struct module; - extern struct kset *module_kset; extern struct kobj_type module_ktype; extern int module_sysfs_initialized; - -int mod_sysfs_init(struct module *mod); -int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params); -int module_add_modinfo_attrs(struct module *mod); -void module_remove_modinfo_attrs(struct module *mod); - -#else /* !CONFIG_SYSFS */ - -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} - -static inline int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) -{ } - #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) diff --git a/kernel/module.c b/kernel/module.c index c690d9885797..808aa18dd661 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1323,7 +1323,7 @@ static void del_usage_links(struct module *mod) #endif } -int module_add_modinfo_attrs(struct module *mod) +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1349,7 +1349,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1365,7 +1365,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1399,7 +1399,7 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static 
int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { @@ -1445,6 +1445,27 @@ static void mod_sysfs_fini(struct module *mod) #else /* CONFIG_SYSFS */ +static inline int mod_sysfs_init(struct module *mod) +{ + return 0; +} + +static inline int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline int module_add_modinfo_attrs(struct module *mod) +{ + return 0; +} + +static inline void module_remove_modinfo_attrs(struct module *mod) +{ +} + static void mod_sysfs_fini(struct module *mod) { } -- cgit v1.2.3-73-gaa49b From 75676500f8298f0ee89db12db97294883c4b768e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: make locking more fine-grained. Kay Sievers reports that we still have some contention over module loading which is slowing boot. Linus also disliked a previous "drop lock and regrab" patch to fix the bne2 "gave up waiting for init of module libcrc32c" message. This is more ambitious: we only grab the lock where we need it. Signed-off-by: Rusty Russell Cc: Brandon Philips Cc: Kay Sievers Cc: Linus Torvalds --- kernel/module.c | 65 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 808aa18dd661..d293c213c22c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -72,7 +72,11 @@ /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); @@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. 
*/ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -576,7 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) return 0; } -/* Module a uses b */ +/* Module a uses b: caller needs module_mutex() */ int use_module(struct module *a, struct module *b) { int err; @@ -610,6 +615,7 @@ static void module_unload_free(struct module *mod) { struct module_use *use, *tmp; + mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; DEBUGP("%s unusing %s\n", mod->name, i->name); @@ -618,6 +624,7 @@ static void module_unload_free(struct module *mod) list_del(&use->target_list); kfree(use); } + mutex_unlock(&module_mutex); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -784,13 +791,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); return ret; } @@ -1006,6 +1014,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -1048,8 +1058,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, @@ -1059,6 +1068,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, const struct kernel_symbol *sym; const unsigned long *crc; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); /* use_module can fail due to OOM, @@ -1068,6 +1078,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, || !use_module(mod, owner)) sym = NULL; } + mutex_unlock(&module_mutex); return sym; } @@ -1306,10 +1317,12 @@ static void add_usage_links(struct module *mod) struct module_use *use; int nowarn; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) { nowarn = sysfs_create_link(use->target->holders_dir, &mod->mkobj.kobj, mod->name); } + mutex_unlock(&module_mutex); #endif } @@ -1318,8 +1331,10 @@ static void del_usage_links(struct module *mod) #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); #endif } @@ -1497,13 +1512,15 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -1575,7 +1592,14 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + const struct kernel_symbol *sym; + + /* Stopping preemption makes find_symbol safe. */ + preempt_disable(); + sym = find_symbol(s->name, &owner, NULL, true, false); + preempt_enable(); + + if (sym) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2021,11 +2045,13 @@ static void *module_alloc_update_bounds(unsigned long size) void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. */ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } @@ -2482,7 +2508,9 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) @@ -2504,8 +2532,10 @@ static noinline struct module *load_module(void __user *umod, return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); cleanup: @@ -2556,19 +2586,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2585,9 +2606,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } -- cgit v1.2.3-73-gaa49b From 3bafeb6247042dcbb72b0141ec7c7107de9f0b99 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move find_module check to end I think Rusty may have made the lock a bit _too_ finegrained there, and didn't add it to some places that needed it. It looks, for example, like PATCH 1/2 actually drops the lock in places where it's needed ("find_module()" is documented to need it, but now load_module() didn't hold it at all when it did the find_module()). Rather than adding a new "module_loading" list, I think we should be able to just use the existing "modules" list, and just fix up the locking a bit. 
In fact, maybe we could just move the "look up existing module" a bit later - optimistically assuming that the module doesn't exist, and then just undoing the work if it turns out that we were wrong, just before adding ourselves to the list. Signed-off-by: Rusty Russell --- kernel/module.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d293c213c22c..28450047852a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2226,11 +2226,6 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; } - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - mod->state = MODULE_STATE_COMING; /* Allow arches to frob section contents and sizes. */ @@ -2509,6 +2504,12 @@ static noinline struct module *load_module(void __user *umod, * The mutex protects against concurrent writers. */ mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + /* This will also unlock the mutex */ + goto already_exists; + } + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2535,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + already_exists: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.2.3-73-gaa49b From be593f4ce4eb1bd40e38fdc403371f149f6f12eb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: verify_export_symbols under the lock It disabled preempt so it was "safe", but nothing stops another module slipping in before this module is added to the global list now we don't hold the lock the whole time. So we check this just after we check for duplicate modules, and just before we put the module in the global list. (find_symbol finds symbols in coming and going modules, too). Signed-off-by: Rusty Russell --- kernel/module.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 28450047852a..f99558e1945a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1571,6 +1571,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. */ static int verify_export_symbols(struct module *mod) { @@ -1592,14 +1594,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - const struct kernel_symbol *sym; - - /* Stopping preemption makes find_symbol safe. 
*/ - preempt_disable(); - sym = find_symbol(s->name, &owner, NULL, true, false); - preempt_enable(); - - if (sym) { + if (find_symbol(s->name, &owner, NULL, true, false)) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2440,11 +2435,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - /* Set up and sort exception table */ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -2506,10 +2496,14 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); if (find_module(mod->name)) { err = -EEXIST; - /* This will also unlock the mutex */ - goto already_exists; + goto unlock; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto unlock; + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2536,7 +2530,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); - already_exists: + unlock: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.2.3-73-gaa49b From 9bea7f23952d5948f8e5dfdff4de09bb9981fb5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: fix bne2 "gave up waiting for init of module libcrc32c" Problem: it's hard to avoid an init routine stumbling over a request_module these days. And it's not clear it's always a bad idea: for example, a module like kvm with dynamic dependencies on kvm-intel or kvm-amd would be neater if it could simply request_module the right one. In this particular case, it's libcrc32c: libcrc32c_mod_init crypto_alloc_shash crypto_alloc_tfm crypto_find_alg crypto_alg_mod_lookup crypto_larval_lookup request_module If another module is waiting inside resolve_symbol() for libcrc32c to finish initializing (ie. bne2 depends on libcrc32c) then it does so holding the module lock, and our request_module() can't make progress until that is released. Waiting inside resolve_symbol() without the lock isn't all that hard: we just need to pass the -EBUSY up the call chain so we can sleep where we don't hold the lock. Error reporting is a bit trickier: we need to copy the name of the unfinished module before releasing the lock. Other notes: 1) This also fixes a theoretical issue where a weak dependency would allow symbol version mismatches to be ignored. 2) We rename use_module to ref_module to make life easier for the only external user (the out-of-tree ksplice patches). Signed-off-by: Rusty Russell Cc: Linus Torvalds Cc: Tim Abbot Tested-by: Brandon Philips --- kernel/module.c | 91 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f99558e1945a..8c6b42840dd1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -582,33 +582,26 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { int err; - if (b == NULL || already_uses(a, b)) return 1; - - /* If we're interrupted or time out, we fail. 
*/ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; err = add_module_usage(a, b); if (err) { module_put(b); - return 0; + return err; } - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -893,11 +886,11 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); static inline void module_unload_init(struct module *mod) { @@ -1062,26 +1055,58 @@ static inline int same_magic(const char *amagic, const char *bmagic, static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, - struct module *mod) + struct module *mod, + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + const struct kernel_symbol *ksym; + char ownername[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, + mod, ownername)) || + PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, ownername); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet @@ -1638,21 +1663,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(sechdrs, versindex, + strtab + sym[i].st_name, + mod); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, strtab + sym[i].st_name, + PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: -- cgit v1.2.3-73-gaa49b
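One closing note on the idiom the reworked resolve_symbol() above leans on: NULL now means "symbol not found" (acceptable for weak symbols), while an ERR_PTR value carries the reason for failure, and PTR_ERR(ksym) ?: -ENOENT falls back to -ENOENT only when the pointer was plain NULL. A standalone restatement, with the helpers mirroring include/linux/err.h and the GNU "?:" shorthand compiled by gcc just as in the kernel:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline long IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	const void *busy = ERR_PTR(-EBUSY);	/* owner module still initializing */
	const void *missing = NULL;		/* symbol simply absent */
	long ret;

	printf("busy: IS_ERR=%ld PTR_ERR=%ld\n", IS_ERR(busy), PTR_ERR(busy));

	/* PTR_ERR(NULL) is 0, so the "?: -ENOENT" fallback applies: */
	ret = PTR_ERR(missing) ?: -ENOENT;
	printf("missing: ret=%ld\n", ret);
	return 0;
}

That distinction is what lets resolve_symbol_wait() retry only on -EBUSY while every other failure propagates immediately.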