Diffstat (limited to 'kernel')
74 files changed, 3738 insertions, 2307 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0b46a5dff4c0..c64ce9c14207 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY "explicit preemption points" to the kernel code. These new preemption points have been selected to reduce the maximum latency of rescheduling, providing faster application reactions, - at the cost of slighly lower throughput. + at the cost of slightly lower throughput. This allows reaction to interactive events by allowing a low priority process to voluntarily preempt itself even if it @@ -43,7 +43,7 @@ config PREEMPT even if it is in kernel mode executing a system call and would otherwise not be about to reach a natural preemption point. This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slighly lower throughput + system is under load, at the cost of slightly lower throughput and a slight runtime overhead to kernel code. Select this if you are building a kernel for a desktop or diff --git a/kernel/Makefile b/kernel/Makefile index ac6b27abb1ad..642d4277c2ea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/audit.c b/kernel/audit.c index 4e9d20829681..d13276d41410 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) err = -EPERM; break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... 
AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; diff --git a/kernel/audit.h b/kernel/audit.h index a3370232a390..815d6f5c04ee 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -83,6 +83,7 @@ struct audit_krule { u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; + struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct list_head rlist; /* entry in audit_watch.rules list */ @@ -131,17 +132,19 @@ extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, extern int selinux_audit_rule_update(void); #ifdef CONFIG_AUDITSYSCALL -extern void __audit_signal_info(int sig, struct task_struct *t); -static inline void audit_signal_info(int sig, struct task_struct *t) +extern int __audit_signal_info(int sig, struct task_struct *t); +static inline int audit_signal_info(int sig, struct task_struct *t) { - if (unlikely(audit_pid && t->tgid == audit_pid)) - __audit_signal_info(sig, t); + if (unlikely((audit_pid && t->tgid == audit_pid) || + (audit_signals && !audit_dummy_context()))) + return __audit_signal_info(sig, t); + return 0; } extern enum audit_state audit_filter_inodes(struct task_struct *, struct audit_context *); extern void audit_set_auditable(struct audit_context *); #else -#define audit_signal_info(s,t) +#define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED #define audit_set_auditable(c) #endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3749193aed8c..74cc0fc6bb81 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -311,6 +311,45 @@ int audit_match_class(int class, unsigned syscall) return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); } +#ifdef CONFIG_AUDITSYSCALL +static inline int audit_match_class_bits(int class, u32 *mask) +{ + int i; + + if (classes[class]) { + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (mask[i] & classes[class][i]) + return 0; + } + return 1; +} + +static int audit_match_signal(struct audit_entry *entry) +{ + struct audit_field *arch = entry->rule.arch_f; + + if (!arch) { + /* When arch is unspecified, we must check both masks on biarch + * as syscall number alone is ambiguous. */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, + entry->rule.mask) && + audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, + entry->rule.mask)); + } + + switch(audit_classify_arch(arch->val)) { + case 0: /* native */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, + entry->rule.mask)); + case 1: /* 32bit on biarch */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, + entry->rule.mask)); + default: + return 1; + } +} +#endif + /* Common user-space to kernel rule translation. 
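The audit_match_signal()/audit_match_class_bits() pair added above decides whether a rule can ever fire for a signal-class syscall by intersecting the rule's per-word syscall bitmap with the class bitmap. A minimal standalone sketch of that test (userspace C with a hypothetical word count and values, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define BITMASK_WORDS 4			/* stands in for AUDIT_BITMASK_SIZE */

/* Returns 1 when the rule's syscall bitmap shares no bits with the class
 * bitmap -- the same word-by-word test audit_match_class_bits() makes. */
static int mask_disjoint(const uint32_t *rule_mask, const uint32_t *class_mask)
{
	int i;

	for (i = 0; i < BITMASK_WORDS; i++)
		if (rule_mask[i] & class_mask[i])
			return 0;	/* overlap: rule covers a class syscall */
	return 1;
}

int main(void)
{
	uint32_t rule[BITMASK_WORDS]  = { 0x10, 0, 0, 0 };	/* hypothetical */
	uint32_t klass[BITMASK_WORDS] = { 0x30, 0, 0, 0 };	/* hypothetical */

	/* 0x10 & 0x30 != 0, so the masks overlap and 0 is printed */
	printf("disjoint=%d\n", mask_disjoint(rule, klass));
	return 0;
}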
*/ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -429,6 +468,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) err = -EINVAL; goto exit_free; } + entry->rule.arch_f = f; break; case AUDIT_PERM: if (f->val & ~15) @@ -519,7 +559,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_FSGID: case AUDIT_LOGINUID: case AUDIT_PERS: - case AUDIT_ARCH: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: @@ -531,6 +570,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG2: case AUDIT_ARG3: break; + case AUDIT_ARCH: + entry->rule.arch_f = f; + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -1221,6 +1263,9 @@ static inline int audit_add_rule(struct audit_entry *entry, #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules++; + + if (!audit_match_signal(entry)) + audit_signals++; #endif mutex_unlock(&audit_filter_mutex); @@ -1294,6 +1339,9 @@ static inline int audit_del_rule(struct audit_entry *entry, #ifdef CONFIG_AUDITSYSCALL if (!dont_count) audit_n_rules--; + + if (!audit_match_signal(entry)) + audit_signals--; #endif mutex_unlock(&audit_filter_mutex); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 628c7ac590a0..e36481ed61b4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -78,17 +78,15 @@ extern int audit_enabled; * for saving names from getname(). */ #define AUDIT_NAMES 20 -/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the - * audit_context from being used for nameless inodes from - * path_lookup. */ -#define AUDIT_NAMES_RESERVED 7 - /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 /* number of audit rules */ int audit_n_rules; +/* determines whether we collect data for signals sent */ +int audit_signals; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -114,6 +112,9 @@ struct audit_aux_data { #define AUDIT_AUX_IPCPERM 0 +/* Number of target pids per aux struct. */ +#define AUDIT_AUX_PIDS 16 + struct audit_aux_data_mq_open { struct audit_aux_data d; int oflag; @@ -181,6 +182,13 @@ struct audit_aux_data_path { struct vfsmount *mnt; }; +struct audit_aux_data_pids { + struct audit_aux_data d; + pid_t target_pid[AUDIT_AUX_PIDS]; + u32 target_sid[AUDIT_AUX_PIDS]; + int pid_count; +}; + /* The per-task audit context. 
*/ struct audit_context { int dummy; /* must be the first element */ @@ -201,6 +209,7 @@ struct audit_context { struct vfsmount * pwdmnt; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; + struct audit_aux_data *aux_pids; /* Save things to print about task_struct */ pid_t pid, ppid; @@ -209,6 +218,9 @@ struct audit_context { unsigned long personality; int arch; + pid_t target_pid; + u32 target_sid; + #if AUDIT_DEBUG int put_count; int ino_count; @@ -654,6 +666,10 @@ static inline void audit_free_aux(struct audit_context *context) context->aux = aux->next; kfree(aux); } + while ((aux = context->aux_pids)) { + context->aux_pids = aux->next; + kfree(aux); + } } static inline void audit_zero_context(struct audit_context *context, @@ -795,6 +811,29 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk audit_log_task_context(ab); } +static int audit_log_pid_context(struct audit_context *context, pid_t pid, + u32 sid) +{ + struct audit_buffer *ab; + char *s = NULL; + u32 len; + int rc = 0; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); + if (!ab) + return 1; + + if (selinux_sid_to_string(sid, &s, &len)) { + audit_log_format(ab, "opid=%d obj=(none)", pid); + rc = 1; + } else + audit_log_format(ab, "opid=%d obj=%s", pid, s); + audit_log_end(ab); + kfree(s); + + return rc; +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -973,6 +1012,21 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } + for (aux = context->aux_pids; aux; aux = aux->next) { + struct audit_aux_data_pids *axs = (void *)aux; + int i; + + for (i = 0; i < axs->pid_count; i++) + if (audit_log_pid_context(context, axs->target_pid[i], + axs->target_sid[i])) + call_panic = 1; + } + + if (context->target_pid && + audit_log_pid_context(context, context->target_pid, + context->target_sid)) + call_panic = 1; + if (context->pwd && context->pwdmnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { @@ -1193,6 +1247,10 @@ void audit_syscall_exit(int valid, long return_code) } else { audit_free_names(context); audit_free_aux(context); + context->aux = NULL; + context->aux_pids = NULL; + context->target_pid = 0; + context->target_sid = 0; kfree(context->filterkey); context->filterkey = NULL; tsk->audit_context = context; @@ -1226,6 +1284,7 @@ void __audit_getname(const char *name) context->names[context->name_count].name_len = AUDIT_NAME_FULL; context->names[context->name_count].name_put = 1; context->names[context->name_count].ino = (unsigned long)-1; + context->names[context->name_count].osid = 0; ++context->name_count; if (!context->pwd) { read_lock(¤t->fs->lock); @@ -1279,6 +1338,28 @@ void audit_putname(const char *name) #endif } +static int audit_inc_name_count(struct audit_context *context, + const struct inode *inode) +{ + if (context->name_count >= AUDIT_NAMES) { + if (inode) + printk(KERN_DEBUG "name_count maxed, losing inode data: " + "dev=%02x:%02x, inode=%lu", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino); + + else + printk(KERN_DEBUG "name_count maxed, losing inode data"); + return 1; + } + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return 0; +} + /* Copy inode data into an audit_names. 
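The audit_aux_data_pids layout used above (defined earlier in this hunk and walked by audit_log_exit()) optimizes for the common case of a single signal recipient: the first target lives inline in the audit_context, and only further targets spill into chained blocks of AUDIT_AUX_PIDS entries. A standalone sketch of that storage pattern, with hypothetical names and plain userspace allocation:

#include <stdio.h>
#include <stdlib.h>

#define AUX_PIDS 16			/* mirrors AUDIT_AUX_PIDS */

struct aux_pids {
	struct aux_pids *next;
	int pid[AUX_PIDS];
	int count;
};

struct ctx {
	int first_pid;			/* fast path: first recipient inline */
	struct aux_pids *aux;		/* overflow chain */
};

static int record_target(struct ctx *c, int pid)
{
	struct aux_pids *b = c->aux;

	if (!c->first_pid) {		/* common case: one signal target */
		c->first_pid = pid;
		return 0;
	}
	if (!b || b->count == AUX_PIDS) {
		b = calloc(1, sizeof(*b));
		if (!b)
			return -1;
		b->next = c->aux;
		c->aux = b;
	}
	b->pid[b->count++] = pid;
	return 0;
}

int main(void)
{
	struct ctx c = { 0, NULL };
	int blocks = 0;

	for (int pid = 100; pid < 120; pid++)	/* 20 targets */
		record_target(&c, pid);
	for (struct aux_pids *b = c.aux; b; b = b->next)
		blocks++;
	printf("first=%d overflow_blocks=%d\n", c.first_pid, blocks);
	return 0;	/* prints first=100 overflow_blocks=2 */
}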
*/ static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { @@ -1316,13 +1397,10 @@ void __audit_inode(const char *name, const struct inode *inode) else { /* FIXME: how much do we care about inodes that have no * associated name? */ - if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED) + if (audit_inc_name_count(context, inode)) return; - idx = context->name_count++; + idx = context->name_count - 1; context->names[idx].name = NULL; -#if AUDIT_DEBUG - ++context->ino_count; -#endif } audit_copy_inode(&context->names[idx], inode); } @@ -1346,7 +1424,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, { int idx; struct audit_context *context = current->audit_context; - const char *found_name = NULL; + const char *found_parent = NULL, *found_child = NULL; int dirlen = 0; if (!context->in_syscall) @@ -1354,88 +1432,73 @@ void __audit_inode_child(const char *dname, const struct inode *inode, /* determine matching parent */ if (!dname) - goto update_context; - for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == parent->i_ino) { - const char *name = context->names[idx].name; + goto add_names; - if (!name) - continue; + /* parent is more likely, look for it first */ + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; - if (audit_compare_dname_path(dname, name, &dirlen) == 0) { - context->names[idx].name_len = dirlen; - found_name = name; - break; - } + if (!n->name) + continue; + + if (n->ino == parent->i_ino && + !audit_compare_dname_path(dname, n->name, &dirlen)) { + n->name_len = dirlen; /* update parent data in place */ + found_parent = n->name; + goto add_names; } + } -update_context: - idx = context->name_count; - if (context->name_count == AUDIT_NAMES) { - printk(KERN_DEBUG "name_count maxed and losing %s\n", - found_name ?: "(null)"); - return; + /* no matching parent, look for matching child */ + for (idx = 0; idx < context->name_count; idx++) { + struct audit_names *n = &context->names[idx]; + + if (!n->name) + continue; + + /* strcmp() is the more likely scenario */ + if (!strcmp(dname, n->name) || + !audit_compare_dname_path(dname, n->name, &dirlen)) { + if (inode) + audit_copy_inode(n, inode); + else + n->ino = (unsigned long)-1; + found_child = n->name; + goto add_names; + } } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - /* Re-use the name belonging to the slot for a matching parent directory. - * All names for this context are relinquished in audit_free_names() */ - context->names[idx].name = found_name; - context->names[idx].name_len = AUDIT_NAME_FULL; - context->names[idx].name_put = 0; /* don't call __putname() */ - - if (!inode) - context->names[idx].ino = (unsigned long)-1; - else - audit_copy_inode(&context->names[idx], inode); - - /* A parent was not found in audit_names, so copy the inode data for the - * provided parent. 
*/ - if (!found_name) { - idx = context->name_count; - if (context->name_count == AUDIT_NAMES) { - printk(KERN_DEBUG - "name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu", - MAJOR(parent->i_sb->s_dev), - MINOR(parent->i_sb->s_dev), - parent->i_ino); + +add_names: + if (!found_parent) { + if (audit_inc_name_count(context, parent)) return; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif + idx = context->name_count - 1; + context->names[idx].name = NULL; audit_copy_inode(&context->names[idx], parent); } -} -/** - * audit_inode_update - update inode info for last collected name - * @inode: inode being audited - * - * When open() is called on an existing object with the O_CREAT flag, the inode - * data audit initially collects is incorrect. This additional hook ensures - * audit has the inode data for the actual object to be opened. - */ -void __audit_inode_update(const struct inode *inode) -{ - struct audit_context *context = current->audit_context; - int idx; + if (!found_child) { + if (audit_inc_name_count(context, inode)) + return; + idx = context->name_count - 1; - if (!context->in_syscall || !inode) - return; + /* Re-use the name belonging to the slot for a matching parent + * directory. All names for this context are relinquished in + * audit_free_names() */ + if (found_parent) { + context->names[idx].name = found_parent; + context->names[idx].name_len = AUDIT_NAME_FULL; + /* don't call __putname() */ + context->names[idx].name_put = 0; + } else { + context->names[idx].name = NULL; + } - if (context->name_count == 0) { - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif + if (inode) + audit_copy_inode(&context->names[idx], inode); + else + context->names[idx].ino = (unsigned long)-1; } - idx = context->name_count - 1; - - audit_copy_inode(&context->names[idx], inode); } /** @@ -1880,6 +1943,14 @@ int audit_sockaddr(int len, void *a) return 0; } +void __audit_ptrace(struct task_struct *t) +{ + struct audit_context *context = current->audit_context; + + context->target_pid = t->pid; + selinux_get_task_sid(t, &context->target_sid); +} + /** * audit_avc_path - record the granting or denial of permissions * @dentry: dentry to record @@ -1918,15 +1989,17 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. 
*/ -void __audit_signal_info(int sig, struct task_struct *t) +int __audit_signal_info(int sig, struct task_struct *t) { + struct audit_aux_data_pids *axp; + struct task_struct *tsk = current; + struct audit_context *ctx = tsk->audit_context; extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; extern u32 audit_sig_sid; - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { - struct task_struct *tsk = current; - struct audit_context *ctx = tsk->audit_context; + if (audit_pid && t->tgid == audit_pid && + (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1)) { audit_sig_pid = tsk->pid; if (ctx) audit_sig_uid = ctx->loginuid; @@ -1934,4 +2007,72 @@ void __audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = tsk->uid; selinux_get_task_sid(tsk, &audit_sig_sid); } + + if (!audit_signals) /* audit_context checked in wrapper */ + return 0; + + /* optimize the common case by putting first signal recipient directly + * in audit_context */ + if (!ctx->target_pid) { + ctx->target_pid = t->tgid; + selinux_get_task_sid(t, &ctx->target_sid); + return 0; + } + + axp = (void *)ctx->aux_pids; + if (!axp || axp->pid_count == AUDIT_AUX_PIDS) { + axp = kzalloc(sizeof(*axp), GFP_ATOMIC); + if (!axp) + return -ENOMEM; + + axp->d.type = AUDIT_OBJ_PID; + axp->d.next = ctx->aux_pids; + ctx->aux_pids = (void *)axp; + } + BUG_ON(axp->pid_count > AUDIT_AUX_PIDS); + + axp->target_pid[axp->pid_count] = t->tgid; + selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); + axp->pid_count++; + + return 0; +} + +/** + * audit_core_dumps - record information about processes that end abnormally + * @sig: signal value + * + * If a process ends with a core dump, something fishy is going on and we + * should record the event for investigation. + */ +void audit_core_dumps(long signr) +{ + struct audit_buffer *ab; + u32 sid; + + if (!audit_enabled) + return; + + if (signr == SIGQUIT) /* don't care for those */ + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + audit_log_format(ab, "auid=%u uid=%u gid=%u", + audit_get_loginuid(current->audit_context), + current->uid, current->gid); + selinux_get_task_sid(current, &sid); + if (sid) { + char *ctx = NULL; + u32 len; + + if (selinux_sid_to_string(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else + audit_log_format(ab, " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " sig=%ld", signr); + audit_log_end(ab); } diff --git a/kernel/compat.c b/kernel/compat.c index cebb4c28c039..3bae3742c2aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -475,8 +475,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, return min_length; } -static int get_compat_itimerspec(struct itimerspec *dst, - struct compat_itimerspec __user *src) +int get_compat_itimerspec(struct itimerspec *dst, + const struct compat_itimerspec __user *src) { if (get_compat_timespec(&dst->it_interval, &src->it_interval) || get_compat_timespec(&dst->it_value, &src->it_value)) @@ -484,8 +484,8 @@ static int get_compat_itimerspec(struct itimerspec *dst, return 0; } -static int put_compat_itimerspec(struct compat_itimerspec __user *dst, - struct itimerspec *src) +int put_compat_itimerspec(struct compat_itimerspec __user *dst, + const struct itimerspec *src) { if (put_compat_timespec(&src->it_interval, &dst->it_interval) || put_compat_timespec(&src->it_value, &dst->it_value)) diff --git a/kernel/configs.c b/kernel/configs.c index 
8fa1fb28f8a7..e84d3f9c6c7b 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -61,18 +61,9 @@ static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { - loff_t pos = *offset; - ssize_t count; - - if (pos >= kernel_config_data_size) - return 0; - - count = min(len, (size_t)(kernel_config_data_size - pos)); - if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) - return -EFAULT; - - *offset += count; - return count; + return simple_read_from_buffer(buf, len, offset, + kernel_config_data + MAGIC_SIZE, + kernel_config_data_size); } static const struct file_operations ikconfig_file_ops = { diff --git a/kernel/cpu.c b/kernel/cpu.c index 36e70845cfc3..208cf3497c10 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu) (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %lx) \n", + (state = %ld, flags = %x) \n", p->comm, p->pid, cpu, p->state, p->flags); } write_unlock_irq(&tasklist_lock); @@ -120,11 +120,13 @@ static int take_cpu_down(void *unused) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { - int err; + int err, nr_calls = 0; struct task_struct *p; cpumask_t old_allowed, tmp; + void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; if (num_online_cpus() == 1) return -EBUSY; @@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, - (void *)(long)cpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, + hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, + hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __FUNCTION__, cpu); - return -EINVAL; + err = -EINVAL; + goto out_release; } /* Ensure that we are not runnable on dying cpu */ @@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu) if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, - (void *)(long)cpu) == NOTIFY_BAD) + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, + hcpu) == NOTIFY_BAD) BUG(); if (IS_ERR(p)) { @@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu) /* This actually kills the CPU. */ __cpu_die(cpu); - /* Move it here so it can run. */ - kthread_bind(p, get_cpu()); - put_cpu(); - /* CPU is completely dead: tell everyone. Too late to complain. 
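The kernel/configs.c hunk above drops hand-rolled offset/length bookkeeping in ikconfig_read_current() in favour of simple_read_from_buffer(). The arithmetic being replaced is the usual bounded read-at-offset pattern; a userspace sketch of it (hypothetical names, plain memcpy standing in for copy_to_user):

#include <stdio.h>
#include <string.h>

/* Copy at most `len` bytes of a `size`-byte buffer starting at *pos,
 * and advance *pos -- the bookkeeping simple_read_from_buffer() hides. */
static size_t read_from_buffer(char *dst, size_t len, long *pos,
			       const char *src, size_t size)
{
	if (*pos >= (long)size)
		return 0;			/* EOF */
	if (len > size - *pos)
		len = size - *pos;		/* clamp to what is left */
	memcpy(dst, src + *pos, len);
	*pos += len;
	return len;
}

int main(void)
{
	const char data[] = "CONFIG_EXAMPLE=y\n";	/* hypothetical payload */
	char out[8];
	long pos = 0;
	size_t n;

	while ((n = read_from_buffer(out, sizeof(out), &pos, data, sizeof(data) - 1)))
		fwrite(out, 1, n, stdout);
	return 0;
}

The cpuset_tasks_read() change later in this diff is the same substitution of the helper for open-coded bounds handling.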
*/ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, - (void *)(long)cpu) == NOTIFY_BAD) + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, + hcpu) == NOTIFY_BAD) BUG(); check_for_tasks(cpu); @@ -185,6 +187,8 @@ out_thread: err = kthread_stop(p); out_allowed: set_cpus_allowed(current, old_allowed); +out_release: + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return err; } @@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu) if (cpu_hotplug_disabled) err = -EBUSY; else - err = _cpu_down(cpu); + err = _cpu_down(cpu, 0); mutex_unlock(&cpu_add_remove_lock); return err; @@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu) #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu) +static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) { - int ret; + int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, + -1, &nr_calls); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); out_notify: if (ret != 0) - raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED, hcpu); + __raw_notifier_call_chain(&cpu_chain, + CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); return ret; } @@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu) if (cpu_hotplug_disabled) err = -EBUSY; else - err = _cpu_up(cpu); + err = _cpu_up(cpu, 0); mutex_unlock(&cpu_add_remove_lock); return err; } #ifdef CONFIG_SUSPEND_SMP -/* Needed to prevent the microcode driver from requesting firmware in its CPU - * hotplug notifier during the suspend/resume. 
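The _cpu_down()/_cpu_up() changes above switch to __raw_notifier_call_chain() so the hotplug path learns how many callbacks ran (nr_calls) and, when a PREPARE step fails, delivers the CANCELED/FAILED event only to those. A standalone sketch of that partial-rollback idea (hypothetical callback type and event values; not the kernel notifier API):

#include <stdio.h>

#define EV_PREPARE  1
#define EV_CANCELED 2

typedef int (*callback_t)(int event);

static int call_chain(callback_t *cbs, int nr_to_call, int event, int *nr_calls)
{
	int i;

	for (i = 0; i < nr_to_call; i++) {
		if (nr_calls)
			*nr_calls = i + 1;
		if (cbs[i](event))
			return -1;	/* stop at the first failing callback */
	}
	return 0;
}

static int ok(int event)    { (void)event; return 0; }
static int third(int event) { return event == EV_PREPARE; }	/* fails PREPARE */

int main(void)
{
	callback_t chain[] = { ok, ok, third, ok };
	int nr_calls = 0;

	if (call_chain(chain, 4, EV_PREPARE, &nr_calls))
		/* roll back only the nr_calls callbacks that actually ran */
		call_chain(chain, nr_calls, EV_CANCELED, NULL);

	printf("rolled back %d callbacks\n", nr_calls);	/* prints 3 */
	return 0;
}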
- */ -int suspend_cpu_hotplug; -EXPORT_SYMBOL(suspend_cpu_hotplug); - static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -267,7 +269,6 @@ int disable_nonboot_cpus(void) int cpu, first_cpu, error = 0; mutex_lock(&cpu_add_remove_lock); - suspend_cpu_hotplug = 1; first_cpu = first_cpu(cpu_online_map); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -277,7 +278,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; - error = _cpu_down(cpu); + error = _cpu_down(cpu, 1); if (!error) { cpu_set(cpu, frozen_cpus); printk("CPU%d is down\n", cpu); @@ -294,7 +295,6 @@ int disable_nonboot_cpus(void) } else { printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } - suspend_cpu_hotplug = 0; mutex_unlock(&cpu_add_remove_lock); return error; } @@ -309,10 +309,9 @@ void enable_nonboot_cpus(void) if (cpus_empty(frozen_cpus)) goto out; - suspend_cpu_hotplug = 1; printk("Enabling non-boot CPUs ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { - error = _cpu_up(cpu); + error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); continue; @@ -320,7 +319,6 @@ void enable_nonboot_cpus(void) printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } cpus_clear(frozen_cpus); - suspend_cpu_hotplug = 0; out: mutex_unlock(&cpu_add_remove_lock); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d240349cbf0f..f57854b08922 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -42,7 +42,6 @@ #include <linux/seq_file.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/stat.h> #include <linux/string.h> @@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = cpulist_parse(buf, trialcs.cpus_allowed); - if (retval < 0) - return retval; + + /* + * We allow a cpuset's cpus_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + cpus_clear(trialcs.cpus_allowed); + } else { + retval = cpulist_parse(buf, trialcs.cpus_allowed); + if (retval < 0) + return retval; + } cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); - if (cpus_empty(trialcs.cpus_allowed)) + /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ + if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval < 0) @@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; + + /* + * We allow a cpuset's mems_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + nodes_clear(trialcs.mems_allowed); + } else { + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + goto done; + } nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - if (nodes_empty(trialcs.mems_allowed)) { + /* mems_allowed cannot be empty for a cpuset with attached tasks. 
*/ + if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; } @@ -1751,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf, { struct ctr_struct *ctr = file->private_data; - if (*ppos + nbytes > ctr->bufsz) - nbytes = ctr->bufsz - *ppos; - if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) - return -EFAULT; - *ppos += nbytes; - return nbytes; + return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); } static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) @@ -2200,10 +2216,6 @@ void cpuset_fork(struct task_struct *child) * it is holding that mutex while calling check_for_release(), * which calls kmalloc(), so can't be called holding callback_mutex(). * - * We don't need to task_lock() this reference to tsk->cpuset, - * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it, or task is a failed fork, never visible to attach_task. - * * the_top_cpuset_hack: * * Set the exiting tasks cpuset to the root cpuset (top_cpuset). @@ -2242,8 +2254,10 @@ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; + task_lock(current); cs = tsk->cpuset; tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ + task_unlock(current); if (notify_on_release(cs)) { char *pathbuf = NULL; diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c new file mode 100644 index 000000000000..0d98827887a7 --- /dev/null +++ b/kernel/die_notifier.c @@ -0,0 +1,38 @@ + +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> +#include <linux/kdebug.h> + + +static ATOMIC_NOTIFIER_HEAD(die_chain); + +int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + return atomic_notifier_call_chain(&die_chain, val, &args); +} + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); + + diff --git a/kernel/exit.c b/kernel/exit.c index 92369240d91d..c6d14b8008dd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -7,7 +7,6 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> @@ -25,8 +24,10 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> +#include <linux/signalfd.h> #include <linux/mount.h> #include <linux/proc_fs.h> +#include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> @@ -42,6 +43,7 @@ #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/blkdev.h> +#include <linux/task_io_accounting_ops.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -82,6 +84,14 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); + /* + * Notify that this sighand has been detached. This must + * be called with the tsk->sighand lock held. Also, this + * access tsk->sighand internally, so it must be called + * before tsk->sighand is reset. 
+ */ + signalfd_detach_locked(tsk); + posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); @@ -113,6 +123,8 @@ static void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); sig = NULL; /* Marker for below. */ } @@ -255,26 +267,25 @@ static int has_stopped_jobs(struct pid *pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. + * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. + * it ever exits, it should generally reparent itself to kthreadd so it + * isn't in the way of other processes and is correctly cleaned up on exit. * * The various task state such as scheduling policy and priority may have * been inherited from a user process, so we reset them to sane values here. * - * NOTE that reparent_to_init() gives the caller full capabilities. + * NOTE that reparent_to_kthreadd() gives the caller full capabilities. */ -static void reparent_to_init(void) +static void reparent_to_kthreadd(void) { write_lock_irq(&tasklist_lock); ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper(current); - current->real_parent = child_reaper(current); + current->real_parent = current->parent = kthreadd_task; add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -300,12 +311,12 @@ void __set_special_pids(pid_t session, pid_t pgrp) if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); set_signal_session(curr->signal, session); - attach_pid(curr, PIDTYPE_SID, session); + attach_pid(curr, PIDTYPE_SID, find_pid(session)); } if (process_group(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); curr->signal->pgrp = pgrp; - attach_pid(curr, PIDTYPE_PGID, pgrp); + attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); } } @@ -348,7 +359,7 @@ int disallow_signal(int sig) return -EINVAL; spin_lock_irq(¤t->sighand->siglock); - sigaddset(¤t->blocked, sig); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); return 0; @@ -401,7 +412,7 @@ void daemonize(const char *name, ...) 
current->files = init_task.files; atomic_inc(¤t->files->count); - reparent_to_init(); + reparent_to_kthreadd(); } EXPORT_SYMBOL(daemonize); @@ -1194,6 +1205,12 @@ static int wait_task_zombie(struct task_struct *p, int noreap, p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; + psig->cinblock += + task_io_get_inblock(p) + + sig->inblock + sig->cinblock; + psig->coublock += + task_io_get_oublock(p) + + sig->oublock + sig->coublock; spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index b7d169def942..87069cfc18a1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -14,7 +14,6 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/unistd.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> @@ -106,7 +105,7 @@ static struct kmem_cache *mm_cachep; void free_task(struct task_struct *tsk) { - free_thread_info(tsk->thread_info); + free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } @@ -176,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } *tsk = *orig; - tsk->thread_info = ti; + tsk->stack = ti; setup_thread_stack(tsk, orig); #ifdef CONFIG_CC_STACKPROTECTOR @@ -876,6 +875,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; sig->sched_time = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -956,7 +956,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr, - int pid) + struct pid *pid) { int retval; struct task_struct *p = NULL; @@ -1023,7 +1023,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); - p->pid = pid; + p->pid = pid_nr(pid); retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) @@ -1252,13 +1252,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); set_signal_session(p->signal, process_session(current)); - attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, process_session(p)); + attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } - attach_pid(p, PIDTYPE_PID, p->pid); + attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } @@ -1322,7 +1322,8 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, + &init_struct_pid); if (!IS_ERR(task)) init_idle(task, cpu); @@ -1372,7 +1373,7 @@ long do_fork(unsigned long clone_flags, clone_flags |= CLONE_PTRACE; } - p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr); + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); /* * Do this prior waking up 
the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1421,12 +1422,13 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) +static void sighand_ctor(void *data, struct kmem_cache *cachep, + unsigned long flags) { struct sighand_struct *sighand = data; - if (flags & SLAB_CTOR_CONSTRUCTOR) - spin_lock_init(&sighand->siglock); + spin_lock_init(&sighand->siglock); + INIT_LIST_HEAD(&sighand->signalfd_list); } void __init proc_caches_init(void) @@ -1452,7 +1454,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } - /* * Check constraints on flags passed to the unshare system call and * force unsharing of additional process context as appropriate. @@ -1516,26 +1517,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the mnt_namespace structure if it is being shared - */ -static int unshare_mnt_namespace(unsigned long unshare_flags, - struct mnt_namespace **new_nsp, struct fs_struct *new_fs) -{ - struct mnt_namespace *ns = current->nsproxy->mnt_ns; - - if ((unshare_flags & CLONE_NEWNS) && ns) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); - if (!*new_nsp) - return -ENOMEM; - } - - return 0; -} - -/* * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) @@ -1593,16 +1574,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } -#ifndef CONFIG_IPC_NS -static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) -{ - if (flags & CLONE_NEWIPC) - return -EINVAL; - - return 0; -} -#endif - /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. 
copy_* @@ -1615,14 +1586,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct mnt_namespace *ns, *new_ns = NULL; struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; - struct uts_namespace *uts, *new_uts = NULL; - struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); @@ -1637,36 +1605,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) - goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_ns; + goto bad_unshare_cleanup_fs; if ((err = unshare_vm(unshare_flags, &new_mm))) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; - if ((err = unshare_utsname(unshare_flags, &new_uts))) + if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_fs))) goto bad_unshare_cleanup_semundo; - if ((err = unshare_ipcs(unshare_flags, &new_ipc))) - goto bad_unshare_cleanup_uts; - - if (new_ns || new_uts || new_ipc) { - old_nsproxy = current->nsproxy; - new_nsproxy = dup_namespaces(old_nsproxy); - if (!new_nsproxy) { - err = -ENOMEM; - goto bad_unshare_cleanup_ipc; - } - } - if (new_fs || new_ns || new_mm || new_fd || new_ulist || - new_uts || new_ipc) { + if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { task_lock(current); if (new_nsproxy) { + old_nsproxy = current->nsproxy; current->nsproxy = new_nsproxy; new_nsproxy = old_nsproxy; } @@ -1677,12 +1633,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fs = fs; } - if (new_ns) { - ns = current->nsproxy->mnt_ns; - current->nsproxy->mnt_ns = new_ns; - new_ns = ns; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1698,32 +1648,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } - if (new_uts) { - uts = current->nsproxy->uts_ns; - current->nsproxy->uts_ns = new_uts; - new_uts = uts; - } - - if (new_ipc) { - ipc = current->nsproxy->ipc_ns; - current->nsproxy->ipc_ns = new_ipc; - new_ipc = ipc; - } - task_unlock(current); } if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_ipc: - if (new_ipc) - put_ipc_ns(new_ipc); - -bad_unshare_cleanup_uts: - if (new_uts) - put_uts_ns(new_uts); - bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) @@ -1738,10 +1668,6 @@ bad_unshare_cleanup_sigh: if (atomic_dec_and_test(&new_sigh->count)) kmem_cache_free(sighand_cachep, new_sigh); -bad_unshare_cleanup_ns: - if (new_ns) - put_mnt_ns(new_ns); - bad_unshare_cleanup_fs: if (new_fs) put_fs_struct(new_fs); diff --git a/kernel/futex.c b/kernel/futex.c index 5a270b5e3f95..b7ce15c67e32 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -16,6 +16,9 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood 
for proof-of-concept implementation. @@ -48,37 +51,18 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <linux/module.h> #include <asm/futex.h> #include "rtmutex_common.h" -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif -/* - * Futexes are matched on equal values of this key. - * The key type depends on whether it's a shared or private mapping. - * Don't rearrange members without looking at hash_futex(). - * - * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. - * We set bit 0 to indicate if it's an inode-based key. - */ -union futex_key { - struct { - unsigned long pgoff; - struct inode *inode; - int offset; - } shared; - struct { - unsigned long address; - struct mm_struct *mm; - int offset; - } private; - struct { - unsigned long word; - void *ptr; - int offset; - } both; -}; +#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* * Priority Inheritance state: @@ -106,12 +90,12 @@ struct futex_pi_state { * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then * wake up q->waiters, then make the second condition true. */ struct futex_q { - struct list_head list; + struct plist_node list; wait_queue_head_t waiters; /* Which hash list lock to use: */ @@ -127,14 +111,20 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; + + /* + * This waiter is used in case of requeue from a + * normal futex to a PI-futex + */ + struct rt_mutex_waiter waiter; }; /* * Split the global futex_lock into every hash list lock. */ struct futex_hash_bucket { - spinlock_t lock; - struct list_head chain; + spinlock_t lock; + struct plist_head chain; }; static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; @@ -163,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Get parameters which are the keys for a futex. +/** + * get_futex_key - Get parameters which are the keys for a futex. + * @uaddr: virtual address of the futex + * @shared: NULL for a PROCESS_PRIVATE futex, + * ¤t->mm->mmap_sem for a PROCESS_SHARED futex + * @key: address where result is stored. + * + * Returns a negative error code or 0 + * The key words are stored in *key on success. * * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * - * Returns: 0, or negative error code. - * The key words are stored in *key on success. - * - * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. + * fshared is NULL for PROCESS_PRIVATE futexes + * For other futexes, it points to ¤t->mm->mmap_sem and + * caller must have taken the reader lock. but NOT any spinlocks. 
*/ -static int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, + union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -187,11 +184,25 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((key->both.offset % sizeof(u32)) != 0)) + if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; /* + * PROCESS_PRIVATE futexes are fast. + * As the mm cannot disappear under us and the 'key' only needs + * virtual address, we dont even have to find the underlying vma. + * Note : We do have to check 'uaddr' is a valid user address, + * but access_ok() should be faster than find_vma() + */ + if (!fshared) { + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + return -EFAULT; + key->private.mm = mm; + key->private.address = address; + return 0; + } + /* * The futex is hashed differently depending on whether * it's in a shared or private mapping. So check vma first. */ @@ -205,6 +216,9 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; + /* Save the user address in the ley */ + key->uaddr = uaddr; + /* * Private mappings are handled in a simple way. * @@ -215,6 +229,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) * mappings of _writable_ handles. */ if (likely(!(vma->vm_flags & VM_MAYSHARE))) { + key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ key->private.mm = mm; key->private.address = address; return 0; @@ -224,7 +239,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) * Linear file mappings are also simple. */ key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ + key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); @@ -246,37 +261,46 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) } return err; } +EXPORT_SYMBOL_GPL(get_futex_key); /* * Take a reference to the resource addressed by a key. * Can be called while holding spinlocks. * - * NOTE: mmap_sem MUST be held between get_futex_key() and calling this - * function, if it is called at all. mmap_sem keeps key->shared.inode valid. */ -static inline void get_key_refs(union futex_key *key) +inline void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr != 0) { - if (key->both.offset & 1) + if (key->both.ptr == 0) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: atomic_inc(&key->shared.inode->i_count); - else + break; + case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); + break; } } +EXPORT_SYMBOL_GPL(get_futex_key_refs); /* * Drop a reference to the resource addressed by a key. * The hash bucket spinlock must not be held. 
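get_futex_key() above replaces the old "bit 0 means inode key" convention with explicit FUT_OFF_INODE and FUT_OFF_MMSHARED tags: keys that pin an mm get FUT_OFF_MMSHARED, inode-backed keys get FUT_OFF_INODE, and the PROCESS_PRIVATE fast path leaves both bits clear so get/drop_futex_key_refs() take no reference at all. This works because the futex offset within its page is u32-aligned, leaving the two low bits free. A standalone sketch of the encoding (illustrative flag values, not the kernel's definitions):

#include <stdio.h>

#define OFF_INODE    0x1	/* stands in for FUT_OFF_INODE */
#define OFF_MMSHARED 0x2	/* stands in for FUT_OFF_MMSHARED */

int main(void)
{
	/* A futex offset is a multiple of sizeof(u32), so bits 0-1 are free. */
	unsigned long address = 0x7f0012345678UL;	/* hypothetical, aligned */
	unsigned long offset = (address % 4096) | OFF_MMSHARED;

	switch (offset & (OFF_INODE | OFF_MMSHARED)) {
	case OFF_INODE:
		printf("inode-based key: pin the inode\n");
		break;
	case OFF_MMSHARED:
		printf("mm-based key: pin the mm\n");
		break;
	default:
		printf("process-private key: nothing to pin\n");
	}
	return 0;
}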
*/ -static void drop_key_refs(union futex_key *key) +void drop_futex_key_refs(union futex_key *key) { - if (key->both.ptr != 0) { - if (key->both.offset & 1) + if (key->both.ptr == 0) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: iput(key->shared.inode); - else + break; + case FUT_OFF_MMSHARED: mmdrop(key->private.mm); + break; } } +EXPORT_SYMBOL_GPL(drop_futex_key_refs); static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { @@ -290,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) } /* - * Fault handling. Called with current->mm->mmap_sem held. + * Fault handling. + * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, int attempt) +static int futex_handle_fault(unsigned long address, + struct rw_semaphore *fshared, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; + int ret = -EFAULT; - if (attempt > 2 || !(vma = find_vma(mm, address)) || - vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) - return -EFAULT; + if (attempt > 2) + return ret; - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: - return -EFAULT; + if (!fshared) + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + if (vma && address >= vma->vm_start && + (vma->vm_flags & VM_WRITE)) { + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + ret = 0; + current->min_flt++; + break; + case VM_FAULT_MAJOR: + ret = 0; + current->maj_flt++; + break; + } } - return 0; + if (!fshared) + up_read(&mm->mmap_sem); + return ret; } /* @@ -461,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr) } static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; - struct list_head *head; + struct plist_head *head; struct task_struct *p; pid_t pid; head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { - if (match_futex(&this->key, &me->key)) { + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -487,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) WARN_ON(!atomic_read(&pi_state->refcount)); atomic_inc(&pi_state->refcount); - me->pi_state = pi_state; + *ps = pi_state; return 0; } @@ -514,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); /* Store the key for possible exit cleanups: */ - pi_state->key = me->key; + pi_state->key = *key; spin_lock_irq(&p->pi_lock); WARN_ON(!list_empty(&pi_state->list)); @@ -524,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) put_task_struct(p); - me->pi_state = pi_state; + *ps = pi_state; return 0; } @@ -535,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) */ static void wake_futex(struct futex_q *q) { - list_del_init(&q->list); + plist_del(&q->list, &q->list.plist); if (q->filp) send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the - * list_del_init() 
and also before assigning to q->lock_ptr. + * plist_del() and also before assigning to q->lock_ptr. */ wake_up_all(&q->waiters); /* @@ -584,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) */ if (!(uval & FUTEX_OWNER_DIED)) { newval = FUTEX_WAITERS | new_owner->pid; + /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ + newval |= (uval & FUTEX_WAITER_REQUEUED); pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -651,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, + int nr_wake) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct list_head *head; + struct plist_head *head; union futex_key key; int ret; - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -669,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) spin_lock(&hb->lock); head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { if (this->pi_state) { ret = -EINVAL; @@ -683,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) spin_unlock(&hb->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); + return ret; +} + +/* + * Called from futex_requeue_pi. + * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the + * PI-futex value; search its associated pi_state if an owner exist + * or create a new one without owner. + */ +static inline int +lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **pi_state) +{ + u32 curval, uval, newval; + +retry: + /* + * We can't handle a fault cleanly because we can't + * release the locks here. Simply return the fault. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */ + if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) + != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) { + /* + * No waiters yet, we prepare the futex to have some waiters. 
+ */ + + uval = curval; + newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + } + + if (!(curval & FUTEX_TID_MASK) + || lookup_pi_state(curval, hb, key, pi_state)) { + /* the futex has no owner (yet) or the lookup failed: + allocate one pi_state without owner */ + + *pi_state = alloc_pi_state(); + + /* Already stores the key: */ + (*pi_state)->key = *key; + + /* init the mutex without owner */ + __rt_mutex_init(&(*pi_state)->pi_mutex, NULL); + } + + return 0; +} + +/* + * Keep the first nr_wake waiter from futex1, wake up one, + * and requeue the next nr_requeue waiters following hashed on + * one physical page to another physical page (PI-futex uaddr2) + */ +static int futex_requeue_pi(u32 __user *uaddr1, + struct rw_semaphore *fshared, + u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval) +{ + union futex_key key1, key2; + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head1; + struct futex_q *this, *next; + struct futex_pi_state *pi_state2 = NULL; + struct rt_mutex_waiter *waiter, *top_waiter = NULL; + struct rt_mutex *lock2 = NULL; + int ret, drop_count = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + +retry: + /* + * First take all the futex related locks: + */ + if (fshared) + down_read(fshared); + + ret = get_futex_key(uaddr1, fshared, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, fshared, &key2); + if (unlikely(ret != 0)) + goto out; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { + u32 curval; + + ret = get_futex_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); + + /* + * If we would have faulted, release mmap_sem, fault + * it in and start all over again. + */ + if (fshared) + up_read(fshared); + + ret = get_user(curval, uaddr1); + + if (!ret) + goto retry; + + return ret; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } + + head1 = &hb1->chain; + plist_for_each_entry_safe(this, next, head1, list) { + if (!match_futex (&this->key, &key1)) + continue; + if (++ret <= nr_wake) { + wake_futex(this); + } else { + /* + * FIRST: get and set the pi_state + */ + if (!pi_state2) { + int s; + /* do this only the first time we requeue someone */ + s = lookup_pi_state_for_requeue(uaddr2, hb2, + &key2, &pi_state2); + if (s) { + ret = s; + goto out_unlock; + } + + lock2 = &pi_state2->pi_mutex; + spin_lock(&lock2->wait_lock); + + /* Save the top waiter of the wait_list */ + if (rt_mutex_has_waiters(lock2)) + top_waiter = rt_mutex_top_waiter(lock2); + } else + atomic_inc(&pi_state2->refcount); + + + this->pi_state = pi_state2; + + /* + * SECOND: requeue futex_q to the correct hashbucket + */ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. 
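futex_requeue_pi() above calls double_lock_hb() on the two hash buckets, whose body is not part of this hunk; the usual way such a helper avoids ABBA deadlocks is to take the two bucket locks in a fixed (address) order. A standalone pthread sketch of that discipline (hypothetical types, and only an assumption about how the kernel helper orders its locks):

#include <pthread.h>
#include <stdint.h>

struct bucket {
	pthread_mutex_t lock;
};

/* Take both bucket locks in address order so two threads requeueing between
 * the same pair of buckets can never deadlock on each other. */
static void double_lock(struct bucket *a, struct bucket *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock(struct bucket *a, struct bucket *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct bucket b1 = { PTHREAD_MUTEX_INITIALIZER };
	struct bucket b2 = { PTHREAD_MUTEX_INITIALIZER };

	double_lock(&b1, &b2);
	double_unlock(&b1, &b2);
	return 0;
}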
+ */ + if (likely(head1 != &hb2->chain)) { + plist_del(&this->list, &hb1->chain); + plist_add(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + this->list.plist.lock = &hb2->lock; +#endif + } + this->key = key2; + get_futex_key_refs(&key2); + drop_count++; + + + /* + * THIRD: queue it to lock2 + */ + spin_lock_irq(&this->task->pi_lock); + waiter = &this->waiter; + waiter->task = this->task; + waiter->lock = lock2; + plist_node_init(&waiter->list_entry, this->task->prio); + plist_node_init(&waiter->pi_list_entry, this->task->prio); + plist_add(&waiter->list_entry, &lock2->wait_list); + this->task->pi_blocked_on = waiter; + spin_unlock_irq(&this->task->pi_lock); + + if (ret - nr_wake >= nr_requeue) + break; + } + } + + /* If we've requeued some tasks and the top_waiter of the rt_mutex + has changed, we must adjust the priority of the owner, if any */ + if (drop_count) { + struct task_struct *owner = rt_mutex_owner(lock2); + if (owner && + (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) { + int chain_walk = 0; + + spin_lock_irq(&owner->pi_lock); + if (top_waiter) + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + else + /* + * There was no waiters before the requeue, + * the flag must be updated + */ + mark_rt_mutex_waiters(lock2); + + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + chain_walk = 1; + get_task_struct(owner); + } + + spin_unlock_irq(&owner->pi_lock); + spin_unlock(&lock2->wait_lock); + + if (chain_walk) + rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL, + current); + } else { + /* No owner or the top_waiter does not change */ + mark_rt_mutex_waiters(lock2); + spin_unlock(&lock2->wait_lock); + } + } + +out_unlock: + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); + + /* drop_futex_key_refs() must be called outside the spinlocks. */ + while (--drop_count >= 0) + drop_futex_key_refs(&key1); + +out: + if (fshared) + up_read(fshared); return ret; } @@ -692,22 +985,24 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1, key2; struct futex_hash_bucket *hb1, *hb2; - struct list_head *head; + struct plist_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; retryfull: - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -747,11 +1042,10 @@ retry: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr2, - attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr2, + fshared, attempt); + if (ret) goto out; - } goto retry; } @@ -759,7 +1053,8 @@ retry: * If we would have faulted, release mmap_sem, * fault it in and start all over again. 
*/ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(dummy, uaddr2); if (ret) @@ -770,7 +1065,7 @@ retry: head = &hb1->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { wake_futex(this); if (++ret >= nr_wake) @@ -782,7 +1077,7 @@ retry: head = &hb2->chain; op_ret = 0; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { wake_futex(this); if (++op_ret >= nr_wake2) @@ -796,7 +1091,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -804,22 +1100,24 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, + u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; struct futex_hash_bucket *hb1, *hb2; - struct list_head *head1; + struct plist_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; retry: - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr1, &key1); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, &key2); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; @@ -842,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(curval, uaddr1); @@ -858,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, } head1 = &hb1->chain; - list_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) continue; if (++ret <= nr_wake) { @@ -869,11 +1168,15 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, * requeue. */ if (likely(head1 != &hb2->chain)) { - list_move_tail(&this->list, &hb2->chain); + plist_del(&this->list, &hb1->chain); + plist_add(&this->list, &hb2->chain); this->lock_ptr = &hb2->lock; - } +#ifdef CONFIG_DEBUG_PI_LIST + this->list.plist.lock = &hb2->lock; +#endif + } this->key = key2; - get_key_refs(&key2); + get_futex_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) @@ -886,12 +1189,13 @@ out_unlock: if (hb1 != hb2) spin_unlock(&hb2->lock); - /* drop_key_refs() must be called outside the spinlocks. */ + /* drop_futex_key_refs() must be called outside the spinlocks. */ while (--drop_count >= 0) - drop_key_refs(&key1); + drop_futex_key_refs(&key1); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } @@ -906,7 +1210,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); - get_key_refs(&q->key); + get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -916,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { - list_add_tail(&q->list, &hb->chain); + int prio; + + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. 
threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. + */ + prio = min(current->normal_prio, MAX_RT_PRIO); + + plist_node_init(&q->list, prio); +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); } @@ -925,7 +1245,7 @@ static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } /* @@ -971,8 +1291,8 @@ static int unqueue_me(struct futex_q *q) spin_unlock(lock_ptr); goto retry; } - WARN_ON(list_empty(&q->list)); - list_del(&q->list); + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); BUG_ON(q->pi_state); @@ -980,29 +1300,94 @@ static int unqueue_me(struct futex_q *q) ret = 1; } - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); return ret; } /* * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock is held on entry and dropped here. + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry + * and dropped here. */ -static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +static void unqueue_me_pi(struct futex_q *q) { - WARN_ON(list_empty(&q->list)); - list_del(&q->list); + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); q->pi_state = NULL; - spin_unlock(&hb->lock); + spin_unlock(q->lock_ptr); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +/* + * Fixup the pi_state owner with current. + * + * The cur->mm semaphore must be held, it is released at return of this + * function. + */ +static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, + struct futex_q *q, + struct futex_hash_bucket *hb, + struct task_struct *curr) +{ + u32 newtid = curr->pid | FUTEX_WAITERS; + struct futex_pi_state *pi_state = q->pi_state; + u32 uval, curval, newval; + int ret; + + /* Owner died? */ + if (pi_state->owner != NULL) { + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + pi_state->owner = curr; + + spin_lock_irq(&curr->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &curr->pi_state_list); + spin_unlock_irq(&curr->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(q); + if (fshared) + up_read(fshared); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. 
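
Stepping back to the plist-based __queue_me() above, the queueing policy it implements can be condensed as follows (kernel-style sketch, assuming linux/plist.h): real-time waiters are ordered by their priority, everything else shares MAX_RT_PRIO and therefore stays FIFO among itself.

#include <linux/kernel.h>
#include <linux/plist.h>
#include <linux/sched.h>

static void queue_by_priority(struct plist_node *node, struct plist_head *chain)
{
	/* RT tasks use their real priority; non-RT tasks all collapse to
	 * MAX_RT_PRIO, so they are woken after the RT waiters, in FIFO order */
	int prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(node, prio);
	plist_add(node, chain);
}
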
+ */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + newval |= (uval & FUTEX_WAITER_REQUEUED); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + return ret; +} + +/* + * In case we must use restart_block to restart a futex_wait, + * we encode in the 'arg3' shared capability + */ +#define ARG3_SHARED 1 + +static long futex_wait_restart(struct restart_block *restart); +static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, + u32 val, ktime_t *abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1010,12 +1395,15 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) struct futex_q q; u32 uval; int ret; + struct hrtimer_sleeper t, *to = NULL; + int rem = 0; q.pi_state = NULL; retry: - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1038,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. * - * We hold the mmap semaphore, so the mapping cannot have changed - * since we looked it up in get_futex_key. + * for shared futexes, we hold the mmap semaphore, so the mapping + * cannot have changed since we looked it up in get_futex_key. */ ret = get_futex_value_locked(&uval, uaddr); @@ -1050,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); @@ -1062,6 +1451,14 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) if (uval != val) goto out_unlock_release_sem; + /* + * This rt_mutex_waiter structure is prepared here and will + * be used only if this task is requeued from a normal futex to + * a PI-futex with futex_requeue_pi. + */ + debug_rt_mutex_init_waiter(&q.waiter); + q.waiter.task = NULL; + /* Only actually queue if *uaddr contained val. */ __queue_me(&q, hb); @@ -1069,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); /* * There might have been scheduling since the queue_me(), as we @@ -1084,11 +1482,34 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&q.waiters, &wait); /* - * !list_empty() is safe here without any lock. + * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (likely(!list_empty(&q.list))) - time = schedule_timeout(time); + if (likely(!plist_node_empty(&q.list))) { + if (!abs_time) + schedule(); + else { + to = &t; + hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(&t, current); + t.timer.expires = *abs_time; + + hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + + /* + * the timer could have already expired, in which + * case current would be flagged for rescheduling. + * Don't bother calling schedule. 
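
The timed path of futex_wait() above boils down to the following hrtimer_sleeper pattern (a condensed kernel-style sketch of the same steps, not additional code in the patch):

#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sleep until the absolute CLOCK_MONOTONIC expiry; returns non-zero if the
 * timeout fired before we were woken. */
static int sleep_until(ktime_t *abs_time)
{
	struct hrtimer_sleeper t;

	hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	hrtimer_init_sleeper(&t, current);
	t.timer.expires = *abs_time;

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
	if (t.task)			/* not already expired */
		schedule();
	hrtimer_cancel(&t.timer);
	__set_current_state(TASK_RUNNING);

	return t.task == NULL;
}
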
+ */ + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + + /* Flag if a timeout occured */ + rem = (t.task == NULL); + } + } __set_current_state(TASK_RUNNING); /* @@ -1096,62 +1517,203 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * we are the only user of it. */ + if (q.pi_state) { + /* + * We were woken but have been requeued on a PI-futex. + * We have to complete the lock acquisition by taking + * the rtmutex. + */ + + struct rt_mutex *lock = &q.pi_state->pi_mutex; + + spin_lock(&lock->wait_lock); + if (unlikely(q.waiter.task)) { + remove_waiter(lock, &q.waiter); + } + spin_unlock(&lock->wait_lock); + + if (rem) + ret = -ETIMEDOUT; + else + ret = rt_mutex_timed_lock(lock, to, 1); + + if (fshared) + down_read(fshared); + spin_lock(q.lock_ptr); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + /* + * We MUST play with the futex we were requeued on, + * NOT the current futex. + * We can retrieve it from the key of the pi_state + */ + uaddr = q.pi_state->key.uaddr; + + /* mmap_sem and hash_bucket lock are unlocked at + return of this function */ + ret = fixup_pi_state_owner(uaddr, fshared, + &q, hb, curr); + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + if (fshared) + up_read(fshared); + } + + debug_rt_mutex_free_waiter(&q.waiter); + + return ret; + } + + debug_rt_mutex_free_waiter(&q.waiter); + /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time == 0) + if (rem) return -ETIMEDOUT; + /* * We expect signal_pending(current), but another thread may * have handled it for us already. 
*/ - return -EINTR; + if (!abs_time) + return -ERESTARTSYS; + else { + struct restart_block *restart; + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->arg0 = (unsigned long)uaddr; + restart->arg1 = (unsigned long)val; + restart->arg2 = (unsigned long)abs_time; + restart->arg3 = 0; + if (fshared) + restart->arg3 |= ARG3_SHARED; + return -ERESTART_RESTARTBLOCK; + } out_unlock_release_sem: queue_unlock(&q, hb); out_release_sem: - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; } + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->arg0; + u32 val = (u32)restart->arg1; + ktime_t *abs_time = (ktime_t *)restart->arg2; + struct rw_semaphore *fshared = NULL; + + restart->fn = do_no_restart_syscall; + if (restart->arg3 & ARG3_SHARED) + fshared = ¤t->mm->mmap_sem; + return (long)futex_wait(uaddr, fshared, val, abs_time); +} + + +static void set_pi_futex_owner(struct futex_hash_bucket *hb, + union futex_key *key, struct task_struct *p) +{ + struct plist_head *head; + struct futex_q *this, *next; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex *lock; + + /* Search a waiter that should already exists */ + + head = &hb->chain; + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, key)) { + pi_state = this->pi_state; + break; + } + } + + BUG_ON(!pi_state); + + /* set p as pi_state's owner */ + lock = &pi_state->pi_mutex; + + spin_lock(&lock->wait_lock); + spin_lock_irq(&p->pi_lock); + + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + + + /* set p as pi_mutex's owner */ + debug_rt_mutex_proxy_lock(lock, p); + WARN_ON(rt_mutex_owner(lock)); + rt_mutex_set_owner(lock, p, 0); + rt_mutex_deadlock_account_lock(lock, p); + + plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry, + &p->pi_waiters); + __rt_mutex_adjust_prio(p); + + spin_unlock_irq(&p->pi_lock); + spin_unlock(&lock->wait_lock); +} + /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) 
*/ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, - long nsec, int trylock) +static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, + int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, attempt = 0; + int ret, lock_held, attempt = 0; if (refill_pi_state_cache()) return -ENOMEM; - if (sec != MAX_SCHEDULE_TIMEOUT) { + if (time) { to = &timeout; hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = ktime_set(sec, nsec); + to->timer.expires = *time; } q.pi_state = NULL; retry: - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; hb = queue_lock(&q, -1, NULL); retry_locked: + lock_held = 0; + /* * To avoid races, we attempt to take the lock here again * (by doing a 0 -> TID atomic cmpxchg), while holding all @@ -1170,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { if (!detect && 0) force_sig(SIGKILL, current); - ret = -EDEADLK; + /* + * Normally, this check is done in user space. + * In case of requeue, the owner may attempt to lock this futex, + * even if the ownership has already been given by the previous + * waker. + * In the usual case, this is a case of deadlock, but not in case + * of REQUEUE_PI. + */ + if (!(curval & FUTEX_WAITER_REQUEUED)) + ret = -EDEADLK; goto out_unlock_release_sem; } @@ -1182,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, goto out_unlock_release_sem; uval = curval; - newval = uval | FUTEX_WAITERS; + /* + * In case of a requeue, check if there already is an owner + * If not, just take the futex. + */ + if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) { + /* set current as futex owner */ + newval = curval | current->pid; + lock_held = 1; + } else + /* Set the WAITERS flag, so the owner will know it has someone + to wake at next unlock */ + newval = curval | FUTEX_WAITERS; pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -1193,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely(curval != uval)) goto retry_locked; + if (lock_held) { + set_pi_futex_owner(hb, &q.key, curr); + goto out_unlock_release_sem; + } + /* * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, &q); + ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); if (unlikely(ret)) { /* @@ -1239,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. */ - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); WARN_ON(!q.pi_state); /* @@ -1253,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, ret = ret ? 0 : -EWOULDBLOCK; } - down_read(&curr->mm->mmap_sem); + if (fshared) + down_read(fshared); spin_lock(q.lock_ptr); /* * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case. 
*/ - if (!ret && q.pi_state->owner != curr) { - u32 newtid = current->pid | FUTEX_WAITERS; - - /* Owner died? */ - if (q.pi_state->owner != NULL) { - spin_lock_irq(&q.pi_state->owner->pi_lock); - WARN_ON(list_empty(&q.pi_state->list)); - list_del_init(&q.pi_state->list); - spin_unlock_irq(&q.pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; - - q.pi_state->owner = current; - - spin_lock_irq(¤t->pi_lock); - WARN_ON(!list_empty(&q.pi_state->list)); - list_add(&q.pi_state->list, ¤t->pi_state_list); - spin_unlock_irq(¤t->pi_lock); - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); - /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. - */ - ret = get_user(uval, uaddr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - } else { + if (!ret && q.pi_state->owner != curr) + /* mmap_sem is unlocked at return of this function */ + ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr); + else { /* * Catch the rare case, where the lock was released * when we were on the way back before we locked @@ -1309,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, ret = 0; } /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); + unqueue_me_pi(&q); + if (fshared) + up_read(fshared); } if (!detect && ret == -EDEADLK && 0) @@ -1322,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, queue_unlock(&q, hb); out_release_sem: - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; uaddr_faulted: @@ -1333,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr, fshared, + attempt); + if (ret) goto out_unlock_release_sem; - } goto retry_locked; } queue_unlock(&q, hb); - up_read(&curr->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1355,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. 
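
For orientation, the userspace half of the PI-futex protocol that futex_lock_pi() and futex_unlock_pi() back up looks roughly like this (a sketch; the opcode values are the ones from linux/futex.h, gettid is done through syscall()):

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#define FUTEX_LOCK_PI	6
#define FUTEX_UNLOCK_PI	7

static void pi_lock(uint32_t *futex)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* fast path: 0 -> TID means we took the lock without entering the kernel */
	if (!__sync_bool_compare_and_swap(futex, 0, tid))
		syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(uint32_t *futex)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* fast path: TID -> 0; if FUTEX_WAITERS (or OWNER_DIED) is set the
	 * cmpxchg fails and the kernel has to do the handover */
	if (!__sync_bool_compare_and_swap(futex, tid, 0))
		syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
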
*/ -static int futex_unlock_pi(u32 __user *uaddr) +static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; u32 uval; - struct list_head *head; + struct plist_head *head; union futex_key key; int ret, attempt = 0; @@ -1375,9 +1932,10 @@ retry: /* * First take all the futex related locks: */ - down_read(¤t->mm->mmap_sem); + if (fshared) + down_read(fshared); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -1411,7 +1969,7 @@ retry_locked: */ head = &hb->chain; - list_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, head, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); @@ -1436,7 +1994,8 @@ retry_locked: out_unlock: spin_unlock(&hb->lock); out: - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); return ret; @@ -1448,15 +2007,16 @@ pi_faulted: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; + ret = futex_handle_fault((unsigned long)uaddr, fshared, + attempt); + if (ret) goto out_unlock; - } goto retry_locked; } spin_unlock(&hb->lock); - up_read(¤t->mm->mmap_sem); + if (fshared) + up_read(fshared); ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) @@ -1485,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp, poll_wait(filp, &q->waiters, wait); /* - * list_empty() is safe here without any lock. + * plist_node_empty() is safe here without any lock. * q->lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (list_empty(&q->list)) + if (plist_node_empty(&q->list)) ret = POLLIN | POLLRDNORM; return ret; @@ -1508,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + struct rw_semaphore *fshared; static unsigned long printk_interval; if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { @@ -1549,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal) } q->pi_state = NULL; - down_read(¤t->mm->mmap_sem); - err = get_futex_key(uaddr, &q->key); + fshared = ¤t->mm->mmap_sem; + down_read(fshared); + err = get_futex_key(uaddr, fshared, &q->key); if (unlikely(err != 0)) { - up_read(¤t->mm->mmap_sem); + up_read(fshared); kfree(q); goto error; } @@ -1565,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->private_data = q; queue_me(q, ret, filp); - up_read(¤t->mm->mmap_sem); + up_read(fshared); /* Now we map fd to filp, so userspace can access it */ fd_install(ret, filp); @@ -1678,6 +2240,8 @@ retry: * userspace. 
*/ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + /* Also keep the FUTEX_WAITER_REQUEUED flag if set */ + mval |= (uval & FUTEX_WAITER_REQUEUED); nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); if (nval == -EFAULT) @@ -1692,7 +2256,7 @@ retry: */ if (!pi) { if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + futex_wake(uaddr, &curr->mm->mmap_sem, 1); } } return 0; @@ -1748,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr) return; if (pending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); while (entry != &head->list) { /* @@ -1774,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr) } } -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { int ret; + int cmd = op & FUTEX_CMD_MASK; + struct rw_semaphore *fshared = NULL; - switch (op) { + if (!(op & FUTEX_PRIVATE_FLAG)) + fshared = ¤t->mm->mmap_sem; + + switch (cmd) { case FUTEX_WAIT: - ret = futex_wait(uaddr, val, timeout); + ret = futex_wait(uaddr, fshared, val, timeout); break; case FUTEX_WAKE: - ret = futex_wake(uaddr, val); + ret = futex_wake(uaddr, fshared, val); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ ret = futex_fd(uaddr, val); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr); + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3); break; default: ret = -ENOSYS; @@ -1819,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec ts; + ktime_t t, *tp = NULL; u32 val2 = 0; + int cmd = op & FUTEX_CMD_MASK; - if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { - if (copy_from_user(&t, utime, sizeof(t)) != 0) + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { + if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; - if (!timespec_valid(&t)) + if (!timespec_valid(&ts)) return -EINVAL; - if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + + t = timespec_to_ktime(ts); + if (cmd == FUTEX_WAIT) + t = ktime_add(ktime_get(), t); + tp = &t; } /* - * requeue parameter in 'utime' if op == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 
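
From the caller's side, the private-futex path plumbed through do_futex() above looks like this (a sketch, not part of the patch; the FUTEX_PRIVATE_FLAG value is assumed): when the flag is set, fshared stays NULL and mmap_sem is never taken.

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

#define FUTEX_WAIT		0
#define FUTEX_WAKE		1
#define FUTEX_PRIVATE_FLAG	128	/* assumed value of the new flag */

/* Relative timeout; the kernel now converts it to an absolute ktime_t. */
static int futex_wait_private(uint32_t *uaddr, uint32_t expected,
			      const struct timespec *rel_timeout)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
		       expected, rel_timeout, NULL, 0);
}

static int futex_wake_private(uint32_t *uaddr, int nr)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
		       nr, NULL, NULL, 0);
}
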
*/ - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE + || cmd == FUTEX_CMP_REQUEUE_PI) val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } static int futexfs_get_sb(struct file_system_type *fs_type, @@ -1871,7 +2445,7 @@ static int __init init(void) } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - INIT_LIST_HEAD(&futex_queues[i].chain); + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); } return 0; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 50f24eea6cd0..338a9b489fbc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, struct compat_timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec ts; + ktime_t t, *tp = NULL; int val2 = 0; if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { - if (get_compat_timespec(&t, utime)) + if (get_compat_timespec(&ts, utime)) return -EFAULT; - if (!timespec_valid(&t)) + if (!timespec_valid(&ts)) return -EINVAL; + + t = timespec_to_ktime(ts); if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + t = ktime_add(ktime_get(), t); + tp = &t; } - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE + || op == FUTEX_CMP_REQUEUE_PI) val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1b3033105b40..23c03f43e196 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) return orun; } +EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer @@ -1410,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: init_hrtimers_cpu(cpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); migrate_hrtimers(cpu); break; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index aff1f0fabb0d..e391cbb1f566 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,7 +22,6 @@ * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number * @desc: description of the interrupt - * @regs: pointer to a register structure * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ @@ -48,7 +47,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_chip, @@ -180,6 +179,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq) if (desc->chip->ack) desc->chip->ack(irq); action_ret = handle_IRQ_event(irq, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); desc->chip->end(irq); return 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5597c157442a..203a518b6f14 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + /* Exclude IRQ from balancing */ if (new->flags & IRQF_NOBALANCING) desc->status |= IRQ_NO_BALANCING; @@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { if (desc->chip && desc->chip->set_type) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 2db91eb54ad8..b4f1674fca79 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -27,6 +27,10 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off, return len; } +#ifndef is_affinity_mask_valid +#define is_affinity_mask_valid(val) 1 +#endif + int no_irq_affinity; static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) @@ -42,6 +46,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (err) return err; + if (!is_affinity_mask_valid(new_value)) + return -EINVAL; + /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least @@ -66,12 +73,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; + unsigned long flags; + int ret = 1; - for (action = desc->action ; action; action = action->next) + spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) - return 0; - return 1; + !strcmp(new_action->name, action->name)) { + ret = 0; + break; + } + } + spin_unlock_irqrestore(&desc->lock, flags); + return ret; } void register_handler_proc(unsigned int irq, struct irqaction *action) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9d8c79b48823..b0d81aae472f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ - if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { + if ((irqfixup == 2 && ((irq == 0) || + (desc->action->flags & IRQF_IRQPOLL))) || + action_ret == IRQ_NONE) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; diff --git a/kernel/itimer.c b/kernel/itimer.c index 307c6a632ef6..3205e8e114fa 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -7,7 +7,6 @@ /* These are all the functions necessary to implement itimers */ #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/time.h> @@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) 
} /* - * We do not care about correctness. We just sanitize the values so - * the ktime_t operations which expect normalized values do not - * break. This converts negative values to long timeouts similar to - * the code in kernel versions < 2.6.16 - * - * Print a limited number of warning messages when an invalid timeval - * is detected. - */ -static void fixup_timeval(struct timeval *tv, int interval) -{ - static int warnlimit = 10; - unsigned long tmp; - - if (warnlimit > 0) { - warnlimit--; - printk(KERN_WARNING - "setitimer: %s (pid = %d) provided " - "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", - current->comm, current->pid, - interval ? "it_interval" : "it_value", - tv->tv_sec, (long) tv->tv_usec); - } - - tmp = tv->tv_usec; - if (tmp >= USEC_PER_SEC) { - tv->tv_usec = tmp % USEC_PER_SEC; - tv->tv_sec += tmp / USEC_PER_SEC; - } - - tmp = tv->tv_sec; - if (tmp > LONG_MAX) - tv->tv_sec = LONG_MAX; -} - -/* * Returns true if the timeval is in canonical form */ #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -/* - * Check for invalid timevals, sanitize them and print a limited - * number of warnings. - */ -static void check_itimerval(struct itimerval *value) { - - if (unlikely(!timeval_valid(&value->it_value))) - fixup_timeval(&value->it_value, 0); - - if (unlikely(!timeval_valid(&value->it_interval))) - fixup_timeval(&value->it_interval, 1); -} - int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; @@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) /* * Validate the timevals in value. - * - * Note: Although the spec requires that invalid values shall - * return -EINVAL, we just fixup the value and print a limited - * number of warnings in order not to break users of this - * historical misfeature. 
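
The do_setitimer() change continued below drops the silent sanitizing: an unnormalized timeval now comes back as -EINVAL. A small userspace illustration (not part of the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

int main(void)
{
	struct itimerval it = {
		.it_value    = { .tv_sec = 0, .tv_usec = 2000000 },	/* tv_usec >= USEC_PER_SEC */
		.it_interval = { .tv_sec = 0, .tv_usec = 0 },
	};

	if (setitimer(ITIMER_REAL, &it, NULL) < 0)
		printf("setitimer: %s\n", strerror(errno));	/* now: Invalid argument */
	return 0;
}
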
- * - * Scheduled for replacement in March 2007 */ - check_itimerval(value); + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; switch (which) { case ITIMER_REAL: diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5a0de8409739..f1bda23140b2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr, symbol_end = (unsigned long)_etext; } - *symbolsize = symbol_end - symbol_start; - *offset = addr - symbol_start; + if (symbolsize) + *symbolsize = symbol_end - symbol_start; + if (offset) + *offset = addr - symbol_start; return low; } @@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr, return NULL; } +int lookup_symbol_name(unsigned long addr, char *symname) +{ + symname[0] = '\0'; + symname[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, NULL, NULL); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), symname); + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_name(addr, symname); +} + +int lookup_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + name[0] = '\0'; + name[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, size, offset); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), name); + modname[0] = '\0'; + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_attrs(addr, size, offset, modname, name); +} + /* Look up a kernel symbol and return it in a text buffer. */ int sprint_symbol(char *buffer, unsigned long address) { @@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address) struct kallsym_iter { loff_t pos; - struct module *owner; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols */ char type; char name[KSYM_NAME_LEN+1]; + char module_name[MODULE_NAME_LEN + 1]; + int exported; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { - iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, &iter->type, - iter->name, sizeof(iter->name)); - if (iter->owner == NULL) + if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, + &iter->type, iter->name, iter->module_name, + &iter->exported) < 0) return 0; - - /* Label it "global" if it is exported, "local" if not exported. */ - iter->type = is_exported(iter->name, iter->owner) - ? toupper(iter->type) : tolower(iter->type); - return 1; } @@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; - iter->owner = NULL; + iter->module_name[0] = '\0'; iter->value = kallsyms_addresses[iter->pos]; iter->type = kallsyms_get_symbol_type(off); @@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p) if (!iter->name[0]) return 0; - if (iter->owner) + if (iter->module_name[0]) { + char type; + + /* Label it "global" if it is exported, + * "local" if not exported. */ + type = iter->exported ? 
toupper(iter->type) : + tolower(iter->type); seq_printf(m, "%0*lx %c %s\t[%s]\n", (int)(2*sizeof(void*)), - iter->value, iter->type, iter->name, - module_name(iter->owner)); - else + iter->value, type, iter->name, iter->module_name); + } else seq_printf(m, "%0*lx %c %s\n", (int)(2*sizeof(void*)), iter->value, iter->type, iter->name); @@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } -static int kallsyms_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - kfree(m->private); - return seq_release(inode, file); -} - static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, - .release = kallsyms_release, + .release = seq_release_private, }; static int __init kallsyms_init(void) diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a59c8a01ae0..25db14b89e82 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index 796276141e51..4d32eb077179 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -23,7 +23,6 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/mnt_namespace.h> #include <linux/completion.h> @@ -136,7 +135,6 @@ static int ____call_usermodehelper(void *data) /* Unblock all signals and set the session keyring. */ new_session = key_get(sub_info->ring); - flush_signals(current); spin_lock_irq(¤t->sighand->siglock); old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); @@ -166,6 +164,12 @@ static int ____call_usermodehelper(void *data) /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); + /* + * Our parent is keventd, which runs with elevated scheduling priority. + * Avoid propagating that into the userspace child. + */ + set_user_nice(current, 0); + retval = -EPERM; if (current->fs->root) retval = kernel_execve(sub_info->path, @@ -181,14 +185,9 @@ static int wait_for_helper(void *data) { struct subprocess_info *sub_info = data; pid_t pid; - struct k_sigaction sa; /* Install a handler: if SIGCLD isn't handled sys_wait4 won't * populate the status, but will return -ECHILD. 
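
For context, a typical caller of the usermode-helper machinery adjusted above (a sketch; the helper path and environment are only examples):

#include <linux/kmod.h>

static int run_example_helper(void)
{
	char *argv[] = { "/sbin/example-helper", NULL };	/* example path */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	/* last argument non-zero: wait for the helper to complete */
	return call_usermodehelper(argv[0], argv, envp, 1);
}
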
*/ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, NULL); allow_signal(SIGCHLD); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d25a9ada3f8e..9e47d8c493f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,16 +35,19 @@ #include <linux/hash.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/stddef.h> #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/kdebug.h> + #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> -#include <asm/kdebug.h> +#include <asm/uaccess.h> #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) @@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobe_enabled; + DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; - retry: - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + retry: + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } /* All out of space. Need to allocate a new page. Use slot 0. 
*/ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { + if (!kip) return NULL; - } /* * Use module_alloc so this page is within +/- 2GB of where the @@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void) if (check_safety() != 0) return -EAGAIN; - hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { int i; - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ @@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; @@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) break; } } - if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + + if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); - } } #endif @@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, reset_kprobe_instance(); } } - return; } static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, @@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) } /* Called with kretprobe_lock held */ -struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->free_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe - *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->used_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -void __kprobes add_rp_inst(struct kretprobe_instance *ri) -{ - /* - * Remove rp inst off the free list - - * Add it back when probed function returns - */ - hlist_del(&ri->uflist); - - /* Add rp inst onto table */ - INIT_HLIST_NODE(&ri->hlist); - hlist_add_head(&ri->hlist, - &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); - - /* Also add this rp inst to the used list. 
*/ - INIT_HLIST_NODE(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->used_instances); -} - -/* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head) { @@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - while ((ri = get_free_rp_inst(rp)) != NULL) { + struct hlist_node *pos, *next; + + hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { hlist_del(&ri->uflist); kfree(ri); } @@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { - if (addr >= (unsigned long)__kprobes_text_start - && addr < (unsigned long)__kprobes_text_end) + if (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) return -EINVAL; return 0; } @@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); - if ((!kernel_text_address((unsigned long) p->addr)) || - in_kprobes_functions((unsigned long) p->addr)) + if (!kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; p->mod_refcounted = 0; - /* Check are we probing a module */ - if ((probed_mod = module_text_address((unsigned long) p->addr))) { + + /* + * Check if are we probing a module. + */ + probed_mod = module_text_address((unsigned long) p->addr); + if (probed_mod) { struct module *calling_mod = module_text_address(called_from); - /* We must allow modules to probe themself and - * in this case avoid incrementing the module refcount, - * so as to allow unloading of self probing modules. + /* + * We must allow modules to probe themself and in this case + * avoid incrementing the module refcount, so as to allow + * unloading of self probing modules. 
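
A minimal module-side user of the registration path being tidied up here, for reference (a sketch; the probed symbol is only an example):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* example target */
	.pre_handler	= handler_pre,
};

static int __init kp_example_init(void)
{
	return register_kprobe(&kp);
}

static void __exit kp_example_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_example_init);
module_exit(kp_example_exit);
MODULE_LICENSE("GPL");
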
*/ - if (calling_mod && (calling_mod != probed_mod)) { + if (calling_mod && calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) return -EINVAL; p->mod_refcounted = 1; @@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, goto out; } - if ((ret = arch_prepare_kprobe(p)) != 0) + ret = arch_prepare_kprobe(p); + if (ret) goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (atomic_add_return(1, &kprobe_count) == \ + if (kprobe_enabled) { + if (atomic_add_return(1, &kprobe_count) == \ (ARCH_INACTIVE_KPROBE_COUNT + 1)) - register_page_fault_notifier(&kprobe_page_fault_nb); - - arch_arm_kprobe(p); + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); @@ -616,8 +586,7 @@ out: int __kprobes register_kprobe(struct kprobe *p) { - return __register_kprobe(p, - (unsigned long)__builtin_return_address(0)); + return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_kprobe(struct kprobe *p) @@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) return; } valid_p: - if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && - (p->list.next == &old_p->list) && - (p->list.prev == &old_p->list))) { - /* Only probe on the hash list */ - arch_disarm_kprobe(p); + if (old_p == p || + (old_p->pre_handler == aggr_pre_handler && + p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + /* + * Only probe on the hash list. Disarm only if kprobes are + * enabled - otherwise, the breakpoint would already have + * been removed. We save on flushing icache. + */ + if (kprobe_enabled) + arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); cleanup_p = 1; } else { @@ -656,9 +630,11 @@ valid_p: mutex_unlock(&kprobe_mutex); synchronize_sched(); - if (p->mod_refcounted && - (mod = module_text_address((unsigned long)p->addr))) - module_put(mod); + if (p->mod_refcounted) { + mod = module_text_address((unsigned long)p->addr); + if (mod) + module_put(mod); + } if (cleanup_p) { if (p != old_p) { @@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ spin_lock_irqsave(&kretprobe_lock, flags); - arch_prepare_kretprobe(rp, regs); + if (!hlist_empty(&rp->free_instances)) { + struct kretprobe_instance *ri; + + ri = hlist_entry(rp->free_instances.first, + struct kretprobe_instance, uflist); + ri->rp = rp; + ri->task = current; + arch_prepare_kretprobe(ri, regs); + + /* XXX(hch): why is there no hlist_move_head? 
*/ + hlist_del(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->used_instances); + hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); + } else + rp->nmissed++; spin_unlock_irqrestore(&kretprobe_lock, flags); return 0; } @@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; + struct hlist_node *pos, *next; unregister_kprobe(&rp->kp); + /* No race here */ spin_lock_irqsave(&kretprobe_lock, flags); - while ((ri = get_used_rp_inst(rp)) != NULL) { + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { ri->rp = NULL; hlist_del(&ri->uflist); } @@ -816,6 +808,9 @@ static int __init init_kprobes(void) } atomic_set(&kprobe_count, 0); + /* By default, kprobes are enabled */ + kprobe_enabled = true; + err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); @@ -825,7 +820,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset,char *modname) { char *kprobe_type; @@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; - unsigned long size, offset = 0; + unsigned long offset = 0; char *modname, namebuf[128]; head = &kprobe_table[i]; preempt_disable(); hlist_for_each_entry_rcu(p, node, head, hlist) { - sym = kallsyms_lookup((unsigned long)p->addr, &size, + sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (p->pre_handler == aggr_pre_handler) { list_for_each_entry_rcu(kp, &p->list, list) @@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +static void __kprobes enable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already enabled, just return */ + if (kprobe_enabled) + goto already_enabled; + + /* + * Re-register the page fault notifier only if there are any + * active probes at the time of enabling kprobes globally + */ + if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT) + register_page_fault_notifier(&kprobe_page_fault_nb); + + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + arch_arm_kprobe(p); + } + + kprobe_enabled = true; + printk(KERN_INFO "Kprobes globally enabled\n"); + +already_enabled: + mutex_unlock(&kprobe_mutex); + return; +} + +static void __kprobes disable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already disabled, just return */ + if (!kprobe_enabled) + goto already_disabled; + + kprobe_enabled = false; + printk(KERN_INFO "Kprobes globally disabled\n"); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!arch_trampoline_kprobe(p)) + arch_disarm_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); + + mutex_lock(&kprobe_mutex); + /* Unconditionally unregister the page_fault notifier */ + unregister_page_fault_notifier(&kprobe_page_fault_nb); + +already_disabled: + mutex_unlock(&kprobe_mutex); + return; +} + +/* + * 
XXX: The debugfs bool file interface doesn't allow for callbacks + * when the bool state is switched. We can reuse that facility when + * available + */ +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (kprobe_enabled) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_all_kprobes(); + break; + case 'n': + case 'N': + case '0': + disable_all_kprobes(); + break; + } + + return count; +} + +static struct file_operations fops_kp = { + .read = read_enabled_file_bool, + .write = write_enabled_file_bool, +}; + static int __kprobes debugfs_kprobe_init(void) { struct dentry *dir, *file; + unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir , 0 , + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_kprobes_operations); if (!file) { debugfs_remove(dir); return -ENOMEM; } + file = debugfs_create_file("enabled", 0600, dir, + &value, &fops_kp); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 87c50ccd1d4e..df8a8e8f6ca4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,7 +1,7 @@ /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * - * Creation is done via keventd, so that we get a clean environment + * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ @@ -15,24 +15,22 @@ #include <linux/mutex.h> #include <asm/semaphore.h> -/* - * We dont want to execute off keventd since it might - * hold a semaphore our callers hold too: - */ -static struct workqueue_struct *helper_wq; +static DEFINE_SPINLOCK(kthread_create_lock); +static LIST_HEAD(kthread_create_list); +struct task_struct *kthreadd_task; struct kthread_create_info { - /* Information passed to kthread() from keventd. */ + /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; struct completion started; - /* Result passed back to kthread_create() from keventd. */ + /* Result passed back to kthread_create() from kthreadd. 
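
Going back to the kprobes 'enabled' debugfs file added above, the switch can be driven from userspace like this (a sketch; it assumes debugfs is mounted at /sys/kernel/debug):

#include <fcntl.h>
#include <unistd.h>

static int set_kprobes_enabled(int on)
{
	int fd = open("/sys/kernel/debug/kprobes/enabled", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, on ? "1" : "0", 1);	/* 'y'/'n' are accepted as well */
	close(fd);
	return n == 1 ? 0 : -1;
}
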
*/ struct task_struct *result; struct completion done; - struct work_struct work; + struct list_head list; }; struct kthread_stop_info @@ -60,42 +58,17 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); -static void kthread_exit_files(void) -{ - struct fs_struct *fs; - struct task_struct *tsk = current; - - exit_fs(tsk); /* current->fs->count--; */ - fs = init_task.fs; - tsk->fs = fs; - atomic_inc(&fs->count); - exit_files(tsk); - current->files = init_task.files; - atomic_inc(&tsk->files->count); -} - static int kthread(void *_create) { struct kthread_create_info *create = _create; int (*threadfn)(void *data); void *data; - sigset_t blocked; int ret = -EINTR; - kthread_exit_files(); - - /* Copy data: it's on keventd's stack */ + /* Copy data: it's on kthread's stack */ threadfn = create->threadfn; data = create->data; - /* Block and flush all signals (in case we're not from keventd). */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* By default we can run anywhere, unlike keventd. */ - set_cpus_allowed(current, CPU_MASK_ALL); - /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_INTERRUPTIBLE); complete(&create->started); @@ -112,11 +85,8 @@ static int kthread(void *_create) return 0; } -/* We are keventd: create a thread. */ -static void keventd_create_kthread(struct work_struct *work) +static void create_kthread(struct kthread_create_info *create) { - struct kthread_create_info *create = - container_of(work, struct kthread_create_info, work); int pid; /* We want our own signal handler (we take no signals by default). */ @@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), create.data = data; init_completion(&create.started); init_completion(&create.done); - INIT_WORK(&create.work, keventd_create_kthread); - - /* - * The workqueue needs to start up first: - */ - if (!helper_wq) - create.work.func(&create.work); - else { - queue_work(helper_wq, &create.work); - wait_for_completion(&create.done); - } + + spin_lock(&kthread_create_lock); + list_add_tail(&create.list, &kthread_create_list); + wake_up_process(kthreadd_task); + spin_unlock(&kthread_create_lock); + + wait_for_completion(&create.done); + if (!IS_ERR(create.result)) { va_list args; va_start(args, namefmt); @@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), namefmt, args); va_end(args); } - return create.result; } EXPORT_SYMBOL(kthread_create); @@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); -static __init int helper_init(void) + +static __init void kthreadd_setup(void) { - helper_wq = create_singlethread_workqueue("kthread"); - BUG_ON(!helper_wq); + struct task_struct *tsk = current; - return 0; + set_task_comm(tsk, "kthreadd"); + + ignore_signals(tsk); + + set_user_nice(tsk, -5); + set_cpus_allowed(tsk, CPU_MASK_ALL); } -core_initcall(helper_init); +int kthreadd(void *unused) +{ + /* Setup a clean context for our children to inherit. 
*/ + kthreadd_setup(); + + current->flags |= PF_NOFREEZE; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&kthread_create_list)) + schedule(); + __set_current_state(TASK_RUNNING); + + spin_lock(&kthread_create_lock); + while (!list_empty(&kthread_create_list)) { + struct kthread_create_info *create; + + create = list_entry(kthread_create_list.next, + struct kthread_create_info, list); + list_del_init(&create->list); + spin_unlock(&kthread_create_lock); + + create_kthread(create); + + spin_lock(&kthread_create_lock); + } + spin_unlock(&kthread_create_lock); + } + + return 0; +} diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7065a687ac54..1a5ff2211d88 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace) trace->entries = stack_trace + nr_stack_trace_entries; trace->skip = 3; - trace->all_contexts = 0; - save_stack_trace(trace, NULL); + save_stack_trace(trace); trace->max_entries = trace->nr_entries; @@ -341,10 +340,7 @@ static const char *usage_str[] = const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { - unsigned long offs, size; - char *modname; - - return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } void @@ -1313,8 +1309,9 @@ out_unlock_set: /* * Look up a dependency chain. If the key is not present yet then - * add it and return 0 - in this case the new dependency chain is - * validated. If the key is already hashed, return 1. + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) */ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) { @@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, unsigned long ip) + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, this->class->usage_mask |= new_mask; -#ifdef CONFIG_TRACE_IRQFLAGS - if (new_bit == LOCK_ENABLED_HARDIRQS || - new_bit == LOCK_ENABLED_HARDIRQS_READ) - ip = curr->hardirq_enable_ip; - else if (new_bit == LOCK_ENABLED_SOFTIRQS || - new_bit == LOCK_ENABLED_SOFTIRQS_READ) - ip = curr->softirq_enable_ip; -#endif if (!save_trace(this->class->usage_traces + new_bit)) return 0; @@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) +mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) else usage_bit = LOCK_ENABLED_SOFTIRQS; } - if (!mark_lock(curr, hlock, usage_bit, ip)) + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1, ip)) + if (!mark_held_locks(curr, 1)) return; /* * If we have softirqs enabled, then set the usage @@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void) * this bit from being set 
before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0, ip)) + if (!mark_held_locks(curr, 0)) return; curr->hardirq_enable_ip = ip; @@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0, ip); + mark_held_locks(curr, 0); } /* @@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (read) { if (curr->hardirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ, ip)) + LOCK_USED_IN_HARDIRQ_READ)) return 0; if (curr->softirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ, ip)) + LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) return 0; } } if (!hardirqs_off) { if (read) { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ, ip)) + LOCK_ENABLED_HARDIRQS_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ, ip)) + LOCK_ENABLED_SOFTIRQS_READ)) return 0; } else { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS, ip)) + LOCK_ENABLED_HARDIRQS)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS, ip)) + LOCK_ENABLED_SOFTIRQS)) return 0; } } #endif /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED, ip)) + if (!mark_lock(curr, hlock, LOCK_USED)) return 0; out_calc_hash: /* diff --git a/kernel/module.c b/kernel/module.c index 1eb8ca565ba0..9bd93de01f4a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/init.h> +#include <linux/kallsyms.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -95,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag) mod->taints |= flag; } -/* A thread that wants to hold a reference to a module only while it - * is running can call ths to safely exit. - * nfsd and lockd use this. +/* + * A thread that wants to hold a reference to a module only while it + * is running can call this to safely exit. nfsd and lockd use this. */ void __module_put_and_exit(struct module *mod, long code) { @@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size) { /* Reallocation required? */ if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, - GFP_KERNEL); + int *new; + + new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, + GFP_KERNEL); if (!new) return 0; - memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); pcpu_num_allocated *= 2; - kfree(pcpu_size); pcpu_size = new; } @@ -1198,7 +1199,7 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module mutex). */ +/* Free a module, remove from lists, etc (must hold module_mutex). */ static void free_module(struct module *mod) { /* Delete from various lists */ @@ -1245,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist - * in the Kernel or in some other modules exported symbol table. + * in the kernel or in some other module's exported symbol table. 
*/ static int verify_export_symbols(struct module *mod) { @@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS -int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; @@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod, if (!best) return NULL; - *size = nextval - mod->symtab[best].st_value; - *offset = addr - mod->symtab[best].st_value; + if (size) + *size = nextval - mod->symtab[best].st_value; + if (offset) + *offset = addr - mod->symtab[best].st_value; return mod->strtab + mod->symtab[best].st_name; } @@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, size_t namelen) +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + strlcpy(symname, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); + if (name) + strlcpy(name, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) { struct module *mod; @@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - namelen); + KSYM_NAME_LEN + 1); + strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); + *exported = is_exported(name, mod); mutex_unlock(&module_mutex); - return mod; + return 0; } symnum -= mod->num_symtab; } mutex_unlock(&module_mutex); - return NULL; + return -ERANGE; } static unsigned long mod_find_symname(struct module *mod, const char *name) diff --git a/kernel/mutex.c b/kernel/mutex.c index e7cbbb82765b..303eab18484b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) debug_mutex_lock_common(lock, &waiter); mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - debug_mutex_add_waiter(lock, &waiter, task->thread_info); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) */ if (unlikely(state == 
TASK_INTERRUPTIBLE && signal_pending(task))) { - mutex_remove_waiter(lock, &waiter, task->thread_info); + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); mutex_release(&lock->dep_map, 1, _RET_IP_); spin_unlock_mutex(&lock->wait_lock, flags); @@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) } /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task->thread_info); - debug_mutex_set_owner(lock, task->thread_info); + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_set_owner(lock, task_thread_info(task)); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5b9ee6f6bbb..1bc4b55241a8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk) /* * creates a copy of "orig" with refcount 1. - * This does not grab references to the contained namespaces, - * so that needs to be done by dup_namespaces. */ -static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) { struct nsproxy *ns; @@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) } /* - * copies the nsproxy, setting refcount to 1, and grabbing a - * reference to all contained namespaces. Called from - * sys_unshare() + * Create new nsproxy and all of its the associated namespaces. + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. */ -struct nsproxy *dup_namespaces(struct nsproxy *orig) +static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, + struct fs_struct *new_fs) { - struct nsproxy *ns = clone_namespaces(orig); + struct nsproxy *new_nsp; - if (ns) { - if (ns->mnt_ns) - get_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - get_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - get_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - get_pid_ns(ns->pid_ns); - } + new_nsp = clone_nsproxy(tsk->nsproxy); + if (!new_nsp) + return ERR_PTR(-ENOMEM); - return ns; + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) + goto out_ns; + + new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + if (IS_ERR(new_nsp->uts_ns)) + goto out_uts; + + new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + if (IS_ERR(new_nsp->ipc_ns)) + goto out_ipc; + + new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); + if (IS_ERR(new_nsp->pid_ns)) + goto out_pid; + + return new_nsp; + +out_pid: + if (new_nsp->ipc_ns) + put_ipc_ns(new_nsp->ipc_ns); +out_ipc: + if (new_nsp->uts_ns) + put_uts_ns(new_nsp->uts_ns); +out_uts: + if (new_nsp->mnt_ns) + put_mnt_ns(new_nsp->mnt_ns); +out_ns: + kfree(new_nsp); + return ERR_PTR(-ENOMEM); } /* @@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) return 0; - new_ns = clone_namespaces(old_ns); - if (!new_ns) { - err = -ENOMEM; + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; goto out; } - tsk->nsproxy = new_ns; - - err = copy_mnt_ns(flags, tsk); - if (err) - goto out_ns; - - err = copy_utsname(flags, tsk); - if (err) - goto out_uts; - - err = copy_ipcs(flags, tsk); - if (err) - goto out_ipc; - - err = copy_pid_ns(flags, tsk); - if (err) - goto out_pid; + new_ns = create_new_namespaces(flags, tsk, tsk->fs); + if (IS_ERR(new_ns)) { + err = 
PTR_ERR(new_ns); + goto out; + } + tsk->nsproxy = new_ns; out: put_nsproxy(old_ns); return err; - -out_pid: - if (new_ns->ipc_ns) - put_ipc_ns(new_ns->ipc_ns); -out_ipc: - if (new_ns->uts_ns) - put_uts_ns(new_ns->uts_ns); -out_uts: - if (new_ns->mnt_ns) - put_mnt_ns(new_ns->mnt_ns); -out_ns: - tsk->nsproxy = old_ns; - kfree(new_ns); - goto out; } void free_nsproxy(struct nsproxy *ns) @@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); kfree(ns); } + +/* + * Called from unshare. Unshare all the namespaces part of nsproxy. + * On sucess, returns the new nsproxy and a reference to old nsproxy + * to make sure it stays around. + */ +int unshare_nsproxy_namespaces(unsigned long unshare_flags, + struct nsproxy **new_nsp, struct fs_struct *new_fs) +{ + struct nsproxy *old_ns = current->nsproxy; + int err = 0; + + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + +#ifndef CONFIG_IPC_NS + if (unshare_flags & CLONE_NEWIPC) + return -EINVAL; +#endif + +#ifndef CONFIG_UTS_NS + if (unshare_flags & CLONE_NEWUTS) + return -EINVAL; +#endif + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + get_nsproxy(old_ns); + + *new_nsp = create_new_namespaces(unshare_flags, current, + new_fs ? new_fs : current->fs); + if (IS_ERR(*new_nsp)) { + err = PTR_ERR(*new_nsp); + put_nsproxy(old_ns); + } + return err; +} diff --git a/kernel/params.c b/kernel/params.c index 312172320b4c..e61c46c97ce7 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) return param_get_bool(buffer, &dummy); } -/* We cheat here and temporarily mangle the string. */ +/* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, diff --git a/kernel/pid.c b/kernel/pid.c index 9c80bc23d6b8..eb66bd2953ab 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -27,11 +27,13 @@ #include <linux/bootmem.h> #include <linux/hash.h> #include <linux/pid_namespace.h> +#include <linux/init_task.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; static int pidhash_shift; static struct kmem_cache *pid_cachep; +struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -247,13 +249,16 @@ struct pid * fastcall find_pid(int nr) } EXPORT_SYMBOL_GPL(find_pid); -int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) +/* + * attach_pid() must be called with the tasklist_lock write-held. 
+ */ +int fastcall attach_pid(struct task_struct *task, enum pid_type type, + struct pid *pid) { struct pid_link *link; - struct pid *pid; link = &task->pids[type]; - link->pid = pid = find_pid(nr); + link->pid = pid; hlist_add_head_rcu(&link->node, &pid->tasks[type]); return 0; @@ -360,16 +365,11 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); -int copy_pid_ns(int flags, struct task_struct *tsk) +struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) { - struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; - int err = 0; - - if (!old_ns) - return 0; - + BUG_ON(!old_ns); get_pid_ns(old_ns); - return err; + return old_ns; } void free_pid_ns(struct kref *kref) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 657f77697415..1de710e18373 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { @@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { @@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->sched_time < t->expires.sched) { @@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { @@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { @@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || sched_time < t->expires.sched) { @@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, */ head = &tsk->signal->cpu_timers[clock_idx]; if (list_empty(head) || - cputime_ge(list_entry(head->next, + cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44318ca71978..588c99da0307 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -31,7 +31,6 @@ * POSIX clocks & timers */ #include <linux/mm.h> -#include <linux/smp_lock.h> #include 
<linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 877721708fa4..495b7d4dd330 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -79,7 +79,7 @@ config PM_SYSFS_DEPRECATED config SOFTWARE_SUSPEND bool "Software Suspend (Hibernation)" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) + depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -139,7 +139,7 @@ config PM_STD_PARTITION config SUSPEND_SMP bool - depends on HOTPLUG_CPU && X86 && PM + depends on HOTPLUG_CPU && (X86 || PPC64) && PM default y config APM_EMULATION diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 06331374d862..f445b9cd60fb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; +enum { + HIBERNATION_INVALID, + HIBERNATION_PLATFORM, + HIBERNATION_TEST, + HIBERNATION_TESTPROC, + HIBERNATION_SHUTDOWN, + HIBERNATION_REBOOT, + /* keep last */ + __HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +struct hibernation_ops *hibernation_ops; + +/** + * hibernation_set_ops - set the global hibernate operations + * @ops: the hibernation operations to use in subsequent hibernation transitions + */ + +void hibernation_set_ops(struct hibernation_ops *ops) +{ + if (ops && !(ops->prepare && ops->enter && ops->finish)) { + WARN_ON(1); + return; + } + mutex_lock(&pm_mutex); + hibernation_ops = ops; + if (ops) + hibernation_mode = HIBERNATION_PLATFORM; + else if (hibernation_mode == HIBERNATION_PLATFORM) + hibernation_mode = HIBERNATION_SHUTDOWN; + + mutex_unlock(&pm_mutex); +} + + /** * platform_prepare - prepare the machine for hibernation using the * platform driver if so configured and return an error code if it fails */ -static inline int platform_prepare(void) +static int platform_prepare(void) { - int error = 0; + return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? + hibernation_ops->prepare() : 0; +} - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - break; - default: - if (pm_ops && pm_ops->prepare) - error = pm_ops->prepare(PM_SUSPEND_DISK); - } - return error; +/** + * platform_finish - switch the machine to the normal mode of operation + * using the platform driver (must be called after platform_prepare()) + */ + +static void platform_finish(void) +{ + if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) + hibernation_ops->finish(); } /** - * power_down - Shut machine down for hibernate. + * power_down - Shut the machine down for hibernation. * * Use the platform driver, if configured so; otherwise try * to power off or reboot. 
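As an illustration of the new interface introduced above, here is a minimal sketch of how a platform driver might register its callbacks with hibernation_set_ops(). Only struct hibernation_ops, its prepare/enter/finish slots, and hibernation_set_ops() come from this patch; the example_* names and the header locations are assumptions made for the example.

/*
 * Hypothetical platform glue -- a sketch, not part of the patch.
 * hibernation_set_ops() warns and ignores any ops structure that does
 * not provide all three callbacks, so every slot is filled in, and
 * registering the ops switches hibernation_mode to HIBERNATION_PLATFORM.
 */
#include <linux/init.h>
#include <linux/suspend.h>      /* assumed to declare struct hibernation_ops */

static int example_prepare(void)
{
        /* platform/firmware preparation before the image is written */
        return 0;
}

static int example_enter(void)
{
        /* put the machine into its platform-specific S4-like state */
        return 0;
}

static void example_finish(void)
{
        /* undo whatever example_prepare() set up */
}

static struct hibernation_ops example_hibernation_ops = {
        .prepare = example_prepare,
        .enter   = example_enter,
        .finish  = example_finish,
};

static int __init example_pm_init(void)
{
        hibernation_set_ops(&example_hibernation_ops);
        return 0;
}
late_initcall(example_pm_init);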
@@ -61,20 +100,20 @@ static inline int platform_prepare(void) static void power_down(void) { - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: + switch (hibernation_mode) { + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: break; - case PM_DISK_SHUTDOWN: + case HIBERNATION_SHUTDOWN: kernel_power_off(); break; - case PM_DISK_REBOOT: + case HIBERNATION_REBOOT: kernel_restart(NULL); break; - default: - if (pm_ops && pm_ops->enter) { + case HIBERNATION_PLATFORM: + if (hibernation_ops) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - pm_ops->enter(PM_SUSPEND_DISK); + hibernation_ops->enter(); break; } } @@ -87,20 +126,6 @@ static void power_down(void) while(1); } -static inline void platform_finish(void) -{ - switch (pm_disk_mode) { - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - break; - default: - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); - } -} - static void unprepare_processes(void) { thaw_processes(); @@ -120,13 +145,10 @@ static int prepare_processes(void) } /** - * pm_suspend_disk - The granpappy of hibernation power management. - * - * If not, then call swsusp to do its thing, then figure out how - * to power down the system. + * hibernate - The granpappy of the built-in hibernation management */ -int pm_suspend_disk(void) +int hibernate(void) { int error; @@ -143,7 +165,8 @@ int pm_suspend_disk(void) if (error) goto Finish; - if (pm_disk_mode == PM_DISK_TESTPROC) { + mutex_lock(&pm_mutex); + if (hibernation_mode == HIBERNATION_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; @@ -168,7 +191,7 @@ int pm_suspend_disk(void) if (error) goto Enable_cpus; - if (pm_disk_mode == PM_DISK_TEST) { + if (hibernation_mode == HIBERNATION_TEST) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Enable_cpus; @@ -205,6 +228,7 @@ int pm_suspend_disk(void) device_resume(); resume_console(); Thaw: + mutex_unlock(&pm_mutex); unprepare_processes(); Finish: free_basic_memory_bitmaps(); @@ -220,7 +244,7 @@ int pm_suspend_disk(void) * Called as a late_initcall (so all devices are discovered and * initialized), we call swsusp to see if we have a saved image or not. * If so, we quiesce devices, the restore the saved image. We will - * return above (in pm_suspend_disk() ) if everything goes well. + * return above (in hibernate() ) if everything goes well. * Otherwise, we fail gracefully and return to the normally * scheduled program. * @@ -315,25 +339,26 @@ static int software_resume(void) late_initcall(software_resume); -static const char * const pm_disk_modes[] = { - [PM_DISK_PLATFORM] = "platform", - [PM_DISK_SHUTDOWN] = "shutdown", - [PM_DISK_REBOOT] = "reboot", - [PM_DISK_TEST] = "test", - [PM_DISK_TESTPROC] = "testproc", +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", + [HIBERNATION_TEST] = "test", + [HIBERNATION_TESTPROC] = "testproc", }; /** - * disk - Control suspend-to-disk mode + * disk - Control hibernation mode * * Suspend-to-disk can be handled in several ways. We have a few options * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other pm_ops), powering off the system or rebooting the system - * (for testing) as well as the two test modes. + * or other hibernation_ops), powering off the system or rebooting the + * system (for testing) as well as the two test modes. 
* * The system can support 'platform', and that is known a priori (and - * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' - * as alternatives, as well as the test modes 'test' and 'testproc'. + * encoded by the presence of hibernation_ops). However, the user may + * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the + * test modes, 'test' or 'testproc'. * * show() will display what the mode is currently set to. * store() will accept one of @@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = { * 'testproc' * * It will only change to 'platform' if the system - * supports it (as determined from pm_ops->pm_disk_mode). + * supports it (as determined by having hibernation_ops). */ static ssize_t disk_show(struct kset *kset, char *buf) @@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf) int i; char *start = buf; - for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { - if (!pm_disk_modes[i]) + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) continue; switch (i) { - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - case PM_DISK_TEST: - case PM_DISK_TESTPROC: + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: break; - default: - if (pm_ops && pm_ops->enter && - (i == pm_ops->pm_disk_mode)) + case HIBERNATION_PLATFORM: + if (hibernation_ops) break; /* not a valid mode, continue with loop */ continue; } - if (i == pm_disk_mode) - buf += sprintf(buf, "[%s]", pm_disk_modes[i]); + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s", pm_disk_modes[i]); - if (i+1 != PM_DISK_MAX) - buf += sprintf(buf, " "); + buf += sprintf(buf, "%s ", hibernation_modes[i]); } buf += sprintf(buf, "\n"); return buf-start; @@ -387,39 +409,39 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) int i; int len; char *p; - suspend_disk_method_t mode = 0; + int mode = HIBERNATION_INVALID; p = memchr(buf, '\n', n); len = p ? p - buf : n; mutex_lock(&pm_mutex); - for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { - if (!strncmp(buf, pm_disk_modes[i], len)) { + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } } - if (mode) { + if (mode != HIBERNATION_INVALID) { switch (mode) { - case PM_DISK_SHUTDOWN: - case PM_DISK_REBOOT: - case PM_DISK_TEST: - case PM_DISK_TESTPROC: - pm_disk_mode = mode; + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + hibernation_mode = mode; break; - default: - if (pm_ops && pm_ops->enter && - (mode == pm_ops->pm_disk_mode)) - pm_disk_mode = mode; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; else error = -EINVAL; } - } else { + } else error = -EINVAL; - } - pr_debug("PM: suspend-to-disk mode set to '%s'\n", - pm_disk_modes[mode]); + if (!error) + pr_debug("PM: suspend-to-disk mode set to '%s'\n", + hibernation_modes[mode]); mutex_unlock(&pm_mutex); return error ? error : n; } diff --git a/kernel/power/main.c b/kernel/power/main.c index f6dda685e7e2..8812985f3029 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -30,7 +30,6 @@ DEFINE_MUTEX(pm_mutex); struct pm_ops *pm_ops; -suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; /** * pm_set_ops - Set the global power method table. 
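For completeness, a small userspace sketch of how the disk_show()/disk_store() attribute above is typically driven; the /sys/power/disk path and root privileges are assumptions of the example, and none of this code comes from the patch itself.

/*
 * List the available hibernation modes, then select "shutdown".
 * disk_show() prints all valid modes with the active one in brackets;
 * disk_store() compares the written string against hibernation_modes[]
 * and returns -EINVAL for anything it does not recognise.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char buf[128];
        ssize_t n;
        int fd = open("/sys/power/disk", O_RDWR);   /* needs root */

        if (fd < 0) {
                perror("open /sys/power/disk");
                return 1;
        }

        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("modes: %s", buf);
        }

        lseek(fd, 0, SEEK_SET);
        if (write(fd, "shutdown\n", strlen("shutdown\n")) < 0)
                perror("write /sys/power/disk");

        close(fd);
        return 0;
}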
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops) { mutex_lock(&pm_mutex); pm_ops = ops; - if (ops && ops->pm_disk_mode != PM_DISK_INVALID) { - pm_disk_mode = ops->pm_disk_mode; - } else - pm_disk_mode = PM_DISK_SHUTDOWN; mutex_unlock(&pm_mutex); } @@ -102,25 +97,26 @@ static int suspend_prepare(suspend_state_t state) } } - if (pm_ops->prepare) { - if ((error = pm_ops->prepare(state))) - goto Thaw; - } - suspend_console(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "Some devices failed to suspend\n"); - goto Resume_devices; + goto Resume_console; + } + if (pm_ops->prepare) { + if ((error = pm_ops->prepare(state))) + goto Resume_devices; } + error = disable_nonboot_cpus(); if (!error) return 0; enable_nonboot_cpus(); - Resume_devices: pm_finish(state); + Resume_devices: device_resume(); + Resume_console: resume_console(); Thaw: thaw_processes(); @@ -184,24 +180,12 @@ static void suspend_finish(suspend_state_t state) static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", - [PM_SUSPEND_DISK] = "disk", }; static inline int valid_state(suspend_state_t state) { - /* Suspend-to-disk does not really need low-level support. - * It can work with shutdown/reboot if needed. If it isn't - * configured, then it cannot be supported. - */ - if (state == PM_SUSPEND_DISK) -#ifdef CONFIG_SOFTWARE_SUSPEND - return 1; -#else - return 0; -#endif - - /* all other states need lowlevel support and need to be - * valid to the lowlevel implementation, no valid callback + /* All states need lowlevel support and need to be valid + * to the lowlevel implementation, no valid callback * implies that none are valid. */ if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) return 0; @@ -229,11 +213,6 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; - if (state == PM_SUSPEND_DISK) { - error = pm_suspend_disk(); - goto Unlock; - } - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; @@ -251,7 +230,7 @@ static int enter_state(suspend_state_t state) /** * pm_suspend - Externally visible function for suspending system. - * @state: Enumarted value of state to enter. + * @state: Enumerated value of state to enter. * * Determine whether or not value is within range, get state * structure, and enter (above). @@ -289,7 +268,13 @@ static ssize_t state_show(struct kset *kset, char *buf) if (pm_states[i] && valid_state(i)) s += sprintf(s,"%s ", pm_states[i]); } - s += sprintf(s,"\n"); +#ifdef CONFIG_SOFTWARE_SUSPEND + s += sprintf(s, "%s\n", "disk"); +#else + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; +#endif return (s - buf); } @@ -304,8 +289,14 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) p = memchr(buf, '\n', n); len = p ? p - buf : n; + /* First, check if we are requested to hibernate */ + if (len == 4 && !strncmp(buf, "disk", len)) { + error = hibernate(); + return error ? 
error : n; + } + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) break; } if (state < PM_SUSPEND_MAX && *s) diff --git a/kernel/power/power.h b/kernel/power/power.h index 34b43542785a..51381487103f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -25,12 +25,7 @@ struct swsusp_info { */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -extern int pm_suspend_disk(void); -#else -static inline int pm_suspend_disk(void) -{ - return -EPERM; -} +extern struct hibernation_ops *hibernation_ops; #endif extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/process.c b/kernel/power/process.c index 0eb5c420e8ed..088419387388 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -8,7 +8,6 @@ #undef DEBUG -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> @@ -25,10 +24,9 @@ static inline int freezeable(struct task_struct * p) { - if ((p == current) || + if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state == EXIT_ZOMBIE) || - (p->exit_state == EXIT_DEAD)) + (p->exit_state != 0)) return 0; return 1; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 128da11f01c2..a3b7854b8f7c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -14,7 +14,6 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> -#include <linux/smp_lock.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/spinlock.h> @@ -608,7 +607,8 @@ static LIST_HEAD(nosave_regions); */ void __init -register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) +__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, + int use_kmalloc) { struct nosave_region *region; @@ -624,8 +624,13 @@ register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) goto Report; } } - /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + if (use_kmalloc) { + /* during init, this shouldn't fail */ + region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); + BUG_ON(!region); + } else + /* This allocation cannot fail */ + region = alloc_bootmem_low(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -1228,7 +1233,7 @@ asmlinkage int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + printk("swsusp: critical section: done (%d pages copied)\n", nr_pages); return 0; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e83ed9945a80..b8b235cc19d1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,7 +12,6 @@ */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/file.h> #include <linux/utsname.h> #include <linux/version.h> diff --git a/kernel/power/user.c b/kernel/power/user.c index 040560d9c312..24d7d78e6f42 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -130,16 +130,16 @@ static inline int platform_prepare(void) { int error = 0; - if (pm_ops && pm_ops->prepare) - error = pm_ops->prepare(PM_SUSPEND_DISK); + if (hibernation_ops) + error = hibernation_ops->prepare(); return error; } static inline void platform_finish(void) { - if (pm_ops && pm_ops->finish) - pm_ops->finish(PM_SUSPEND_DISK); + if (hibernation_ops) + 
hibernation_ops->finish(); } static inline int snapshot_suspend(int platform_suspend) @@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, switch (arg) { case PMOPS_PREPARE: - if (pm_ops && pm_ops->enter) { + if (hibernation_ops) { data->platform_suspend = 1; error = 0; } else { @@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case PMOPS_ENTER: if (data->platform_suspend) { kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); - error = 0; + error = hibernation_ops->enter(); } break; diff --git a/kernel/printk.c b/kernel/printk.c index 4b47e59248df..0bbdeac2810c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -20,7 +20,6 @@ #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> -#include <linux/smp_lock.h> #include <linux/console.h> #include <linux/init.h> #include <linux/module.h> @@ -931,8 +930,16 @@ void register_console(struct console *console) { int i; unsigned long flags; + struct console *bootconsole = NULL; - if (preferred_console < 0) + if (console_drivers) { + if (console->flags & CON_BOOT) + return; + if (console_drivers->flags & CON_BOOT) + bootconsole = console_drivers; + } + + if (preferred_console < 0 || bootconsole || !console_drivers) preferred_console = selected_console; /* @@ -978,8 +985,11 @@ void register_console(struct console *console) if (!(console->flags & CON_ENABLED)) return; - if (console_drivers && (console_drivers->flags & CON_BOOT)) { - unregister_console(console_drivers); + if (bootconsole) { + printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", + bootconsole->name, bootconsole->index, + console->name, console->index); + unregister_console(bootconsole); console->flags &= ~CON_PRINTBUFFER; } @@ -1030,16 +1040,11 @@ int unregister_console(struct console *console) } } - /* If last console is removed, we re-enable picking the first - * one that gets registered. Without that, pmac early boot console - * would prevent fbcon from taking over. - * + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. 
*/ - if (console_drivers == NULL) - preferred_console = selected_console; - else if (console->flags & CON_CONSDEV) + if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; release_console_sem(); diff --git a/kernel/profile.c b/kernel/profile.c index 9bfadb248dd8..5b20fe977bed 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -26,6 +26,7 @@ #include <asm/sections.h> #include <asm/semaphore.h> #include <asm/irq_regs.h> +#include <asm/ptrace.h> struct profile_hit { u32 pc, hits; @@ -340,6 +341,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { @@ -365,10 +367,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, __free_page(page); return NOTIFY_BAD; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: cpu_set(cpu, prof_cpu_mask); break; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: cpu_clear(cpu, prof_cpu_mask); if (per_cpu(cpu_profile_hits, cpu)[0]) { page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4d50e06fd745..ad7949a589dd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -18,6 +18,7 @@ #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/audit.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -161,6 +162,8 @@ int ptrace_attach(struct task_struct *task) { int retval; + audit_ptrace(task); + retval = -EPERM; if (task->pid <= 1) goto out; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 3554b76da84c..2c2dd8410dc4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: rcu_offline_cpu(cpu); break; default: diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index bcd14e83ef39..55ba82a85a66 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, - &sched_ops, NULL }; - /* * RCU torture writer kthread. 
Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -534,7 +530,7 @@ rcu_torture_writer(void *arg) rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); - if (old_rp != NULL) { + if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -685,7 +681,7 @@ rcu_torture_printk(char *page) atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats != NULL) + if (cur_ops->stats) cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed(current, tmp_mask); - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed(reader_tasks[i], tmp_mask); } - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed(fakewriter_tasks[i], tmp_mask); @@ -808,21 +804,21 @@ rcu_torture_cleanup(void) int i; fullstop = 1; - if (shuffler_task != NULL) { + if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); } shuffler_task = NULL; - if (writer_task != NULL) { + if (writer_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); } writer_task = NULL; - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i] != NULL) { + if (reader_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_reader task"); kthread_stop(reader_tasks[i]); @@ -834,9 +830,9 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i] != NULL) { + if (fakewriter_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_fakewriter task"); kthread_stop(fakewriter_tasks[i]); @@ -847,7 +843,7 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - if (stats_task != NULL) { + if (stats_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); } @@ -858,7 +854,7 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (cur_ops->cleanup != NULL) + if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); @@ -866,27 +862,28 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms("End of test: SUCCESS"); } -static int +static int __init rcu_torture_init(void) { int i; int cpu; int firsterr = 0; + static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &sched_ops, }; /* Process args and tell the world that the torturer is on the job. */ - - for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) { + if (strcmp(torture_type, cur_ops->name) == 0) break; - } } - if (cur_ops == NULL) { + if (i == ARRAY_SIZE(torture_ops)) { printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); return (-EINVAL); } - if (cur_ops->init != NULL) + if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ if (nreaders >= 0) @@ -899,7 +896,7 @@ rcu_torture_init(void) /* Set up the freelist. 
*/ INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { rcu_tortures[i].rtort_mbtest = 0; list_add_tail(&rcu_tortures[i].rtort_free, &rcu_torture_freelist); diff --git a/kernel/relay.c b/kernel/relay.c index 577f251c7e28..4311101b0ca7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @work: work struct that contains the the channel buffer + * @data: contains the channel buffer * - * This is the work function used to defer reader waking. The - * reason waking is deferred is that calling directly from write - * causes problems if you're writing from say the scheduler. + * This is the timer function used to defer reader waking. */ -static void wakeup_readers(struct work_struct *work) +static void wakeup_readers(unsigned long data) { - struct rchan_buf *buf = - container_of(work, struct rchan_buf, wake_readers.work); + struct rchan_buf *buf = (struct rchan_buf *)data; wake_up_interruptible(&buf->read_wait); } @@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - INIT_DELAYED_WORK(&buf->wake_readers, NULL); - } else { - cancel_delayed_work(&buf->wake_readers); - flush_scheduled_work(); - } + setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + } else + del_timer_sync(&buf->timer); buf->subbufs_produced = 0; buf->subbufs_consumed = 0; @@ -447,8 +442,7 @@ end: static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; - cancel_delayed_work(&buf->wake_readers); - flush_scheduled_work(); + del_timer_sync(&buf->timer); kref_put(&buf->kref, relay_remove_buf); } @@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { if (chan->buf[hotcpu]) @@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, mutex_unlock(&relay_channels_mutex); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* No need to flush the cpu : will be flushed upon * final relay_flush() call. */ break; @@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->dentry->d_inode->i_size += buf->chan->subbuf_size - buf->padding[old_subbuf]; smp_mb(); - if (waitqueue_active(&buf->read_wait)) { - PREPARE_DELAYED_WORK(&buf->wake_readers, - wakeup_readers); - schedule_delayed_work(&buf->wake_readers, 1); - } + if (waitqueue_active(&buf->read_wait)) + /* + * Calling wake_up_interruptible() from here + * will deadlock if we happen to be logging + * from the scheduler (trying to re-grab + * rq->lock), so defer it. + */ + __mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 180978cb2f75..12879f6c1ec3 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -56,7 +56,7 @@ * state. 
*/ -static void +void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, unsigned long mask) { @@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) } /* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - -/* * Calculate task priority from the waiter list priority * * Return task->normal_prio when the waiter list is empty or when @@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task) * * This can be both boosting and unboosting. task->pi_lock must be held. */ -static void __rt_mutex_adjust_prio(struct task_struct *task) +void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); @@ -159,11 +136,11 @@ int max_lock_depth = 1024; * Decreases task's usage by one - may thus free the task. * Returns 0 or -EDEADLK. */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, - struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) +int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) { struct rt_mutex *lock; struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * * Must be called with lock->wait_lock held */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856e791e..242ec7ee740b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) } /* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void 
rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); + +extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask); +extern void __rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task); +extern void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); #endif diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 291ded556aa0..9a87886b022e 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem) int ret = __down_write_trylock(sem); if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); return ret; } diff --git a/kernel/sched.c b/kernel/sched.c index 0227f1625a75..799d23b4e35d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -52,8 +52,9 @@ #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> -#include <asm/tlb.h> +#include <linux/reciprocal_div.h> +#include <asm/tlb.h> #include <asm/unistd.h> /* @@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +#ifdef CONFIG_SMP +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 5ms] @@ -223,6 +245,10 @@ struct rq { unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; +#endif #endif unsigned long long nr_switches; @@ -278,7 +304,8 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_MUTEX(sched_hotcpu_mutex); static inline int cpu_of(struct rq *rq) { @@ -1049,6 +1076,17 @@ static void resched_task(struct task_struct *p) if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} #else static inline void resched_task(struct task_struct *p) { @@ -1241,7 +1279,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); if (local_group) { this_load = avg_load; @@ -1368,7 +1407,16 @@ static int wake_idle(int cpu, struct task_struct *p) struct sched_domain *sd; int i; - if (idle_cpu(cpu)) + /* + * If it is idle, then it is the best cpu to run this task. 
+ * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) return cpu; for_each_domain(cpu, sd) { @@ -2352,12 +2400,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } total_load += avg_load; - total_pwr += group->cpu_power; + total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); - group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { this_load = avg_load; @@ -2468,8 +2517,8 @@ group_next: max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->cpu_power, - (avg_load - this_load) * this->cpu_power) + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -2503,28 +2552,29 @@ small_imbalance: * moving them. */ - pwr_now += busiest->cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->cpu_power * - min(this_load_per_task, this_load); + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - busiest->cpu_power; + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); if (max_load > tmp) - pwr_move += busiest->cpu_power * + pwr_move += busiest->__cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->cpu_power < + if (max_load * busiest->__cpu_power < busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = max_load * busiest->cpu_power / this->cpu_power; + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); else - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - this->cpu_power; - pwr_move += this->cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2657,6 +2707,12 @@ redo: double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + /* + * some other cpu did the load balance for us. + */ + if (nr_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); @@ -2927,32 +2983,98 @@ static void update_load(struct rq *this_rq) } } +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), + .cpu_mask = CPU_MASK_NONE, +}; + /* - * run_rebalance_domains is triggered when needed from the scheduler tick. + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. 
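[Editor's sketch] The sg_div_cpu_power()/sg_inc_cpu_power() helpers introduced above avoid a division on every load calculation: a group's __cpu_power changes rarely, so its reciprocal can be cached and each divide becomes a multiply plus a shift. Below is a minimal user-space sketch of that trick, assuming the usual ceil(2^32/k) reciprocal; the function names are stand-ins, not the kernel's lib/reciprocal_div.c symbols.

#include <stdint.h>
#include <stdio.h>

/* Cache ceil(2^32 / k) once; mirrors the idea behind reciprocal_value(). */
static uint32_t recip_value(uint32_t k)
{
        uint64_t val = (1ULL << 32) + (k - 1);
        return (uint32_t)(val / k);
}

/* a / k turns into a multiply and a shift once R = recip_value(k) is known.
 * Accurate for the operand ranges the scheduler uses, not a general divide. */
static uint32_t recip_divide(uint32_t a, uint32_t R)
{
        return (uint32_t)(((uint64_t)a * R) >> 32);
}

int main(void)
{
        uint32_t cpu_power = 1024;              /* e.g. an SCHED_LOAD_SCALE-sized value */
        uint32_t R = recip_value(cpu_power);    /* recomputed only when cpu_power changes */
        uint32_t load = 3500;

        printf("%u / %u = %u, reciprocal version = %u\n",
               (unsigned)load, (unsigned)cpu_power,
               (unsigned)(load / cpu_power), (unsigned)recip_divide(load, R));
        return 0;
}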
If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. + * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_set(cpu, nohz.cpu_mask); + cpu_rq(cpu)->in_nohz_recently = 1; + + /* + * If we are going offline and still the leader, give up! + */ + if (cpu_is_offline(cpu) && + atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + return 0; + } + + /* time for ilb owner also to sleep */ + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpu_isset(cpu, nohz.cpu_mask)) + return 0; + + cpu_clear(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ -static DEFINE_SPINLOCK(balancing); - -static void run_rebalance_domains(struct softirq_action *h) +static inline void rebalance_domains(int cpu, enum idle_type idle) { - int this_cpu = smp_processor_id(), balance = 1; - struct rq *this_rq = cpu_rq(this_cpu); + int balance = 1; + struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; - /* - * We are idle if there are no processes running. This - * is valid even if we are the idle process (SMT). - */ - enum idle_type idle = !this_rq->nr_running ? - SCHED_IDLE : NOT_IDLE; - /* Earliest time when we have to call run_rebalance_domains again */ + /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; - for_each_domain(this_cpu, sd) { + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -2971,7 +3093,7 @@ static void run_rebalance_domains(struct softirq_action *h) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2995,7 +3117,114 @@ out: if (!balance) break; } - this_rq->next_balance = next_balance; + rq->next_balance = next_balance; +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. 
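[Editor's sketch] The nomination logic in select_nohz_load_balancer() reduces to claiming one shared slot with compare-and-swap: a CPU becomes the idle-load-balancing (ilb) owner only if the slot still reads -1, and gives the role up the same way. A stand-alone sketch of that claim/release idiom, with C11 atomics standing in for the kernel's atomic_cmpxchg():

#include <stdatomic.h>
#include <stdio.h>

/* -1 means "no ilb owner", as in the new nohz.load_balancer field. */
static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

/* Become the ilb owner iff the slot is free; returns 1 when this cpu owns it. */
static int claim_ilb(int cpu)
{
        int expected = -1;

        if (atomic_load(&load_balancer) == cpu)
                return 1;                               /* already the owner */
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Drop the role, but only if we really hold it (mirrors the cmpxchg on exit). */
static void release_ilb(int cpu)
{
        int expected = cpu;

        atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
        printf("cpu1 claims: %d\n", claim_ilb(1));      /* 1: slot was free */
        printf("cpu2 claims: %d\n", claim_ilb(2));      /* 0: cpu1 owns it */
        release_ilb(1);
        printf("cpu2 claims: %d\n", claim_ilb(2));      /* 1: free again */
        return 0;
}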
+ */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; + + rebalance_domains(local_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { + cpumask_t cpus = nohz.cpu_mask; + struct rq *rq; + int balance_cpu; + + cpu_clear(local_cpu, cpus); + for_each_cpu_mask(balance_cpu, cpus) { + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, SCHED_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(int cpu) +{ + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. + */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpu_clear(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = first_cpu(nohz.cpu_mask); + + if (ilb != NR_CPUS) + resched_cpu(ilb); + } + } + + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? 
+ */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; + } + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpu_isset(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); } #else /* @@ -3218,16 +3447,17 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); - if (p != rq->idle) + if (!idle_at_tick) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); + rq->idle_at_tick = idle_at_tick; + trigger_load_balance(cpu); #endif } @@ -4291,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return -ESRCH; } @@ -4324,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return retval; } @@ -4381,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); retval = -ESRCH; @@ -4397,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); if (retval) return retval; @@ -4750,6 +4980,8 @@ void show_state_filter(unsigned long state_filter) show_task(p); } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -5157,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) struct rq *rq; switch (action) { + case CPU_LOCK_ACQUIRE: + mutex_lock(&sched_hotcpu_mutex); + break; + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; @@ -5171,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: /* Strictly unneccessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!cpu_rq(cpu)->migration_thread) break; /* Unbind it from offline cpu so it can run. Fall thru. */ @@ -5187,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5202,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if - * they didn't do lock_cpu_hotplug(). Just wake up + * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. 
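[Editor's sketch] A pattern worth noting in the hotplug notifiers touched by this patch (migration_call() here, and the softirq/softlockup callbacks further down): every CPU_* event gains a *_FROZEN twin raised during suspend/resume, handled by simply sharing the case body. A toy sketch of that fall-through style; the event names below are stand-ins, not the kernel constants:

#include <stdio.h>

enum cpu_event { UP_PREPARE, UP_PREPARE_FROZEN, ONLINE, ONLINE_FROZEN,
                 DEAD, DEAD_FROZEN };

static int cpu_callback(enum cpu_event action, int cpu)
{
        switch (action) {
        case UP_PREPARE:
        case UP_PREPARE_FROZEN:         /* frozen twin shares the case body */
                printf("cpu%d: create per-cpu thread\n", cpu);
                break;
        case ONLINE:
        case ONLINE_FROZEN:
                printf("cpu%d: wake per-cpu thread\n", cpu);
                break;
        case DEAD:
        case DEAD_FROZEN:
                printf("cpu%d: clean up per-cpu thread\n", cpu);
                break;
        }
        return 0;
}

int main(void)
{
        cpu_callback(UP_PREPARE, 1);
        cpu_callback(UP_PREPARE_FROZEN, 1);     /* same path when resuming */
        return 0;
}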
*/ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { @@ -5216,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; #endif + case CPU_LOCK_RELEASE: + mutex_unlock(&sched_hotcpu_mutex); + break; } return NOTIFY_OK; } @@ -5304,7 +5547,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } - if (!group->cpu_power) { + if (!group->__cpu_power) { printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5481,7 +5724,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, continue; sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; + sg->__cpu_power = 0; for_each_cpu_mask(j, span) { if (group_fn(j, cpu_map, NULL) != group) @@ -6170,7 +6413,7 @@ next_sg: continue; } - sg->cpu_power += sd->groups->cpu_power; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); } sg = sg->next; if (sg != group_head) @@ -6245,6 +6488,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; + sd->groups->__cpu_power = 0; + /* * For perf policy, if the groups in child domain share resources * (for example cores sharing some portions of the cache hierarchy @@ -6255,18 +6500,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && (child->flags & (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); return; } - sd->groups->cpu_power = 0; - /* * add cpu_power of each child group to this groups cpu_power */ group = child->groups; do { - sd->groups->cpu_power += group->cpu_power; + sg_inc_cpu_power(sd->groups, group->__cpu_power); group = group->next; } while (group != child->groups); } @@ -6426,7 +6669,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, j); sd->groups = sg; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = nodemask; sg->next = sg; cpus_or(covered, covered, nodemask); @@ -6454,7 +6697,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) "Can not alloc domain group for node %d\n", j); goto error; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = tmp; sg->next = prev->next; cpus_or(covered, covered, tmp); @@ -6591,10 +6834,10 @@ int arch_reinit_sched_domains(void) { int err; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return err; } @@ -6673,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb, { switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: + case CPU_ONLINE_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Fall through and re-initialise the domains. 
*/ @@ -6699,12 +6948,12 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 2b4087d545a3..364fc95bf97c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -12,7 +12,6 @@ #include <linux/slab.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/fs.h> @@ -22,6 +21,7 @@ #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> +#include <linux/signalfd.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> @@ -39,125 +39,6 @@ static struct kmem_cache *sigqueue_cachep; -/* - * In POSIX a signal is sent either to a specific thread (Linux task) - * or to the process as a whole (Linux thread group). How the signal - * is sent determines whether it's to one thread or the whole group, - * which determines which signal mask(s) are involved in blocking it - * from being delivered until later. When the signal is delivered, - * either it's caught or ignored by a user handler or it has a default - * effect that applies to the whole thread group (POSIX process). - * - * The possible effects an unblocked signal set to SIG_DFL can have are: - * ignore - Nothing Happens - * terminate - kill the process, i.e. all threads in the group, - * similar to exit_group. The group leader (only) reports - * WIFSIGNALED status to its parent. - * coredump - write a core dump file describing all threads using - * the same mm and then kill all those threads - * stop - stop all the threads in the group, i.e. TASK_STOPPED state - * - * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. - * Other signals when not blocked and set to SIG_DFL behaves as follows. - * The job control signals also have other special effects. 
- * - * +--------------------+------------------+ - * | POSIX signal | default action | - * +--------------------+------------------+ - * | SIGHUP | terminate | - * | SIGINT | terminate | - * | SIGQUIT | coredump | - * | SIGILL | coredump | - * | SIGTRAP | coredump | - * | SIGABRT/SIGIOT | coredump | - * | SIGBUS | coredump | - * | SIGFPE | coredump | - * | SIGKILL | terminate(+) | - * | SIGUSR1 | terminate | - * | SIGSEGV | coredump | - * | SIGUSR2 | terminate | - * | SIGPIPE | terminate | - * | SIGALRM | terminate | - * | SIGTERM | terminate | - * | SIGCHLD | ignore | - * | SIGCONT | ignore(*) | - * | SIGSTOP | stop(*)(+) | - * | SIGTSTP | stop(*) | - * | SIGTTIN | stop(*) | - * | SIGTTOU | stop(*) | - * | SIGURG | ignore | - * | SIGXCPU | coredump | - * | SIGXFSZ | coredump | - * | SIGVTALRM | terminate | - * | SIGPROF | terminate | - * | SIGPOLL/SIGIO | terminate | - * | SIGSYS/SIGUNUSED | coredump | - * | SIGSTKFLT | terminate | - * | SIGWINCH | ignore | - * | SIGPWR | terminate | - * | SIGRTMIN-SIGRTMAX | terminate | - * +--------------------+------------------+ - * | non-POSIX signal | default action | - * +--------------------+------------------+ - * | SIGEMT | coredump | - * +--------------------+------------------+ - * - * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". - * (*) Special job control effects: - * When SIGCONT is sent, it resumes the process (all threads in the group) - * from TASK_STOPPED state and also clears any pending/queued stop signals - * (any of those marked with "stop(*)"). This happens regardless of blocking, - * catching, or ignoring SIGCONT. When any stop signal is sent, it clears - * any pending/queued SIGCONT signals; this happens regardless of blocking, - * catching, or ignored the stop signal, though (except for SIGSTOP) the - * default action of stopping the process may happen later or never. - */ - -#ifdef SIGEMT -#define M_SIGEMT M(SIGEMT) -#else -#define M_SIGEMT 0 -#endif - -#if SIGRTMIN > BITS_PER_LONG -#define M(sig) (1ULL << ((sig)-1)) -#else -#define M(sig) (1UL << ((sig)-1)) -#endif -#define T(sig, mask) (M(sig) & (mask)) - -#define SIG_KERNEL_ONLY_MASK (\ - M(SIGKILL) | M(SIGSTOP) ) - -#define SIG_KERNEL_STOP_MASK (\ - M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) ) - -#define SIG_KERNEL_COREDUMP_MASK (\ - M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \ - M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \ - M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT ) - -#define SIG_KERNEL_IGNORE_MASK (\ - M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) ) - -#define sig_kernel_only(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK)) -#define sig_kernel_coredump(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK)) -#define sig_kernel_ignore(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK)) -#define sig_kernel_stop(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) - -#define sig_needs_tasklist(sig) ((sig) == SIGCONT) - -#define sig_user_defined(t, signr) \ - (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ - ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) - -#define sig_fatal(t, signr) \ - (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ - (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) static int sig_ignored(struct task_struct *t, int sig) { @@ -233,8 +114,7 @@ void recalc_sigpending(void) /* Given the mask, find the first available signal that should be serviced. 
*/ -static int -next_signal(struct sigpending *pending, sigset_t *mask) +int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; @@ -329,6 +209,16 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +void ignore_signals(struct task_struct *t) +{ + int i; + + for (i = 0; i < _NSIG; ++i) + t->sighand->action[i].sa.sa_handler = SIG_IGN; + + flush_signals(t); +} + /* * Flush all handlers for a task. */ @@ -607,6 +497,11 @@ static int check_kill_permission(int sig, struct siginfo *info, int error = -EINVAL; if (!valid_signal(sig)) return error; + + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || @@ -616,10 +511,7 @@ static int check_kill_permission(int sig, struct siginfo *info, && !capable(CAP_KILL)) return error; - error = security_task_kill(t, info, sig, 0); - if (!error) - audit_signal_info(sig, t); /* Let audit system see the signal */ - return error; + return security_task_kill(t, info, sig, 0); } /* forward decl */ @@ -740,6 +632,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, int ret = 0; /* + * Deliver the signal to listening signalfds. This must be called + * with the sighand lock held. + */ + signalfd_notify(t, sig); + + /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ @@ -1033,17 +931,6 @@ void zap_other_threads(struct task_struct *p) if (t->exit_state) continue; - /* - * We don't want to notify the parent, since we are - * killed as part of a thread group due to another - * thread doing an execve() or similar. So set the - * exit signal to -1 to allow immediate reaping of - * the process. But don't detach the thread group - * leader. - */ - if (t != p->group_leader) - t->exit_signal = -1; - /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); @@ -1401,6 +1288,11 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) ret = 1; goto out; } + /* + * Deliver the signal to listening signalfds. This must be called + * with the sighand lock held. + */ + signalfd_notify(p, sig); list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); @@ -1444,6 +1336,11 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) q->info.si_overrun++; goto out; } + /* + * Deliver the signal to listening signalfds. This must be called + * with the sighand lock held. + */ + signalfd_notify(p, sig); /* * Put this signal on the shared-pending queue. @@ -2104,6 +2001,8 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) /* * If you change siginfo_t structure, please be sure * this code is fixed accordingly. + * Please remember to update the signalfd_copyinfo() function + * inside fs/signalfd.c too, in case siginfo_t changes. * It should never copy any pad contained in the structure * to avoid security leaks, but must copy the generic * 3 ints plus the relevant union member. 
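[Editor's sketch] Since next_signal() is now exported from this file, a compact illustration of what it computes may help: roughly, the lowest-numbered signal that is pending and not blocked. The sketch below models a single word of the signal set (the real function walks all _NSIG_WORDS words; __builtin_ctzll is a GCC/Clang builtin used here instead of the kernel's bit ops):

#include <stdint.h>
#include <stdio.h>

/* Single-word model of next_signal(): first bit set in (pending & ~blocked),
 * 1-based, or 0 if nothing is deliverable. */
static int next_signal_1word(uint64_t pending, uint64_t blocked)
{
        uint64_t x = pending & ~blocked;

        return x ? __builtin_ctzll(x) + 1 : 0;
}

int main(void)
{
        uint64_t pending = (1ULL << (2 - 1)) | (1ULL << (10 - 1)); /* signals 2 and 10 */
        uint64_t blocked = (1ULL << (2 - 1));                      /* signal 2 blocked */

        printf("next deliverable signal: %d\n",
               next_signal_1word(pending, blocked));               /* prints 10 */
        return 0;
}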
diff --git a/kernel/softirq.c b/kernel/softirq.c index 8b75008e2bd8..0b9886a00e74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); @@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, per_cpu(ksoftirqd, hotcpu) = p; break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: wake_up_process(per_cpu(ksoftirqd, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!per_cpu(ksoftirqd, hotcpu)) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: + case CPU_DEAD_FROZEN: p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; kthread_stop(p); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 50afeb813305..0131e296ffb4 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -34,12 +34,32 @@ static struct notifier_block panic_block = { .notifier_call = softlock_panic, }; +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(void) +{ + return sched_clock() >> 30; /* 2^30 ~= 10^9 */ +} + void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = jiffies; + __raw_get_cpu_var(touch_timestamp) = get_timestamp(); } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* Cause each CPU to re-update its timestamp rather than complain */ + for_each_online_cpu(cpu) + per_cpu(touch_timestamp, cpu) = 0; +} +EXPORT_SYMBOL(touch_all_softlockup_watchdogs); + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: @@ -48,9 +68,18 @@ void softlockup_tick(void) { int this_cpu = smp_processor_id(); unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); + unsigned long print_timestamp; + unsigned long now; + + if (touch_timestamp == 0) { + touch_softlockup_watchdog(); + return; + } + + print_timestamp = per_cpu(print_timestamp, this_cpu); - /* prevent double reports: */ - if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || + /* report at most once a second */ + if (print_timestamp < (touch_timestamp + 1) || did_panic || !per_cpu(watchdog_task, this_cpu)) return; @@ -61,12 +90,14 @@ void softlockup_tick(void) return; } + now = get_timestamp(); + /* Wake up the high-prio watchdog task every second: */ - if (time_after(jiffies, touch_timestamp + HZ)) + if (now > (touch_timestamp + 1)) wake_up_process(per_cpu(watchdog_task, this_cpu)); /* Warn about unreasonable 10+ seconds delays: */ - if (time_after(jiffies, touch_timestamp + 10*HZ)) { + if (now > (touch_timestamp + 10)) { per_cpu(print_timestamp, this_cpu) = touch_timestamp; spin_lock(&print_lock); @@ -82,11 +113,14 @@ void softlockup_tick(void) */ static int watchdog(void * __bind_cpu) { - struct sched_param param = { .sched_priority = 99 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); current->flags |= PF_NOFREEZE; + /* initialize timestamp */ + touch_softlockup_watchdog(); + /* * Run briefly once per second to reset the softlockup timestamp. 
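[Editor's sketch] The get_timestamp() helper added to softlockup.c leans on the fact that 2^30 ns is about 1.074 s, so shifting sched_clock() right by 30 yields a roughly once-per-second counter without a 64-bit divide; the watchdog only compares these timestamps against small thresholds (1 and 10), so the ~7% long unit is harmless. A quick user-space check of the approximation:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t one_hour_ns = 3600ULL * 1000000000ULL;

        uint64_t exact  = one_hour_ns / 1000000000ULL;  /* 3600 seconds */
        uint64_t approx = one_hour_ns >> 30;            /* 3352 units of ~1.074 s */

        printf("exact: %llu s, shifted: %llu units of ~1.074 s\n",
               (unsigned long long)exact, (unsigned long long)approx);
        return 0;
}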
* If this gets delayed for more than 10 seconds then the @@ -112,27 +146,31 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: BUG_ON(per_cpu(watchdog_task, hotcpu)); p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); if (IS_ERR(p)) { printk("watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = jiffies; + per_cpu(touch_timestamp, hotcpu) = 0; per_cpu(watchdog_task, hotcpu) = p; kthread_bind(p, hotcpu); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); case CPU_DEAD: + case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); per_cpu(watchdog_task, hotcpu) = NULL; kthread_stop(p); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12458040e665..fcee2a8e6da3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,12 +1,15 @@ /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ -#include <linux/stop_machine.h> -#include <linux/kthread.h> -#include <linux/sched.h> #include <linux/cpu.h> #include <linux/err.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/stop_machine.h> #include <linux/syscalls.h> +#include <linux/interrupt.h> + #include <asm/atomic.h> #include <asm/semaphore.h> #include <asm/uaccess.h> @@ -44,6 +47,7 @@ static int stopmachine(void *cpu) if (stopmachine_state == STOPMACHINE_DISABLE_IRQ && !irqs_disabled) { local_irq_disable(); + hard_irq_disable(); irqs_disabled = 1; /* Ack: irqs disabled. */ smp_mb(); /* Must read state first. */ @@ -123,6 +127,7 @@ static int stop_machine(void) /* Make them disable irqs. */ local_irq_disable(); + hard_irq_disable(); stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); return 0; @@ -208,3 +213,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) return ret; } +EXPORT_SYMBOL_GPL(stop_machine_run); diff --git a/kernel/sys.c b/kernel/sys.c index fe1f3ab20477..872271ccc384 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> +#include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> #include <linux/workqueue.h> @@ -29,6 +30,7 @@ #include <linux/signal.h> #include <linux/cn_proc.h> #include <linux/getcpu.h> +#include <linux/task_io_accounting_ops.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -134,19 +136,39 @@ static int notifier_chain_unregister(struct notifier_block **nl, return -ENOENT; } +/** + * notifier_call_chain - Informs the registered notifiers about an event. + * @nl: Pointer to head of the blocking notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: Number of notifier functions to be called. Don't care + * value of this parameter is -1. + * @nr_calls: Records the number of notifications sent. Don't care + * value of this field is NULL. + * @returns: notifier_call_chain returns the value returned by the + * last notifier function called. 
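[Editor's sketch] The reworked notifier_call_chain() adds a call budget (nr_to_call) and an optional call counter (nr_calls); the unchanged public wrappers pass -1 and NULL to keep the old unlimited behaviour. A self-contained sketch of that walk, where the struct, names and the simplified stop check are stand-ins for the kernel's notifier_block and NOTIFY_* values:

#include <stdio.h>

#define NOTIFY_DONE      0
#define NOTIFY_STOP_MASK 0x8000

struct nb {
        int (*call)(struct nb *, unsigned long, void *);
        struct nb *next;
};

/* Walk the chain, honouring the call budget and reporting the call count. */
static int call_chain(struct nb *head, unsigned long val, void *v,
                      int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;

        while (head && nr_to_call) {
                ret = head->call(head, val, v);
                if (nr_calls)
                        (*nr_calls)++;
                if (ret & NOTIFY_STOP_MASK)     /* simplified stop check */
                        break;
                head = head->next;
                nr_to_call--;                   /* -1 never hits 0: unlimited */
        }
        return ret;
}

static int hello(struct nb *nb, unsigned long val, void *v)
{
        printf("notifier called with val=%lu\n", val);
        return NOTIFY_DONE;
}

int main(void)
{
        struct nb b = { hello, NULL }, a = { hello, &b };
        int calls = 0;

        call_chain(&a, 7, NULL, -1, &calls);    /* unlimited, like the old API */
        printf("ran %d notifiers\n", calls);
        return 0;
}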
+ */ + static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v) + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); - while (nb) { + + while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); + + if (nr_calls) + (*nr_calls)++; + if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; nb = next_nb; + nr_to_call--; } return ret; } @@ -205,10 +227,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** - * atomic_notifier_call_chain - Call functions in an atomic notifier chain + * __atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See the comment for notifier_call_chain. + * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. @@ -222,19 +246,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) +int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) +{ + return __atomic_notifier_call_chain(nh, val, v, -1, NULL); +} + +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. @@ -304,10 +336,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); /** - * blocking_notifier_call_chain - Call functions in a blocking notifier chain + * __blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -320,8 +354,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * of the last notifier function called. 
*/ -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) +int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; @@ -332,12 +367,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, */ if (rcu_dereference(nh->head)) { down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, + nr_calls); up_read(&nh->rwsem); } return ret; } +EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) +{ + return __blocking_notifier_call_chain(nh, val, v, -1, NULL); +} EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* @@ -383,10 +425,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); /** - * raw_notifier_call_chain - Call functions in a raw notifier chain + * __raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. @@ -400,10 +444,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * of the last notifier function called. */ +int __raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); +} + +EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); + int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { - return notifier_call_chain(&nh->head, val, v); + return __raw_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -478,10 +531,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** - * srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. @@ -494,18 +549,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * of the last notifier function called. 
*/ -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) +int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); srcu_read_unlock(&nh->srcu, idx); return ret; } +EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + return __srcu_notifier_call_chain(nh, val, v, -1, NULL); +} EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** @@ -598,7 +660,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) int error = -EINVAL; struct pid *pgrp; - if (which > 2 || which < 0) + if (which > PRIO_USER || which < PRIO_PROCESS) goto out; /* normalize: avoid signed division (rounding problems) */ @@ -662,7 +724,7 @@ asmlinkage long sys_getpriority(int which, int who) long niceval, retval = -ESRCH; struct pid *pgrp; - if (which > 2 || which < 0) + if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; read_lock(&tasklist_lock); @@ -881,7 +943,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { - int ret = pm_suspend(PM_SUSPEND_DISK); + int ret = hibernate(); unlock_kernel(); return ret; } @@ -1292,7 +1354,7 @@ asmlinkage long sys_setfsuid(uid_t uid) } /* - * Samma på svenska.. + * Samma pÃ¥ svenska.. */ asmlinkage long sys_setfsgid(gid_t gid) { @@ -1426,7 +1488,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (process_group(p) != pgid) { detach_pid(p, PIDTYPE_PGID); p->signal->pgrp = pgid; - attach_pid(p, PIDTYPE_PGID, pgid); + attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); } err = 0; @@ -1923,6 +1985,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (retval) return retval; + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + task_lock(current->group_leader); *old_rlim = new_rlim; task_unlock(current->group_leader); @@ -1944,15 +2016,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) unsigned long rlim_cur = new_rlim.rlim_cur; cputime_t cputime; - if (rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". 
So let's cheat and make it one second - * instead - */ - rlim_cur = 1; - } cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(¤t->sighand->siglock); @@ -2021,6 +2084,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; if (who == RUSAGE_CHILDREN) break; @@ -2032,6 +2097,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; t = p; do { utime = cputime_add(utime, t->utime); @@ -2040,6 +2107,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); t = next_thread(t); } while (t != p); break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d7306d0f3dfc..7e11e2c98bf9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -141,3 +141,10 @@ cond_syscall(compat_sys_migrate_pages); cond_syscall(sys_bdflush); cond_syscall(sys_ioprio_set); cond_syscall(sys_ioprio_get); + +/* New file descriptors */ +cond_syscall(sys_signalfd); +cond_syscall(sys_timerfd); +cond_syscall(compat_sys_signalfd); +cond_syscall(compat_sys_timerfd); +cond_syscall(sys_eventfd); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c904748f2290..30ee462ee79f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,8 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int maps_protect; +extern int sysctl_stat_interval; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -225,7 +227,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 128, + .maxlen = CORENAME_MAX_SIZE, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, @@ -603,6 +605,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_FS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "maps_protect", + .data = &maps_protect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; @@ -846,6 +858,17 @@ static ctl_table vm_table[] = { .extra2 = &one_hundred, }, #endif +#ifdef CONFIG_SMP + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { diff --git a/kernel/time.c b/kernel/time.c index ba18ec4899bd..f04791f69408 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -31,7 +31,6 @@ #include <linux/timex.h> #include <linux/capability.h> #include <linux/errno.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> @@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb) } EXPORT_SYMBOL(current_fs_time); +/* + * Convert jiffies to milliseconds and back. 
+ * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int inline jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int inline jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + /** * timespec_trunc - Truncate timespec to a granularity * @t: Timespec @@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec) EXPORT_SYMBOL(ns_to_timeval); /* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/* * When we convert to jiffies then we interpret incoming values * the following way: * diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 93bccba1f265..99b6034fc86b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fe5c7db24247..51b6a6a6158c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -74,15 +74,17 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; +static unsigned long watchdog_resumed; + /* - * Interval: 0.5sec Treshold: 0.0625s + * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) static void clocksource_ratewd(struct clocksource *cs, int64_t delta) { - if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) return; printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", @@ -98,15 +100,24 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs, *tmp; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; + int resumed; spin_lock(&watchdog_lock); + resumed = test_and_clear_bit(0, &watchdog_resumed); + wdnow = watchdog->read(); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { csnow = 
cs->read(); + + if (unlikely(resumed)) { + cs->wd_last = csnow; + continue; + } + /* Initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && @@ -136,6 +147,11 @@ static void clocksource_watchdog(unsigned long data) } spin_unlock(&watchdog_lock); } +static void clocksource_resume_watchdog(void) +{ + set_bit(0, &watchdog_resumed); +} + static void clocksource_check_watchdog(struct clocksource *cs) { struct clocksource *cse; @@ -182,9 +198,34 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } + +static inline void clocksource_resume_watchdog(void) { } #endif /** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ + struct list_head *tmp; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs->resume) + cs->resume(); + } + + clocksource_resume_watchdog(); + + spin_unlock_irqrestore(&clocksource_lock, flags); +} + +/** * clocksource_get_next - Returns the selected clocksource * */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cb25649c6f50..87aa5ff931e0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -11,6 +11,8 @@ #include <linux/mm.h> #include <linux/time.h> #include <linux/timex.h> +#include <linux/jiffies.h> +#include <linux/hrtimer.h> #include <asm/div64.h> #include <asm/timex.h> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bfda3f7f0716..a96ec9ab3454 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -static int tick_do_timer_cpu = -1; +int tick_do_timer_cpu __read_mostly = -1; DEFINE_SPINLOCK(tick_device_lock); /* @@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } + /* Transfer the do_timer job away from this cpu */ + if (*cpup == tick_do_timer_cpu) { + int cpu = first_cpu(cpu_online_map); + + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c9d203bde518..bb13f2724905 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 51556b95f60f..3483e6cb9549 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { + if (select_nohz_load_balancer(1)) { + /* + * sched tick not stopped! 
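[Editor's sketch] Looking back at the jiffies_to_msecs()/jiffies_to_usecs() helpers moved and inlined in kernel/time.c above: the three branches are easiest to see with numbers. The kernel resolves the branch with #if at build time; the sketch below takes HZ as a runtime parameter purely so the cases can be compared side by side:

#include <stdio.h>

#define MSEC_PER_SEC 1000UL

static unsigned long j_to_ms(unsigned long j, unsigned long hz)
{
        if (hz <= MSEC_PER_SEC && !(MSEC_PER_SEC % hz))
                return (MSEC_PER_SEC / hz) * j;         /* HZ=100,250,1000: pure multiply */
        if (hz > MSEC_PER_SEC && !(hz % MSEC_PER_SEC))
                return (j + (hz / MSEC_PER_SEC) - 1) / (hz / MSEC_PER_SEC);
        return (j * MSEC_PER_SEC) / hz;                 /* generic fallback */
}

int main(void)
{
        printf("HZ=250:  5 jiffies = %lu ms\n", j_to_ms(5, 250));   /* 20 */
        printf("HZ=1000: 5 jiffies = %lu ms\n", j_to_ms(5, 1000));  /* 5 */
        printf("HZ=1024: 5 jiffies = %lu ms\n", j_to_ms(5, 1024));  /* 4, generic path */
        return 0;
}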
+ */ + cpu_clear(cpu, nohz_cpu_mask); + goto out; + } + ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; } + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + /* * calculate the expiry time for the next timer wheel * timer @@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void) now = ktime_get(); local_irq_disable(); + select_nohz_load_balancer(0); tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); @@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); + int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; + /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * When we are idle and the tick is stopped, we have to touch @@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct hrtimer_cpu_base *base = timer->base->cpu_base; struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; +#endif /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 000000000000..3d1042f82a68 --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,478 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/sysdev.h> +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/tick.h> + + +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime and avenrun. + */ +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + +EXPORT_SYMBOL(xtime_lock); + + +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. 
Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + */ +struct timespec xtime __attribute__ ((aligned (16))); +struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + +EXPORT_SYMBOL(xtime); + + +static struct clocksource *clock; /* pointer to current clocksource */ + + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + clock->error = 0; + ntp_clear(); + + update_vsyscall(&xtime, clock); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static void change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + + new = clocksource_get_next(); + + if (clock == 
new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); +} +#else +static inline void change_clocksource(void) { } +#endif + +/** + * timekeeping_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { + seq = read_seqbegin(&xtime_lock); + + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + unsigned long sec = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + clock->cycle_last = clocksource_read(clock); + + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} + +/* flag for if timekeeping is suspended */ +static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/etc are + * still managed by arch specific suspend/resume code. 
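
The GENERIC_TIME read paths above (__get_realtime_clock_ts(), timekeeping_is_continuous()) all follow the same seqlock reader discipline: sample the sequence count, copy the protected data, and retry if a writer slipped in between. A minimal sketch of that idiom with a private, hypothetical seqlock and payload (demo_lock, demo_time and demo_offset_ns are illustrative stand-ins, not kernel symbols):

#include <linux/seqlock.h>
#include <linux/time.h>

static DEFINE_SEQLOCK(demo_lock);	/* stand-in for xtime_lock */
static struct timespec demo_time;	/* stand-in for xtime */
static s64 demo_offset_ns;		/* stand-in for __get_nsec_offset() */

static void demo_get_time(struct timespec *ts)
{
	unsigned long seq;
	s64 nsecs;

	do {
		/* snapshot the sequence count before touching the data */
		seq = read_seqbegin(&demo_lock);

		*ts = demo_time;
		nsecs = demo_offset_ns;

		/* if a writer bumped the count meanwhile, read again */
	} while (read_seqretry(&demo_lock, seq));

	timespec_add_ns(ts, nsecs);
}

Writers take the lock with write_seqlock_irqsave(), as do_settimeofday() does above, so readers never block them; readers only pay the cost of an occasional retry.
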
+ */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long now = read_persistent_clock(); + + clocksource_resume(); + + write_seqlock_irqsave(&xtime_lock, flags); + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ + clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); + + /* Resume hrtimers */ + hres_timers_resume(); + + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); + write_sequnlock_irqrestore(&xtime_lock, flags); + + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead even further + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) +{ + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; + + /* + * Use the current error value to determine how much to look ahead. + * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). + */ + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; + + /* + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. + */ + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; + } + for (adj = 0; error > i; adj++) + error >>= 1; + + *interval <<= adj; + *offset <<= adj; + return mult << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. 
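
clocksource_bigadjust() above and clocksource_adjust() below both lean on the clocksource fixed-point conversion ns = (cycles * mult) >> shift, the same shape cyc2ns() gives __get_nsec_offset(): nudging mult by one changes the effective clock rate by one part in 2^shift, which is the knob the NTP error feedback turns. A self-contained sketch with made-up parameters (a hypothetical 10 MHz counter and a shift of 22):

#include <linux/types.h>

#define DEMO_SHIFT	22
#define DEMO_MULT	(100u << DEMO_SHIFT)	/* 100 ns per cycle at 10 MHz */

/* fixed-point multiply, then shift back down; the delta is assumed small
 * (it is masked against clock->mask in the real code), so no overflow */
static inline s64 demo_cyc2ns(u64 cycle_delta, u32 mult)
{
	return (s64)((cycle_delta * mult) >> DEMO_SHIFT);
}

With these numbers demo_cyc2ns(delta, DEMO_MULT) returns delta * 100, while demo_cyc2ns(delta, DEMO_MULT + 1) runs roughly 0.24 ppm faster; that granularity is what clocksource_adjust() trades off against the accumulated error.
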
+ */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); + } else if (error < -interval) { + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +void update_wall_time(void) +{ + cycle_t offset; + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. + */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + xtime.tv_sec++; + second_overflow(); + } + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + change_clocksource(); + update_vsyscall(&xtime, clock); +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 59df5e8555a8..8bbcfb77f7d2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); static void print_name_offset(struct seq_file *m, void *sym) { - unsigned long addr = (unsigned long)sym; - char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; - const char *sym_name; - char *modname; - - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); - if (sym_name) - SEQ_printf(m, "%s", sym_name); - else + char symname[KSYM_NAME_LEN+1]; + + if (lookup_symbol_name((unsigned long)sym, symname) < 0) SEQ_printf(m, "<%p>", sym); + else + SEQ_printf(m, "%s", symname); } static void @@ -70,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), (unsigned long 
long)(ktime_to_ns(timer->expires) - now)); } @@ -116,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .index: %d\n", base->index); - SEQ_printf(m, " .resolution: %Ld nsecs\n", + SEQ_printf(m, " .resolution: %Lu nsecs\n", (unsigned long long)ktime_to_ns(base->resolution)); SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); #ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Ld nsecs\n", - ktime_to_ns(base->offset)); + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now); @@ -140,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) print_base(m, cpu_base->clock_base + i, now); } #define P(x) \ - SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) #define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ - (u64)(ktime_to_ns(cpu_base->x))) + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) #ifdef CONFIG_HIGH_RES_TIMERS P_ns(expires_next); @@ -155,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_TICK_ONESHOT # define P(x) \ - SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) # define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ - (u64)(ktime_to_ns(ts->x))) + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); @@ -172,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(last_jiffies); P(next_jiffies); P_ns(idle_expires); - SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long long)jiffies); } #endif diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1bc4882e28e0..868f1bceb07f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { - char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; - const char *sym_name; - char *modname; - - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); - if (sym_name) - seq_printf(m, "%s", sym_name); - else + char symname[KSYM_NAME_LEN+1]; + + if (lookup_symbol_name(addr, symname) < 0) seq_printf(m, "<%p>", (void *)addr); + else + seq_printf(m, "%s", symname); } static int tstats_show(struct seq_file *m, void *v) diff --git a/kernel/timer.c b/kernel/timer.c index b22bd39740dd..5ec5490f8d85 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, kernel timekeeping, basic process system calls + * Kernel internal timers, basic process system calls * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -74,7 +74,7 @@ struct tvec_t_base_s { tvec_t tv3; tvec_t tv4; tvec_t tv5; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned; typedef struct tvec_t_base_s tvec_base_t; @@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/* + * Note that all tvec_bases is 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. 
Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(tvec_base_t *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline tvec_base_t *tbase_get_base(tvec_base_t *base) +{ + return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, tvec_base_t *new_base) +{ + timer->base = (tvec_base_t *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} + /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded @@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer) } EXPORT_SYMBOL(init_timer); +void fastcall init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + static inline void detach_timer(struct timer_list *timer, int clear_pending) { @@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, tvec_base_t *base; for (;;) { - base = timer->base; + tvec_base_t *prelock_base = timer->base; + base = tbase_get_base(prelock_base); if (likely(base != NULL)) { spin_lock_irqsave(&base->lock, *flags); - if (likely(base == timer->base)) + if (likely(prelock_base == timer->base)) return base; /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); @@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer->base = NULL; + timer_set_base(timer, NULL); spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->base = base; + timer_set_base(timer, base); } } @@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer->base = base; + timer_set_base(timer, base); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); } @@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * don't have to detach them individually. */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(timer->base != base); + BUG_ON(tbase_get_base(timer->base) != base); internal_add_timer(base, timer); } @@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base) void (*fn)(unsigned long); unsigned long data; - timer = list_entry(head->next,struct timer_list,entry); + timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; @@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + found = 1; expires = nte->expires; /* Look at the cascade bucket(s)? 
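
The deferrable-timer plumbing above stores a one-bit flag in the low bit of timer->base, which is free because the base is at least 2-byte aligned. The same tag-in-the-pointer trick, reduced to a self-contained sketch around a hypothetical struct (the demo_* names are illustrative):

#define DEMO_TAG	0x1UL

struct demo_base {
	long payload;	/* natural alignment keeps bit 0 of any pointer clear */
};

static inline struct demo_base *demo_set_tag(struct demo_base *p)
{
	return (struct demo_base *)((unsigned long)p | DEMO_TAG);
}

static inline unsigned long demo_get_tag(struct demo_base *p)
{
	return (unsigned long)p & DEMO_TAG;
}

static inline struct demo_base *demo_strip_tag(struct demo_base *p)
{
	return (struct demo_base *)((unsigned long)p & ~DEMO_TAG);
}

Everything that dereferences the pointer must strip the tag first, which is why lock_timer_base() above compares the raw prelock_base value but locks tbase_get_base(prelock_base), and why init_timers_cpu() later warns if a freshly allocated base ever comes back with bit 0 set.
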
*/ @@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void) #endif -/******************************************************************/ - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); - -EXPORT_SYMBOL(xtime); - - -/* XXX - all of this timekeeping code should be later moved to time.c */ -#include <linux/clocksource.h> -static struct clocksource *clock; /* pointer to current clocksource */ - -#ifdef CONFIG_GENERIC_TIME -/** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook - * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) - */ -static inline s64 __get_nsec_offset(void) -{ - cycle_t cycle_now, cycle_delta; - s64 ns_offset; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; -} - -/** - * __get_realtime_clock_ts - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. Used by - * do_gettimeofday() and get_realtime_clock_ts(). - */ -static inline void __get_realtime_clock_ts(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - nsecs = __get_nsec_offset(); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. 
- */ -void getnstimeofday(struct timespec *ts) -{ - __get_realtime_clock_ts(ts); -} - -EXPORT_SYMBOL(getnstimeofday); - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using get_realtime_clock_ts() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - __get_realtime_clock_ts(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - -EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(struct timespec *tv) -{ - unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - nsec -= __get_nsec_offset(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - clock->error = 0; - ntp_clear(); - - update_vsyscall(&xtime, clock); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new clocksource - */ -static void change_clocksource(void) -{ - struct clocksource *new; - cycle_t now; - u64 nsec; - - new = clocksource_get_next(); - - if (clock == new) - return; - - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - - tick_clock_notify(); - - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); -} -#else -static inline void change_clocksource(void) { } -#endif - -/** - * timekeeping_is_continuous - check to see if timekeeping is free running - */ -int timekeeping_is_continuous(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; -} - -/** - * read_persistent_clock - Return time in seconds from the persistent clock. - * - * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. 
- */ -unsigned long __attribute__((weak)) read_persistent_clock(void) -{ - return 0; -} - -/* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - unsigned long flags; - unsigned long sec = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_clear(); - - clock = clocksource_get_next(); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - write_sequnlock_irqrestore(&xtime_lock, flags); -} - -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; -/* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; - -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. - */ -static int timekeeping_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long now = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - } - /* re-base the last cycle value */ - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - - touch_softlockup_watchdog(); - - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ - hres_timers_resume(); - - return 0; -} - -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); - write_sequnlock_irqrestore(&xtime_lock, flags); - - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) -{ - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(timekeeping_init_device); - -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
- */ - error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = current_tick_length() >> - (TICK_LENGTH_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) -{ - s64 error, interval = clock->cycle_interval; - int adj; - - error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); - if (error > interval) { - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = clocksource_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = clocksource_bigadjust(error, &interval, &offset); - } else - return; - - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (TICK_LENGTH_SHIFT - clock->shift); -} - -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. - */ -static void update_wall_time(void) -{ - cycle_t offset; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - -#ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; -#else - offset = clock->cycle_interval; -#endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. - */ - while (offset >= clock->cycle_interval) { - /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; - - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; - xtime.tv_sec++; - second_overflow(); - } - - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - - /* accumulate error between NTP and clock interval */ - clock->error += current_tick_length(); - clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); - } - - /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); - - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - - /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); -} - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. 
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks) } /* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - -EXPORT_SYMBOL(xtime_lock); - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) @@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu) cpu_to_node(cpu)); if (!base) return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { @@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) struct timer_list *timer; while (!list_empty(head)) { - timer = list_entry(head->next, struct timer_list, entry); + timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); - timer->base = new_base; + timer_set_base(timer, new_base); internal_add_timer(new_base, timer); } } @@ -1701,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch(action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: if (init_timers_cpu(cpu) < 0) return NOTIFY_BAD; break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_timers(cpu); break; #endif diff --git a/kernel/uid16.c b/kernel/uid16.c index 187e2a423878..dd308ba4e03b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -6,7 +6,6 @@ #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index c859164a6993..160c8c5136bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) } /* - * unshare the current process' utsname namespace. - * called only in sys_unshare() - */ -int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) -{ - if (unshare_flags & CLONE_NEWUTS) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_uts = clone_uts_ns(current->nsproxy->uts_ns); - if (!*new_uts) - return -ENOMEM; - } - - return 0; -} - -/* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In latter case, changes to the * utsname of this process won't be seen by parent, and vice * versa. 
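
The utsname rework below keeps the lifetime rules simple: copy_utsname() either returns the old namespace with its reference raised or a fresh clone, and the last put releases it through free_uts_ns(). That is the standard kref pattern; a minimal sketch with a hypothetical refcounted object (demo_ns is illustrative):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ns {
	struct kref kref;
	/* ... namespace payload ... */
};

static void demo_ns_release(struct kref *kref)
{
	struct demo_ns *ns = container_of(kref, struct demo_ns, kref);

	kfree(ns);
}

static struct demo_ns *new_demo_ns(void)
{
	struct demo_ns *ns = kzalloc(sizeof(*ns), GFP_KERNEL);

	if (ns)
		kref_init(&ns->kref);	/* starts with one reference */
	return ns;
}

static struct demo_ns *get_demo_ns(struct demo_ns *ns)
{
	kref_get(&ns->kref);
	return ns;
}

static void put_demo_ns(struct demo_ns *ns)
{
	/* frees the object when the last reference is dropped */
	kref_put(&ns->kref, demo_ns_release);
}
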
*/ -int copy_utsname(int flags, struct task_struct *tsk) +struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; - int err = 0; - - if (!old_ns) - return 0; + BUG_ON(!old_ns); get_uts_ns(old_ns); if (!(flags & CLONE_NEWUTS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } + return old_ns; new_ns = clone_uts_ns(old_ns); - if (!new_ns) { - err = -ENOMEM; - goto out; - } - tsk->nsproxy->uts_ns = new_ns; -out: put_uts_ns(old_ns); - return err; + return new_ns; } void free_uts_ns(struct kref *kref) diff --git a/kernel/wait.c b/kernel/wait.c index 59a82f63275d..444ddbfaefc4 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue); * The spin_unlock() itself is semi-permeable and only protects * one way (it only protects stuff inside the critical region and * stops them from bleeding out - it would still allow subsequent - * loads to move into the the critical region). + * loads to move into the critical region). */ void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6fa5e63085d..fb56fedd5c02 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -36,30 +36,20 @@ /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). - * - * The sequence counters are for flush_scheduled_work(). It wants to wait - * until all currently-scheduled works are completed, but it doesn't - * want to be livelocked by new, incoming ones. So it waits until - * remove_sequence is >= the insert_sequence which pertained when - * flush_scheduled_work() was called. */ struct cpu_workqueue_struct { spinlock_t lock; - long remove_sequence; /* Least-recently added (next to run) */ - long insert_sequence; /* Next to add */ - struct list_head worklist; wait_queue_head_t more_work; - wait_queue_head_t work_done; + struct work_struct *current_work; struct workqueue_struct *wq; struct task_struct *thread; + int should_stop; int run_depth; /* Detect run_workqueue() recursion depth */ - - int freezeable; /* Freeze the thread during suspend */ } ____cacheline_aligned; /* @@ -68,8 +58,10 @@ struct cpu_workqueue_struct { */ struct workqueue_struct { struct cpu_workqueue_struct *cpu_wq; + struct list_head list; const char *name; - struct list_head list; /* Empty if single thread */ + int singlethread; + int freezeable; /* Freeze threads during suspend */ }; /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove @@ -77,106 +69,68 @@ struct workqueue_struct { static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); -static int singlethread_cpu; +static int singlethread_cpu __read_mostly; +static cpumask_t cpu_singlethread_map __read_mostly; +/* optimization, we could use cpu_possible_map */ +static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_single_threaded(struct workqueue_struct *wq) { - return list_empty(&wq->list); + return wq->singlethread; +} + +static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +{ + return is_single_threaded(wq) + ? 
&cpu_singlethread_map : &cpu_populated_map; +} + +static +struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +{ + if (unlikely(is_single_threaded(wq))) + cpu = singlethread_cpu; + return per_cpu_ptr(wq->cpu_wq, cpu); } /* * Set the workqueue on which a work item is to be run * - Must *only* be called if the pending flag is set */ -static inline void set_wq_data(struct work_struct *work, void *wq) +static inline void set_wq_data(struct work_struct *work, + struct cpu_workqueue_struct *cwq) { unsigned long new; BUG_ON(!work_pending(work)); - new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); + new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); atomic_long_set(&work->data, new); } -static inline void *get_wq_data(struct work_struct *work) +static inline +struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) +static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, int tail) { - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&cwq->lock, flags); + set_wq_data(work, cwq); /* - * We need to re-validate the work info after we've gotten - * the cpu_workqueue lock. We can run the work now iff: - * - * - the wq_data still matches the cpu_workqueue_struct - * - AND the work is still marked pending - * - AND the work is still on a list (which will be this - * workqueue_struct list) - * - * All these conditions are important, because we - * need to protect against the work being run right - * now on another CPU (all but the last one might be - * true if it's currently running and has not been - * released yet, for example). + * Ensure that we get the right work->data if we see the + * result of list_add() below, see try_to_grab_pending(). */ - if (get_wq_data(work) == cwq - && work_pending(work) - && !list_empty(&work->entry)) { - work_func_t f = work->func; - list_del_init(&work->entry); - spin_unlock_irqrestore(&cwq->lock, flags); - - if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) - work_release(work); - f(work); - - spin_lock_irqsave(&cwq->lock, flags); - cwq->remove_sequence++; - wake_up(&cwq->work_done); - ret = 1; - } - spin_unlock_irqrestore(&cwq->lock, flags); - return ret; -} - -/** - * run_scheduled_work - run scheduled work synchronously - * @work: work to run - * - * This checks if the work was pending, and runs it - * synchronously if so. It returns a boolean to indicate - * whether it had any scheduled work to run or not. - * - * NOTE! This _only_ works for normal work_structs. You - * CANNOT use this for delayed work, because the wq data - * for delayed work will not point properly to the per- - * CPU workqueue struct, but will change! - */ -int fastcall run_scheduled_work(struct work_struct *work) -{ - for (;;) { - struct cpu_workqueue_struct *cwq; - - if (!work_pending(work)) - return 0; - if (list_empty(&work->entry)) - return 0; - /* NOTE! This depends intimately on __queue_work! */ - cwq = get_wq_data(work); - if (!cwq) - return 0; - if (__run_work(cwq, work)) - return 1; - } + smp_wmb(); + if (tail) + list_add_tail(&work->entry, &cwq->worklist); + else + list_add(&work->entry, &cwq->worklist); + wake_up(&cwq->more_work); } -EXPORT_SYMBOL(run_scheduled_work); /* Preempt must be disabled. 
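
set_wq_data()/get_wq_data() above pack the cpu_workqueue_struct pointer and the work's flag bits (pending and friends) into the single atomic_long that is work->data, again relying on alignment to keep the low bits free. A reduced sketch of the same packing with a hypothetical object and one flag bit:

#include <linux/kernel.h>
#include <asm/atomic.h>

#define DEMO_PENDING	0			/* bit number of the flag */
#define DEMO_FLAG_MASK	(1UL << DEMO_PENDING)

static atomic_long_t demo_data;			/* plays the role of work->data */

/* replace the pointer part, preserving whatever flag bits are set */
static void demo_set_obj(void *obj)
{
	unsigned long new = (unsigned long)obj;

	new |= atomic_long_read(&demo_data) & DEMO_FLAG_MASK;
	atomic_long_set(&demo_data, new);
}

/* mask the flag bits off to get the pointer back */
static void *demo_get_obj(void)
{
	return (void *)(atomic_long_read(&demo_data) & ~DEMO_FLAG_MASK);
}

Because the stored pointer now identifies the exact per-CPU queue rather than the workqueue_struct, delayed_work_timer_fn() below can recover both the cwq and, through cwq->wq, the owning workqueue.
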
*/ static void __queue_work(struct cpu_workqueue_struct *cwq, @@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - set_wq_data(work, cwq); - list_add_tail(&work->entry, &cwq->worklist); - cwq->insert_sequence++; - wake_up(&cwq->more_work); + insert_work(cwq, work, 1); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0, cpu = get_cpu(); + int ret = 0; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + __queue_work(wq_per_cpu(wq, get_cpu()), work); + put_cpu(); ret = 1; } - put_cpu(); return ret; } EXPORT_SYMBOL_GPL(queue_work); @@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work); void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct workqueue_struct *wq = get_wq_data(&dwork->work); - int cpu = smp_processor_id(); + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct workqueue_struct *wq = cwq->wq; - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; - - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); + __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); } /** @@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data) int fastcall queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - timer_stats_timer_set_start_info(timer); + timer_stats_timer_set_start_info(&dwork->timer); if (delay == 0) - return queue_work(wq, work); - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + return queue_work(wq, &dwork->work); - /* This stores wq for the moment, for the timer_fn */ - set_wq_data(work, wq); - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; - add_timer(timer); - ret = 1; - } - return ret; + return queue_delayed_work_on(-1, wq, dwork, delay); } EXPORT_SYMBOL_GPL(queue_delayed_work); @@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); - /* This stores wq for the moment, for the timer_fn */ - set_wq_data(work, wq); + /* This stores cwq for the moment, for the timer_fn */ + set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; - add_timer_on(timer, cpu); + + if (unlikely(cpu >= 0)) + add_timer_on(timer, cpu); + else + add_timer(timer); ret = 1; } return ret; @@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); static void run_workqueue(struct cpu_workqueue_struct *cwq) { - unsigned long flags; - - /* - * Keep taking off work from the queue until - * done. 
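
queue_delayed_work() above now just arms dwork->timer and lets delayed_work_timer_fn() do the actual queueing when it fires, with cpu == -1 meaning "whatever CPU the timer runs on". From a caller's point of view the API is unchanged; a hypothetical self-rearming poller might look like this (demo_* names are illustrative):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void demo_poll(struct work_struct *work);
static DECLARE_DELAYED_WORK(demo_poll_work, demo_poll);

static void demo_poll(struct work_struct *work)
{
	/* ... sample hardware, kick watchdogs, etc. ... */

	/* rearm ourselves on keventd, roughly once per second */
	schedule_delayed_work(&demo_poll_work, HZ);
}

static void demo_poll_start(void)
{
	schedule_delayed_work(&demo_poll_work, HZ);
}

Tearing such a poller down is exactly what the reworked cancel_rearming_delayed_work() later in this patch is for: it keeps killing the timer and grabbing the pending bit until the handler can no longer rearm itself.
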
- */ - spin_lock_irqsave(&cwq->lock, flags); + spin_lock_irq(&cwq->lock); cwq->run_depth++; if (cwq->run_depth > 3) { /* morton gets to eat his hat */ @@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct work_struct, entry); work_func_t f = work->func; + cwq->current_work = work; list_del_init(cwq->worklist.next); - spin_unlock_irqrestore(&cwq->lock, flags); + spin_unlock_irq(&cwq->lock); BUG_ON(get_wq_data(work) != cwq); - if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) - work_release(work); + work_clear_pending(work); f(work); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { @@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) dump_stack(); } - spin_lock_irqsave(&cwq->lock, flags); - cwq->remove_sequence++; - wake_up(&cwq->work_done); + spin_lock_irq(&cwq->lock); + cwq->current_work = NULL; } cwq->run_depth--; - spin_unlock_irqrestore(&cwq->lock, flags); + spin_unlock_irq(&cwq->lock); +} + +/* + * NOTE: the caller must not touch *cwq if this func returns true + */ +static int cwq_should_stop(struct cpu_workqueue_struct *cwq) +{ + int should_stop = cwq->should_stop; + + if (unlikely(should_stop)) { + spin_lock_irq(&cwq->lock); + should_stop = cwq->should_stop && list_empty(&cwq->worklist); + if (should_stop) + cwq->thread = NULL; + spin_unlock_irq(&cwq->lock); + } + + return should_stop; } static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; - DECLARE_WAITQUEUE(wait, current); - struct k_sigaction sa; - sigset_t blocked; + DEFINE_WAIT(wait); - if (!cwq->freezeable) + if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; set_user_nice(current, -5); - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* - * We inherited MPOL_INTERLEAVE from the booting kernel. - * Set MPOL_DEFAULT to insure node local allocations. - */ - numa_default_policy(); - - /* SIG_IGN makes children autoreap: see do_notify_parent(). 
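
The rewritten worker_thread() above replaces the old signal fiddling with the plain prepare_to_wait()/finish_wait() idiom: register on the waitqueue first, then re-test the condition, and only then sleep, so a wakeup that races with the check is never lost. The bare idiom, with a hypothetical condition flag:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static int demo_have_work;	/* set by the producer before waking us */

static int demo_thread(void *unused)
{
	DEFINE_WAIT(wait);

	while (!kthread_should_stop()) {
		/* queue ourselves *before* checking the condition */
		prepare_to_wait(&demo_waitq, &wait, TASK_INTERRUPTIBLE);
		if (!demo_have_work && !kthread_should_stop())
			schedule();
		finish_wait(&demo_waitq, &wait);

		if (demo_have_work) {
			demo_have_work = 0;
			/* ... process the work ... */
		}
	}
	return 0;
}

worker_thread() additionally handles the freezer (try_to_freeze()) and uses its own should_stop flag instead of kthread_should_stop(), because the thread must drain cwq->worklist before it is allowed to exit.
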
*/ - sa.sa.sa_handler = SIG_IGN; - sa.sa.sa_flags = 0; - siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + for (;;) { + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && !cwq->should_stop + && list_empty(&cwq->worklist)) + schedule(); + finish_wait(&cwq->more_work, &wait); - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - if (cwq->freezeable) - try_to_freeze(); + try_to_freeze(); - add_wait_queue(&cwq->more_work, &wait); - if (list_empty(&cwq->worklist)) - schedule(); - else - __set_current_state(TASK_RUNNING); - remove_wait_queue(&cwq->more_work, &wait); + if (cwq_should_stop(cwq)) + break; - if (!list_empty(&cwq->worklist)) - run_workqueue(cwq); - set_current_state(TASK_INTERRUPTIBLE); + run_workqueue(cwq); } - __set_current_state(TASK_RUNNING); + return 0; } +struct wq_barrier { + struct work_struct work; + struct completion done; +}; + +static void wq_barrier_func(struct work_struct *work) +{ + struct wq_barrier *barr = container_of(work, struct wq_barrier, work); + complete(&barr->done); +} + +static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, + struct wq_barrier *barr, int tail) +{ + INIT_WORK(&barr->work, wq_barrier_func); + __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + + init_completion(&barr->done); + + insert_work(cwq, &barr->work, tail); +} + static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) { if (cwq->thread == current) { @@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) */ run_workqueue(cwq); } else { - DEFINE_WAIT(wait); - long sequence_needed; + struct wq_barrier barr; + int active = 0; spin_lock_irq(&cwq->lock); - sequence_needed = cwq->insert_sequence; - - while (sequence_needed - cwq->remove_sequence > 0) { - prepare_to_wait(&cwq->work_done, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&cwq->lock); - schedule(); - spin_lock_irq(&cwq->lock); + if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr, 1); + active = 1; } - finish_wait(&cwq->work_done, &wait); spin_unlock_irq(&cwq->lock); + + if (active) + wait_for_completion(&barr.done); } } @@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. * - * This function will sample each workqueue's current insert_sequence number and - * will sleep until the head sequence is greater than or equal to that. This - * means that we sleep until all works which were queued on entry have been - * handled, but we are not livelocked by new incoming ones. + * We sleep until all works which were queued on entry have been handled, + * but we are not livelocked by new incoming ones. * * This function used to run the workqueues itself. Now we just wait for the * helper threads to do it. */ void fastcall flush_workqueue(struct workqueue_struct *wq) { + const cpumask_t *cpu_map = wq_cpu_map(wq); + int cpu; + might_sleep(); + for_each_cpu_mask(cpu, *cpu_map) + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); +} +EXPORT_SYMBOL_GPL(flush_workqueue); - if (is_single_threaded(wq)) { - /* Always use first cpu's area. */ - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); - } else { - int cpu; +/* + * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. 
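
insert_wq_barrier() above is the heart of the new flush logic: a barrier is just a work item whose handler signals a completion, so whoever queued it can sleep in wait_for_completion() until everything ahead of it on that thread has run. A stripped-down sketch of the same idea against the public API (unlike the per-cwq version above, this only orders against work queued from the same CPU):

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct demo_barrier {
	struct work_struct work;
	struct completion done;
};

static void demo_barrier_fn(struct work_struct *work)
{
	struct demo_barrier *b = container_of(work, struct demo_barrier, work);

	/* everything queued ahead of us on this thread has finished */
	complete(&b->done);
}

static void demo_flush(struct workqueue_struct *wq)
{
	struct demo_barrier b;

	INIT_WORK(&b.work, demo_barrier_fn);
	init_completion(&b.done);

	queue_work(wq, &b.work);
	wait_for_completion(&b.done);
}

This is also why the old insert_sequence/remove_sequence counters and the work_done waitqueue could go away: completion of the barrier is proof that the queue has drained past the flush point.
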
+ */ +static int try_to_grab_pending(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + int ret = 0; - mutex_lock(&workqueue_mutex); - for_each_online_cpu(cpu) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - mutex_unlock(&workqueue_mutex); + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + return 1; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + + cwq = get_wq_data(work); + if (!cwq) + return ret; + + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong cwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (cwq == get_wq_data(work)) { + list_del_init(&work->entry); + ret = 1; + } } + spin_unlock_irq(&cwq->lock); + + return ret; } -EXPORT_SYMBOL_GPL(flush_workqueue); -static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu, int freezeable) +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct task_struct *p; + struct wq_barrier barr; + int running = 0; - spin_lock_init(&cwq->lock); - cwq->wq = wq; - cwq->thread = NULL; - cwq->insert_sequence = 0; - cwq->remove_sequence = 0; - cwq->freezeable = freezeable; - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); - init_waitqueue_head(&cwq->work_done); + spin_lock_irq(&cwq->lock); + if (unlikely(cwq->current_work == work)) { + insert_wq_barrier(cwq, &barr, 0); + running = 1; + } + spin_unlock_irq(&cwq->lock); - if (is_single_threaded(wq)) - p = kthread_create(worker_thread, cwq, "%s", wq->name); - else - p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); - if (IS_ERR(p)) - return NULL; - cwq->thread = p; - return p; + if (unlikely(running)) + wait_for_completion(&barr.done); } -struct workqueue_struct *__create_workqueue(const char *name, - int singlethread, int freezeable) +static void wait_on_work(struct work_struct *work) { - int cpu, destroy = 0; + struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - struct task_struct *p; + const cpumask_t *cpu_map; + int cpu; - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - return NULL; + might_sleep(); - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + cwq = get_wq_data(work); + if (!cwq) + return; - wq->name = name; - mutex_lock(&workqueue_mutex); - if (singlethread) { - INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, singlethread_cpu, freezeable); - if (!p) - destroy = 1; - else - wake_up_process(p); - } else { - list_add(&wq->list, &workqueues); - for_each_online_cpu(cpu) { - p = create_workqueue_thread(wq, cpu, freezeable); - if (p) { - kthread_bind(p, cpu); - wake_up_process(p); - } else - destroy = 1; - } - } - mutex_unlock(&workqueue_mutex); + wq = cwq->wq; + cpu_map = wq_cpu_map(wq); - /* - * Was there any error during startup? 
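
try_to_grab_pending() above turns WORK_STRUCT_PENDING into an ownership token: whoever atomically sets the bit (or steals a still-queued item under cwq->lock) is the only one allowed to run or retire that work. The core of that is just test_and_set_bit(); a tiny sketch with a hypothetical flags word:

#include <linux/bitops.h>

#define DEMO_PENDING	0	/* bit number used as the ownership token */

static unsigned long demo_flags;

/* returns nonzero iff the caller just became the owner */
static int demo_try_claim(void)
{
	return !test_and_set_bit(DEMO_PENDING, &demo_flags);
}

/* the owner hands the token back once the associated work is done */
static void demo_release(void)
{
	clear_bit(DEMO_PENDING, &demo_flags);
}

cancel_work_sync() below simply loops on try_to_grab_pending() until it owns the bit, waits out a possibly running callback via wait_on_work(), and only then clears the pending state.
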
If yes then clean up: - */ - if (destroy) { - destroy_workqueue(wq); - wq = NULL; - } - return wq; + for_each_cpu_mask(cpu, *cpu_map) + wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } -EXPORT_SYMBOL_GPL(__create_workqueue); -static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) +/** + * cancel_work_sync - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * cancel_work_sync() will cancel the work if it is queued. If the work's + * callback appears to be running, cancel_work_sync() will block until it + * has completed. + * + * It is possible to use this function if the work re-queues itself. It can + * cancel the work even if it migrates to another workqueue, however in that + * case it only guarantees that work->func() has completed on the last queued + * workqueue. + * + * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not + * pending, otherwise it goes into a busy-wait loop until the timer expires. + * + * The caller must ensure that workqueue_struct on which this work was last + * queued can't be destroyed before this function returns. + */ +void cancel_work_sync(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - unsigned long flags; - struct task_struct *p; - - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - spin_lock_irqsave(&cwq->lock, flags); - p = cwq->thread; - cwq->thread = NULL; - spin_unlock_irqrestore(&cwq->lock, flags); - if (p) - kthread_stop(p); + while (!try_to_grab_pending(work)) + cpu_relax(); + wait_on_work(work); + work_clear_pending(work); } +EXPORT_SYMBOL_GPL(cancel_work_sync); /** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue + * cancel_rearming_delayed_work - reliably kill off a delayed work. + * @dwork: the delayed work struct * - * Safely destroy a workqueue. All work currently pending will be done first. + * It is possible to use this function if @dwork rearms itself via queue_work() + * or queue_delayed_work(). See also the comment for cancel_work_sync(). */ -void destroy_workqueue(struct workqueue_struct *wq) +void cancel_rearming_delayed_work(struct delayed_work *dwork) { - int cpu; - - flush_workqueue(wq); - - /* We don't need the distraction of CPUs appearing and vanishing. 
*/ - mutex_lock(&workqueue_mutex); - if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, singlethread_cpu); - else { - for_each_online_cpu(cpu) - cleanup_workqueue_thread(wq, cpu); - list_del(&wq->list); - } - mutex_unlock(&workqueue_mutex); - free_percpu(wq->cpu_wq); - kfree(wq); + while (!del_timer(&dwork->timer) && + !try_to_grab_pending(&dwork->work)) + cpu_relax(); + wait_on_work(&dwork->work); + work_clear_pending(&dwork->work); } -EXPORT_SYMBOL_GPL(destroy_workqueue); +EXPORT_SYMBOL(cancel_rearming_delayed_work); -static struct workqueue_struct *keventd_wq; +static struct workqueue_struct *keventd_wq __read_mostly; /** * schedule_work - put work task in global workqueue @@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - mutex_lock(&workqueue_mutex); + preempt_disable(); /* CPU hotplug */ for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func) set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); } - mutex_unlock(&workqueue_mutex); + preempt_enable(); flush_workqueue(keventd_wq); free_percpu(works); return 0; @@ -659,29 +596,6 @@ void flush_scheduled_work(void) EXPORT_SYMBOL(flush_scheduled_work); /** - * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. - * @wq: the controlling workqueue structure - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, - struct delayed_work *dwork) -{ - while (!cancel_delayed_work(dwork)) - flush_workqueue(wq); -} -EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); - -/** - * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) -{ - cancel_rearming_delayed_workqueue(keventd_wq, dwork); -} -EXPORT_SYMBOL(cancel_rearming_delayed_work); - -/** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must @@ -728,94 +642,209 @@ int current_is_keventd(void) } -/* Take the work from this (downed) CPU. */ -static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) +static struct cpu_workqueue_struct * +init_cpu_workqueue(struct workqueue_struct *wq, int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - struct list_head list; - struct work_struct *work; - spin_lock_irq(&cwq->lock); - list_replace_init(&cwq->worklist, &list); + cwq->wq = wq; + spin_lock_init(&cwq->lock); + INIT_LIST_HEAD(&cwq->worklist); + init_waitqueue_head(&cwq->more_work); + + return cwq; +} + +static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct workqueue_struct *wq = cwq->wq; + const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; + struct task_struct *p; + + p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); + /* + * Nobody can add the work_struct to this cwq, + * if (caller is __create_workqueue) + * nobody should see this wq + * else // caller is CPU_UP_PREPARE + * cpu is not on cpu_online_map + * so we can abort safely. 
+ */ + if (IS_ERR(p)) + return PTR_ERR(p); + + cwq->thread = p; + cwq->should_stop = 0; + + return 0; +} + +static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct task_struct *p = cwq->thread; - while (!list_empty(&list)) { - printk("Taking work for %s\n", wq->name); - work = list_entry(list.next,struct work_struct,entry); - list_del(&work->entry); - __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); + if (p != NULL) { + if (cpu >= 0) + kthread_bind(p, cpu); + wake_up_process(p); } - spin_unlock_irq(&cwq->lock); } -/* We're holding the cpucontrol mutex here */ -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +struct workqueue_struct *__create_workqueue(const char *name, + int singlethread, int freezeable) { - unsigned int hotcpu = (unsigned long)hcpu; struct workqueue_struct *wq; + struct cpu_workqueue_struct *cwq; + int err = 0, cpu; - switch (action) { - case CPU_UP_PREPARE: - mutex_lock(&workqueue_mutex); - /* Create a new workqueue thread for it. */ - list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu, 0)) { - printk("workqueue for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - } - break; + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return NULL; - case CPU_ONLINE: - /* Kick off worker threads. */ - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq; + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } - cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); - kthread_bind(cwq->thread, hotcpu); - wake_up_process(cwq->thread); - } - mutex_unlock(&workqueue_mutex); - break; + wq->name = name; + wq->singlethread = singlethread; + wq->freezeable = freezeable; + INIT_LIST_HEAD(&wq->list); - case CPU_UP_CANCELED: - list_for_each_entry(wq, &workqueues, list) { - if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + if (singlethread) { + cwq = init_cpu_workqueue(wq, singlethread_cpu); + err = create_workqueue_thread(cwq, singlethread_cpu); + start_workqueue_thread(cwq, -1); + } else { + mutex_lock(&workqueue_mutex); + list_add(&wq->list, &workqueues); + + for_each_possible_cpu(cpu) { + cwq = init_cpu_workqueue(wq, cpu); + if (err || !cpu_online(cpu)) continue; - /* Unbind so it can run. */ - kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, - any_online_cpu(cpu_online_map)); - cleanup_workqueue_thread(wq, hotcpu); + err = create_workqueue_thread(cwq, cpu); + start_workqueue_thread(cwq, cpu); } mutex_unlock(&workqueue_mutex); - break; + } + + if (err) { + destroy_workqueue(wq); + wq = NULL; + } + return wq; +} +EXPORT_SYMBOL_GPL(__create_workqueue); + +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ + struct wq_barrier barr; + int alive = 0; + + spin_lock_irq(&cwq->lock); + if (cwq->thread != NULL) { + insert_wq_barrier(cwq, &barr, 1); + cwq->should_stop = 1; + alive = 1; + } + spin_unlock_irq(&cwq->lock); + + if (alive) { + wait_for_completion(&barr.done); - case CPU_DOWN_PREPARE: + while (unlikely(cwq->thread != NULL)) + cpu_relax(); + /* + * Wait until cwq->thread unlocks cwq->lock, + * it won't touch *cwq after that. + */ + smp_rmb(); + spin_unlock_wait(&cwq->lock); + } +} + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. 
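
create_workqueue_thread() and start_workqueue_thread() above split thread setup into the usual two kthread steps: kthread_create() builds the thread in a stopped state, and only after an optional kthread_bind() is it released with wake_up_process(), so a per-CPU thread never runs on the wrong CPU. A generic sketch of that pattern (demo names are hypothetical, cpu < 0 meaning "don't pin"):

#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *demo_start_thread(int cpu)
{
	struct task_struct *p;

	p = kthread_create(demo_thread_fn, NULL, "demo/%d", cpu);
	if (IS_ERR(p))
		return NULL;

	if (cpu >= 0)
		kthread_bind(p, cpu);	/* must precede the first wakeup */
	wake_up_process(p);

	return p;
}

Deferring the bind is also what lets the CPU_UP_CANCELED path below reuse start_workqueue_thread(cwq, -1) to let an already-created thread run unbound long enough to be cleaned up.
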
+ */ +void destroy_workqueue(struct workqueue_struct *wq) +{ + const cpumask_t *cpu_map = wq_cpu_map(wq); + struct cpu_workqueue_struct *cwq; + int cpu; + + mutex_lock(&workqueue_mutex); + list_del(&wq->list); + mutex_unlock(&workqueue_mutex); + + for_each_cpu_mask(cpu, *cpu_map) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + cleanup_workqueue_thread(cwq, cpu); + } + + free_percpu(wq->cpu_wq); + kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpu_workqueue_struct *cwq; + struct workqueue_struct *wq; + + action &= ~CPU_TASKS_FROZEN; + + switch (action) { + case CPU_LOCK_ACQUIRE: mutex_lock(&workqueue_mutex); - break; + return NOTIFY_OK; - case CPU_DOWN_FAILED: + case CPU_LOCK_RELEASE: mutex_unlock(&workqueue_mutex); - break; + return NOTIFY_OK; - case CPU_DEAD: - list_for_each_entry(wq, &workqueues, list) - cleanup_workqueue_thread(wq, hotcpu); - list_for_each_entry(wq, &workqueues, list) - take_over_work(wq, hotcpu); - mutex_unlock(&workqueue_mutex); - break; + case CPU_UP_PREPARE: + cpu_set(cpu, cpu_populated_map); + } + + list_for_each_entry(wq, &workqueues, list) { + cwq = per_cpu_ptr(wq->cpu_wq, cpu); + + switch (action) { + case CPU_UP_PREPARE: + if (!create_workqueue_thread(cwq, cpu)) + break; + printk(KERN_ERR "workqueue for %i failed\n", cpu); + return NOTIFY_BAD; + + case CPU_ONLINE: + start_workqueue_thread(cwq, cpu); + break; + + case CPU_UP_CANCELED: + start_workqueue_thread(cwq, -1); + case CPU_DEAD: + cleanup_workqueue_thread(cwq, cpu); + break; + } } return NOTIFY_OK; } -void init_workqueues(void) +void __init init_workqueues(void) { + cpu_populated_map = cpu_online_map; singlethread_cpu = first_cpu(cpu_possible_map); + cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); } - |
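
Taken together, the workqueue rework keeps the familiar entry points but changes the lifetime rules: flushing is barrier-based, cancel_work_sync() and cancel_rearming_delayed_work() no longer need to know which workqueue the work was queued on, and destroy_workqueue() tears down its per-CPU threads itself. A hypothetical module-style sketch of typical usage against these interfaces (demo names are illustrative):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;	/* hypothetical private queue */

static void demo_fn(struct work_struct *work)
{
	/* runs in process context on demo_wq's thread */
}

static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	demo_wq = create_singlethread_workqueue("demo");
	if (!demo_wq)
		return -ENOMEM;

	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	/* block until demo_fn() is neither queued nor running */
	cancel_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
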