diff options
Diffstat (limited to 'kernel')
104 files changed, 9468 insertions, 2977 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks new file mode 100644 index 000000000000..88c92fb44618 --- /dev/null +++ b/kernel/Kconfig.locks @@ -0,0 +1,202 @@ +# +# The ARCH_INLINE foo is necessary because select ignores "depends on" +# +config ARCH_INLINE_SPIN_TRYLOCK + bool + +config ARCH_INLINE_SPIN_TRYLOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK + bool + +config ARCH_INLINE_SPIN_LOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK_IRQ + bool + +config ARCH_INLINE_SPIN_LOCK_IRQSAVE + bool + +config ARCH_INLINE_SPIN_UNLOCK + bool + +config ARCH_INLINE_SPIN_UNLOCK_BH + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQ + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_READ_TRYLOCK + bool + +config ARCH_INLINE_READ_LOCK + bool + +config ARCH_INLINE_READ_LOCK_BH + bool + +config ARCH_INLINE_READ_LOCK_IRQ + bool + +config ARCH_INLINE_READ_LOCK_IRQSAVE + bool + +config ARCH_INLINE_READ_UNLOCK + bool + +config ARCH_INLINE_READ_UNLOCK_BH + bool + +config ARCH_INLINE_READ_UNLOCK_IRQ + bool + +config ARCH_INLINE_READ_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_WRITE_TRYLOCK + bool + +config ARCH_INLINE_WRITE_LOCK + bool + +config ARCH_INLINE_WRITE_LOCK_BH + bool + +config ARCH_INLINE_WRITE_LOCK_IRQ + bool + +config ARCH_INLINE_WRITE_LOCK_IRQSAVE + bool + +config ARCH_INLINE_WRITE_UNLOCK + bool + +config ARCH_INLINE_WRITE_UNLOCK_BH + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQ + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + bool + +# +# lock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y +# +# trylock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# +# unlock and unlock_irq functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# or +# - DEBUG_SPINLOCK=n and PREEMPT=n +# +# unlock_bh and unlock_irqrestore functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# + +config INLINE_SPIN_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + +config INLINE_SPIN_TRYLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + +config INLINE_SPIN_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + +config INLINE_SPIN_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_BH + +config INLINE_SPIN_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQ + +config INLINE_SPIN_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQSAVE + +config INLINE_SPIN_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) + +config INLINE_SPIN_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + +config INLINE_SPIN_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) + +config INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + + +config INLINE_READ_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + +config INLINE_READ_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + +config INLINE_READ_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_BH + +config INLINE_READ_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQ + +config INLINE_READ_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQSAVE + +config INLINE_READ_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + +config INLINE_READ_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + +config INLINE_READ_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + +config INLINE_READ_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + + +config INLINE_WRITE_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + +config INLINE_WRITE_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + +config INLINE_WRITE_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_BH + +config INLINE_WRITE_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQ + +config INLINE_WRITE_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQSAVE + +config INLINE_WRITE_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + +config INLINE_WRITE_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + +config INLINE_WRITE_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + +config INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +config MUTEX_SPIN_ON_OWNER + def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index 187c89b4783d..982c50e2ce53 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -58,7 +59,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -83,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_TINY_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -95,7 +96,9 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index defc2e6f1e3b..5feed232be9d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; + len = 0; + if (audit_sig_sid) { + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (err) + return err; + } sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - security_release_secctx(ctx, len); + if (audit_sig_sid) + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + if (audit_sig_sid) { + memcpy(sig_data->ctx, ctx, len); + security_release_secctx(ctx, len); + } audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0e96dbc60ea9..cc7e87936cbc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 68d3c6a0ecd6..267e484f0198 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,12 +168,12 @@ struct audit_context { int in_syscall; /* 1 if task is in a syscall */ enum audit_state state, current_state; unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ long return_code;/* syscall return code */ u64 prio; + int return_valid; /* return code is valid */ int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -198,8 +198,8 @@ struct audit_context { char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; - int tree_count; struct list_head killed_trees; + int tree_count; int type; union { diff --git a/kernel/capability.c b/kernel/capability.c index 4e17041963f5..7f876e60521f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); EXPORT_SYMBOL(__cap_init_eff_set); -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES int file_caps_enabled = 1; static int __init file_caps_disable(char *str) @@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str) return 1; } __setup("no_file_caps", file_caps_disable); -#endif /* * More recent versions of libcap are available from: @@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) kernel_cap_t pE, pI, pP; ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; + if ((dataptr == NULL) || (ret != 0)) + return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy; + unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; @@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, - tocopy * sizeof(struct __user_cap_data_struct))) + copybytes = tocopy * sizeof(struct __user_cap_data_struct); + if (copybytes > sizeof(kdata)) + return -EFAULT; + + if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; for (i = 0; i < tocopy; i++) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cd83d9933b6b..0249f4be9b5c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -48,6 +49,8 @@ #include <linux/namei.h> #include <linux/smp_lock.h> #include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <asm/atomic.h> @@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = { #include <linux/cgroup_subsys.h> }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -74,6 +79,9 @@ struct cgroupfs_root { */ unsigned long subsys_bits; + /* Unique id for this hierarchy. */ + int hierarchy_id; + /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -94,6 +102,9 @@ struct cgroupfs_root { /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -141,6 +152,10 @@ struct css_id { static LIST_HEAD(roots); static int root_count; +static DEFINE_IDA(hierarchy_ida); +static int next_hierarchy_id; +static DEFINE_SPINLOCK(hierarchy_id_lock); + /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -201,6 +216,7 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; + struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* hash table for cgroup groups. This improves the performance to - * find an existing css_set */ +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + kfree(cg); +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. - */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) +static void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } - unlink_css_set(cg); - write_unlock(&css_set_lock); - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); + /* This css_set is dead. unlink it and release cgroup refcounts */ + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + + kfree(link); } - rcu_read_unlock(); - kfree(cg); + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } /* @@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg) } /* + * compare_css_sets - helper function for find_existing_css_set(). + * @cg: candidate css_set being tested + * @old_cg: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cg" matches "old_cg" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cg, + struct css_set *old_cg, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* Not all subsystems matched */ + return false; + } + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in heirarchies with no subsystems. We + * could get by with just this check alone (and skip the + * memcmp above) but on most setups the memcmp check will + * avoid the need for this more expensive check on almost all + * candidates. + */ + + l1 = &cg->cg_links; + l2 = &old_cg->cg_links; + while (1) { + struct cg_cgroup_link *cgl1, *cgl2; + struct cgroup *cg1, *cg2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cg->cg_links) { + BUG_ON(l2 != &old_cg->cg_links); + break; + } else { + BUG_ON(l2 == &old_cg->cg_links); + } + /* Locate the cgroups associated with these links. */ + cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); + cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); + cg1 = cgl1->cgrp; + cg2 = cgl2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cg1->root != cg2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cg1->root == new_cgrp->root) { + if (cg1 != new_cgrp) + return false; + } else { + if (cg1 != cg2) + return false; + } + } + return true; +} + +/* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing * css_set is suitable. @@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } + if (!compare_css_sets(cg, oldcg, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cg; } /* No existing cgroup group matched */ @@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; + link->cgrp = cgrp; + atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - list_add(&link->cg_link_list, &cg->cg_links); + /* + * Always add links to the tail of the list so that the list + * is sorted by order of hierarchy creation + */ + list_add_tail(&link->cg_link_list, &cg->cg_links); } /* @@ -451,11 +530,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; struct list_head tmp_cg_links; struct hlist_head *hhead; + struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -489,20 +568,12 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) - link_css_set(&tmp_cg_links, res, cgrp); + list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_cg_links, res, c); } - if (list_empty(&rootnode.subsys_list)) - link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -518,6 +589,41 @@ static struct css_set *find_css_set( } /* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + struct css_set *css; + struct cgroup *res = NULL; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + read_lock(&css_set_lock); + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + css = task->cgroups; + if (css == &init_css_set) { + res = &root->top_cgroup; + } else { + struct cg_cgroup_link *link; + list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == root) { + res = c; + break; + } + } + } + read_unlock(&css_set_lock); + BUG_ON(!res); + return res; +} + +/* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. * See "The task_lock() exception", at the end of this comment. @@ -597,7 +703,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; -static struct file_operations proc_cgroupstats_operations; +static const struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { .name = "cgroup", @@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (strlen(root->name)) + seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -849,6 +963,12 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + char *name; + /* User explicitly requested empty subsystem */ + bool none; + + struct cgroupfs_root *new_root; + }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; + memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } + } else if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + opts->release_agent = + kstrndup(token + 14, PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; + } else if (!strncmp(token, "name=", 5)) { + int i; + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; } else { struct cgroup_subsys *ss; int i; @@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data, } } + /* Consistency checks */ + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - /* We can't have an empty hierarchy */ - if (!opts->subsys_bits) + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) return -EINVAL; return 0; @@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* Don't allow name to change at remount */ + if (opts.name && strcmp(opts.name, root->name)) { + ret = -EINVAL; + goto out_unlock; + } + ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); + kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); @@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pids_list); - init_rwsem(&cgrp->pids_mutex); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); } + static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... */ + BUG_ON(ret); + } + spin_unlock(&hierarchy_id_lock); + } while (ret); + return true; +} + static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroupfs_root *new = data; + struct cgroup_sb_opts *opts = data; struct cgroupfs_root *root = sb->s_fs_info; - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; + /* If we asked for a name then it must match */ + if (opts->name && strcmp(opts->name, root->name)) + return 0; - /* Next check flags */ - if (new->flags != root->flags) + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match + */ + if ((opts->subsys_bits || opts->none) + && (opts->subsys_bits != root->subsys_bits)) return 0; return 1; } +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +{ + struct cgroupfs_root *root; + + if (!opts->subsys_bits && !opts->none) + return NULL; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + if (!init_root_id(root)) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + init_cgroup_root(root); + + root->subsys_bits = opts->subsys_bits; + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + return root; +} + +static void cgroup_drop_root(struct cgroupfs_root *root) +{ + if (!root) + return; + + BUG_ON(!root->hierarchy_id); + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + kfree(root); +} + static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroupfs_root *root = data; + struct cgroup_sb_opts *opts = data; + + /* If we don't have a new root, we can't set up a new sb */ + if (!opts->new_root) + return -EINVAL; + + BUG_ON(!opts->subsys_bits && !opts->none); ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = root; - root->sb = sb; + sb->s_fs_info = opts->new_root; + opts->new_root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; + struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; + struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) { - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - kfree(opts.release_agent); - return -ENOMEM; - } + if (ret) + goto out_err; - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. + */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out_err; } + opts.new_root = new_root; - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto out_err; } - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; + struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto free_cg_links; + free_cg_links(&tmp_cg_links); + goto drop_new_super; } /* EBUSY should be the only error here */ @@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); } simple_set_mnt(mnt, sb); + kfree(opts.release_agent); + kfree(opts.name); return 0; - free_cg_links: - free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ret; } @@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - kfree(root); + cgroup_drop_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); + oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, tsk, false); if (retval) return retval; } @@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, oldcgrp, tsk, false); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1491,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); + u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { - s64 val = simple_strtoll(buffer, &end, 0); + s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); @@ -1534,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); + retval = cft->write_string(cgrp, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -1644,7 +1861,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static struct file_operations cgroup_seqfile_operations = { +static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, @@ -1703,7 +1920,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cgroup_file_operations = { +static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, .llseek = generic_file_llseek, @@ -1876,7 +2093,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2129,7 +2346,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks' file. + * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2139,27 +2356,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} +static void pidlist_free(void *p) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); +} +static void *pidlist_resize(void *p, int newcount) +{ + void *newlist; + /* note: if new alloc fails, old p will still be valid either way */ + if (is_vmalloc_addr(p)) { + newlist = vmalloc(newcount * sizeof(pid_t)); + if (!newlist) + return NULL; + memcpy(newlist, p, newcount * sizeof(pid_t)); + vfree(p); + } else { + newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); + } + return newlist; +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * If the new stripped list is sufficiently smaller and there's enough memory + * to allocate a new buffer, will let go of the unneeded memory. Returns the + * number of unique elements. + */ +/* is the size difference enough that we should re-allocate the array? */ +#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) +static int pidlist_uniq(pid_t **p, int length) +{ + int src, dest = 1; + pid_t *list = *p; + pid_t *newlist; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + /* + * if the length difference is large enough, we want to allocate a + * smaller buffer to save memory. if this fails due to out of memory, + * we'll just stay with what we've got. + */ + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { + newlist = pidlist_resize(list, dest); + if (newlist) + *p = newlist; + } + return dest; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) { - int n = 0, pid; + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + /* + * We can't drop the pidlist_mutex before taking the l->mutex in case + * the last ref-holder is trying to remove l from the list at the same + * time. Holding the pidlist_mutex precludes somebody taking whichever + * list we find out from under us - compare release_pid_array(). + */ + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry(l, &cgrp->pidlists, links) { + if (l->key.type == type && l->key.ns == ns) { + /* found a matching list - drop the extra refcount */ + put_pid_ns(ns); + /* make sure l doesn't vanish out from under us */ + down_write(&l->mutex); + mutex_unlock(&cgrp->pidlist_mutex); + l->use_count++; + return l; + } + } + /* entry not found; create a new one */ + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); + put_pid_ns(ns); + return l; + } + init_rwsem(&l->mutex); + down_write(&l->mutex); + l->key.type = type; + l->key.ns = ns; + l->use_count = 0; /* don't increment here */ + l->list = NULL; + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + mutex_unlock(&cgrp->pidlist_mutex); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ struct cgroup_iter it; struct task_struct *tsk; + struct cgroup_pidlist *l; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) + if (unlikely(n == length)) break; - pid = task_pid_vnr(tsk); - if (pid > 0) - pidarray[n++] = pid; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; } cgroup_iter_end(cgrp, &it); - return n; + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(&array, length); + l = cgroup_pidlist_find(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + /* store array, freeing old if necessary - lock already held */ + pidlist_free(l->list); + l->list = array; + l->length = length; + l->use_count++; + up_write(&l->mutex); + *lp = l; + return 0; } /** @@ -2216,37 +2602,14 @@ err: return ret; } -/* - * Cache pids for all threads in the same pid namespace that are - * opening the same "tasks" file. - */ -struct cgroup_pids { - /* The node in cgrp->pids_list */ - struct list_head list; - /* The cgroup those pids belong to */ - struct cgroup *cgrp; - /* The namepsace those pids belong to */ - struct pid_namespace *ns; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the this tasks_pids array */ - int use_count; - /* Length of the current tasks_pids array */ - int length; -}; - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} /* - * seq_file methods for the "tasks" file. The seq_file position is the + * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->tasks_pids array. + * in the cgroup->l->list array. */ -static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2254,48 +2617,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; + struct cgroup_pidlist *l = s->private; int index = 0, pid = *pos; int *iter; - down_read(&cgrp->pids_mutex); + down_read(&l->mutex); if (pid) { - int end = cp->length; + int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (cp->tasks_pids[mid] == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cp->tasks_pids[mid] <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cp->length) + if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cp->tasks_pids + index; + iter = l->list + index; *pos = *iter; return iter; } -static void cgroup_tasks_stop(struct seq_file *s, void *v) +static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; - up_read(&cgrp->pids_mutex); + struct cgroup_pidlist *l = s->private; + up_read(&l->mutex); } -static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pids *cp = s->private; - int *p = v; - int *end = cp->tasks_pids + cp->length; - + struct cgroup_pidlist *l = s->private; + pid_t *p = v; + pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2309,124 +2669,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_tasks_show(struct seq_file *s, void *v) +static int cgroup_pidlist_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -static const struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, - .show = cgroup_tasks_show, +/* + * seq_operations functions for iterating on pidlists through seq_file - + * independent of whether it's tasks or procs + */ +static const struct seq_operations cgroup_pidlist_seq_operations = { + .start = cgroup_pidlist_start, + .stop = cgroup_pidlist_stop, + .next = cgroup_pidlist_next, + .show = cgroup_pidlist_show, }; -static void release_cgroup_pid_array(struct cgroup_pids *cp) +static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - struct cgroup *cgrp = cp->cgrp; - - down_write(&cgrp->pids_mutex); - BUG_ON(!cp->use_count); - if (!--cp->use_count) { - list_del(&cp->list); - put_pid_ns(cp->ns); - kfree(cp->tasks_pids); - kfree(cp); + /* + * the case where we're the last user of this particular pidlist will + * have us remove it from the cgroup's list, which entails taking the + * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> + * pidlist_mutex, we have to take pidlist_mutex first. + */ + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->mutex); + BUG_ON(!l->use_count); + if (!--l->use_count) { + /* we're the last user if refcount is 0; remove and free */ + list_del(&l->links); + mutex_unlock(&l->owner->pidlist_mutex); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + up_write(&l->mutex); + kfree(l); + return; } - up_write(&cgrp->pids_mutex); + mutex_unlock(&l->owner->pidlist_mutex); + up_write(&l->mutex); } -static int cgroup_tasks_release(struct inode *inode, struct file *file) +static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct cgroup_pids *cp; - + struct cgroup_pidlist *l; if (!(file->f_mode & FMODE_READ)) return 0; - - seq = file->private_data; - cp = seq->private; - - release_cgroup_pid_array(cp); + /* + * the seq_file will only be initialized if the file was opened for + * reading; hence we check if it's not null only in that case. + */ + l = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(l); return seq_release(inode, file); } -static struct file_operations cgroup_tasks_operations = { +static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_tasks_release, + .release = cgroup_pidlist_release, }; /* - * Handle an open on 'tasks' file. Prepare an array containing the - * process id's of tasks currently attached to the cgroup being opened. + * The following functions handle opens on a file that displays a pidlist + * (tasks or procs). Prepare an array of the process/thread IDs of whoever's + * in the cgroup. */ - -static int cgroup_tasks_open(struct inode *unused, struct file *file) +/* helper function for the two below it */ +static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct pid_namespace *ns = current->nsproxy->pid_ns; - struct cgroup_pids *cp; - pid_t *pidarray; - int npids; + struct cgroup_pidlist *l; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - npids = cgroup_task_count(cgrp); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - return -ENOMEM; - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* - * Store the array in the cgroup, freeing the old - * array if necessary - */ - down_write(&cgrp->pids_mutex); - - list_for_each_entry(cp, &cgrp->pids_list, list) { - if (ns == cp->ns) - goto found; - } - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - up_write(&cgrp->pids_mutex); - kfree(pidarray); - return -ENOMEM; - } - cp->cgrp = cgrp; - cp->ns = ns; - get_pid_ns(ns); - list_add(&cp->list, &cgrp->pids_list); -found: - kfree(cp->tasks_pids); - cp->tasks_pids = pidarray; - cp->length = npids; - cp->use_count++; - up_write(&cgrp->pids_mutex); - - file->f_op = &cgroup_tasks_operations; + /* have the array populated */ + retval = pidlist_array_load(cgrp, type, &l); + if (retval) + return retval; + /* configure file information */ + file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_tasks_seq_operations); + retval = seq_open(file, &cgroup_pidlist_seq_operations); if (retval) { - release_cgroup_pid_array(cp); + cgroup_release_pid_array(l); return retval; } - ((struct seq_file *)file->private_data)->private = cp; + ((struct seq_file *)file->private_data)->private = l; return 0; } +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); +} +static int cgroup_procs_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); +} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2449,21 +2792,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, + .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + /* .write_u64 = cgroup_procs_write, TODO */ + .release = cgroup_pidlist_release, + .mode = S_IRUGO, + }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2472,7 +2821,6 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -2879,6 +3227,7 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -2933,7 +3282,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - + BUG_ON(!init_root_id(&rootnode)); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -2986,15 +3335,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int subsys_id; int count = 0; - seq_printf(m, "%lu:", root->subsys_bits); + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); + cgrp = task_cgroup_from_root(tsk, root); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3017,7 +3367,7 @@ static int cgroup_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroup_show, pid); } -struct file_operations proc_cgroup_operations = { +const struct file_operations proc_cgroup_operations = { .open = cgroup_open, .read = seq_read, .llseek = seq_lseek, @@ -3033,8 +3383,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3046,7 +3396,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroupstats_show, NULL); } -static struct file_operations proc_cgroupstats_operations = { +static const struct file_operations proc_cgroupstats_operations = { .open = cgroupstats_open, .read = seq_read, .llseek = seq_lseek, @@ -3320,13 +3670,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; - int subsys_id; if (cgrp == dummytop) return 1; - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(task, subsys_id); + target = task_cgroup_from_root(task, cgrp->root); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -3358,8 +3706,10 @@ static void check_for_release(struct cgroup *cgrp) void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int val; rcu_read_lock(); - if (atomic_dec_return(&css->refcnt) == 1) { + val = atomic_dec_return(&css->refcnt); + if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); @@ -3367,6 +3717,7 @@ void __css_put(struct cgroup_subsys_state *css) cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); + WARN_ON_ONCE(val < 1); } /* @@ -3693,3 +4044,154 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c deleted file mode 100644 index 0c92d797baa6..000000000000 --- a/kernel/cgroup_debug.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> - -#include <asm/atomic.h> - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - count = cgroup_task_count(cont); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fb249e2bcada..59e9ef6aab40 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct task_struct *task, bool threadgroup) { struct freezer *freezer; @@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (is_task_frozen_enough(c)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41bd508..3cf2183b472d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -/* FIXME: see the FIXME in partition_sched_domains() */ -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - struct cpumask *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(cpumask_size(), GFP_KERNEL); + ndoms = 1; + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms, top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.cpus_allowed); - ndoms = 1; goto done; } @@ -636,7 +635,7 @@ restart: * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ - doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -656,7 +655,7 @@ restart: continue; } - dp = doms + nslot; + dp = doms[nslot]; if (nslot == ndoms) { static int warnings = 10; @@ -718,7 +717,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; get_online_cpus(); @@ -1324,9 +1323,10 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk, bool threadgroup) { + int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1343,18 +1343,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - return security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk, 0, NULL); + if (ret) + return ret; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + ret = security_task_setscheduler(c, 0, NULL); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + rcu_read_unlock(); + } + return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) +static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, + struct cpuset *cs) +{ + int err; + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + task_lock(tsk); + cpuset_change_task_nodemask(tsk, to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); + +} + +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk, + bool threadgroup) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1363,15 +1396,19 @@ static void cpuset_attach(struct cgroup_subsys *ss, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } - err = set_cpus_allowed_ptr(tsk, cpus_attach); - if (err) - return; - task_lock(tsk); - cpuset_change_task_nodemask(tsk, &to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); + /* do per-task migration stuff possibly for each in the threadgroup */ + cpuset_attach_task(tsk, &to, cs); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + cpuset_attach_task(c, &to, cs); + } + rcu_read_unlock(); + } + /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); @@ -2014,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; switch (phase) { @@ -2499,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = { }; #endif /* CONFIG_PROC_PID_CPUSET */ -/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ +/* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); - seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); - seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); diff --git a/kernel/cred.c b/kernel/cred.c index d7f7a01082eb..dd76cfe5f5b0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if (selinux_is_enabled()) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + /* * dump invalid credentials */ diff --git a/kernel/exit.c b/kernel/exit.c index 60d6fdcc9265..80ae941cfd2e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,6 +49,7 @@ #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); - sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - proc_sid_connector(curr); - } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -976,12 +975,14 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); /* + * FIXME: do that only when needed, using sched_exit tracepoint + */ + flush_ptrace_hw_breakpoint(tsk); + /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ @@ -993,8 +994,6 @@ NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif @@ -1097,28 +1096,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - int err; - - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1128,10 +1127,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1144,18 +1139,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1213,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; + cputime_t tgutime, tgstime; /* * The resource counters for the group leader are in its @@ -1228,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * need to protect the access to parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_times() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ + thread_group_times(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(tgutime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(tgstime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1485,13 +1486,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, @@ -1553,7 +1555,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1567,7 +1569,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } @@ -1575,15 +1577,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_pid(wo, p)) + return 0; + + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); +} + static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(¤t->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. @@ -1624,32 +1649,7 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } + remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } @@ -1694,6 +1694,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ diff --git a/kernel/fork.c b/kernel/fork.c index 51ad0b0b7266..3d6f121bbe8a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,7 +91,7 @@ int nr_processes(void) int cpu; int total = 0; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; @@ -434,6 +434,14 @@ __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> +static void mm_init_aio(struct mm_struct *mm) +{ +#ifdef CONFIG_AIO + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); +#endif +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -447,10 +455,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { @@ -511,6 +518,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -561,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) + if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); + tsk->robust_list = NULL; + } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) + if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } #endif + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); #endif /* Get rid of any cached register state */ @@ -636,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -864,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + sig->prev_utime = sig->prev_stime = cputime_zero; +#endif sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; @@ -979,6 +1002,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); + /* + * Siblings of global init remain as zombies on exit since they are + * not reaped by their parent (swapper). To solve this and to avoid + * multi-rooted process trees, prevent global and container-inits + * from creating siblings. + */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1020,9 +1053,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1039,8 +1069,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; +#endif p->default_timer_slack_ns = current->timer_slack_ns; @@ -1310,9 +1342,6 @@ bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/futex.c b/kernel/futex.c index 248dd119a86e..fb65e822fc41 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -89,36 +89,36 @@ struct futex_pi_state { union futex_key key; }; -/* - * We use this hashed waitqueue instead of a normal wait_queue_t, so +/** + * struct futex_q - The hashed futex queue entry, one per waiting task + * @task: the task waiting on the futex + * @lock_ptr: the hash bucket lock + * @key: the key the futex is hashed on + * @pi_state: optional priority inheritance state + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup + * + * We use this hashed waitqueue, instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then - * wake up q->waiter, then make the second condition true. + * the second. + * + * PI futexes are typically woken before they are removed from the hash list via + * the rt_mutex code. See unqueue_me_pi(). */ struct futex_q { struct plist_node list; - /* Waiter reference */ - struct task_struct *task; - /* Which hash list lock to use: */ + struct task_struct *task; spinlock_t *lock_ptr; - - /* Key which the futex is hashed on: */ union futex_key key; - - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - - /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; - - /* The expected requeue pi target futex key: */ union futex_key *requeue_pi_key; - - /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } @@ -198,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key) } /** - * get_futex_key - Get parameters which are the keys for a futex. - * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED - * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) + * get_futex_key() - Get parameters which are the keys for a futex + * @uaddr: virtual address of the futex + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, + * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -288,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } -/* - * fault_in_user_writeable - fault in user address and verify RW access +/** + * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write @@ -309,8 +311,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) /** * futex_top_waiter() - Return the highest priority waiter on a futex - * @hb: the hash bucket the futex_q's reside in - * @key: the futex key (to distinguish it from other futex futex_q's) + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ @@ -588,7 +590,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, } /** - * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex * @uaddr: the pi futex user address * @hb: the pi futex hash bucket * @key: the futex key associated with uaddr and hb @@ -915,8 +917,8 @@ retry: hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); - double_lock_hb(hb1, hb2); retry_private: + double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { @@ -1011,9 +1013,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * q: the futex_q - * key: the key of the requeue target futex - * hb: the hash_bucket of the requeue target futex + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key @@ -1027,7 +1029,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1225,6 +1226,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1303,6 +1305,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ @@ -1350,6 +1353,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) return hb; } +static inline void +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +{ + spin_unlock(&hb->lock); + drop_futex_key_refs(&q->key); +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1373,19 +1395,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) spin_unlock(&hb->lock); } -static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) -{ - spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); -} - -/* - * queue_me and unqueue_me must be called as a pair, each - * exactly once. They are called with the hashed spinlock held. +/** + * unqueue_me() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must + * be paired with exactly one earlier call to queue_me(). + * + * Returns: + * 1 - if the futex_q was still queued (and we removed unqueued it) + * 0 - if the futex_q was already removed by the waking thread */ - -/* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { spinlock_t *lock_ptr; @@ -1638,17 +1658,14 @@ out: static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { - queue_me(q, hb); - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using set_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE); + queue_me(q, hb); /* Arm the timer */ if (timeout) { @@ -1658,8 +1675,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, } /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + * If we have been removed from the hash list, then another task + * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* @@ -1776,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1793,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2102,11 +2125,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &q->list.plist); - drop_futex_key_refs(&q->key); + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; @@ -2114,12 +2138,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initialyl wait on (non-pi) + * @uaddr: the futex we initially wait on (non-pi) * @fshared: whether the futexes are shared (1) or not (0). They must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * @@ -2246,7 +2270,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, res = fixup_owner(uaddr2, fshared, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it - * acquired the lock, clear our -ETIMEDOUT or -EINTR. + * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) ret = (res < 0) ? res : 0; @@ -2302,9 +2326,9 @@ out: */ /** - * sys_set_robust_list - set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects + * sys_set_robust_list() - Set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) @@ -2323,10 +2347,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, } /** - * sys_get_robust_list - get the robust-futex list head of a task - * @pid: pid of the process [zero for current task] - * @head_ptr: pointer to a list-head pointer, the kernel fills it in - * @len_ptr: pointer to a length field, the kernel fills in the header size + * sys_get_robust_list() - Get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 654efd09f6a9..70a298d6da71 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 || (PPC && EXPERIMENTAL) + depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e5d98ce50f89..3e1c36e7998f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -509,13 +509,14 @@ static inline int hrtimer_hres_active(void) * next event * Called with interrupts disabled and base->lock held */ -static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { int i; struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires; + ktime_t expires, expires_next; - cpu_base->expires_next.tv64 = KTIME_MAX; + expires_next.tv64 = KTIME_MAX; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -531,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) */ if (expires.tv64 < 0) expires.tv64 = 0; - if (expires.tv64 < cpu_base->expires_next.tv64) - cpu_base->expires_next = expires; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; } + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } @@ -617,7 +623,7 @@ static void retrigger_next_event(void *arg) base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); - hrtimer_force_reprogram(base); + hrtimer_force_reprogram(base, 0); spin_unlock(&base->lock); } @@ -720,8 +726,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } @@ -730,7 +734,8 @@ static int hrtimer_switch_to_hres(void) static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, int wakeup) @@ -873,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - if (timer->state & HRTIMER_STATE_ENQUEUED) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) - hrtimer_force_reprogram(base->cpu_base); + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + /* + * Remove the timer from the rbtree and replace the first + * entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); } - rb_erase(&timer->node, &base->active); +#endif } + rb_erase(&timer->node, &base->active); +out: timer->state = newstate; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a4927b785..0c642d51aac2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_lock(); do_each_thread(g, t) { - if (!--max_count) + if (!max_count--) goto unlock; if (!--batch_count) { batch_count = HUNG_TASK_BATCHING; @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..cf5ee1628411 --- /dev/null +++ b/kernel/hw_breakpoint.c @@ -0,0 +1,423 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include <linux/irqflags.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/hw_breakpoint.h> + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu) +{ + int i; + unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + + for (i = HBP_NUM -1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +{ + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned += max_task_bp_pinned(cpu); + slots->flexible = per_cpu(nr_bp_flexible, cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr += max_task_bp_pinned(cpu); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible, cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +{ + int count = 0; + struct perf_event *bp; + struct perf_event_context *ctx = tsk->perf_event_ctxp; + unsigned int *tsk_pinned; + struct list_head *list; + unsigned long flags; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return; + + list = &ctx->event_list; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) + return; + + tsk_pinned = per_cpu(task_bp_pinned, cpu); + if (enable) { + tsk_pinned[count]++; + if (count > 0) + tsk_pinned[count-1]--; + } else { + tsk_pinned[count]--; + if (count > 0) + tsk_pinned[count-1]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void toggle_bp_slot(struct perf_event *bp, bool enable) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + /* Pinned counter task profiling */ + if (tsk) { + if (cpu >= 0) { + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + for_each_online_cpu(cpu) + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + /* Pinned counter cpu profiling */ + if (enable) + per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + else + per_cpu(nr_cpu_bp_pinned, bp->cpu)--; +} + +/* + * Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + */ +int reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + int ret = 0; + + mutex_lock(&nr_bp_mutex); + + fetch_bp_busy_slots(&slots, bp->cpu); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) == HBP_NUM) { + ret = -ENOSPC; + goto end; + } + + toggle_bp_slot(bp, true); + +end: + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + toggle_bp_slot(bp, false); + + mutex_unlock(&nr_bp_mutex); +} + + +int __register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + /* + * Ptrace breakpoints can be temporary perf events only + * meant to reserve a slot. In this case, it is created disabled and + * we don't want to check the params right now (as we put a null addr) + * But perf tools create events as disabled and we want to check + * the params for them. + * This is a quick hack that will be removed soon, once we remove + * the tmp breakpoints from ptrace + */ + if (!bp->attr.disabled || bp->callback == perf_bp_event) + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + + return ret; +} + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + bp->callback = perf_bp_event; + + return __register_perf_hw_breakpoint(bp); +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_callback_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, + perf_callback_t triggered, + struct task_struct *tsk) +{ + /* + * FIXME: do it without unregistering + * - We don't want to lose our slot + * - If the new bp is incorrect, don't lose the older one + */ + unregister_hw_breakpoint(bp); + + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event ** +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_callback_t triggered) +{ + struct perf_event **cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + + return cpu_events; + +fail: + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); + /* return the error if any */ + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static int __init init_hw_breakpoint(void) +{ + return register_die_notifier(&hw_breakpoint_exceptions_nb); +} +core_initcall(init_hw_breakpoint); + + +struct pmu perf_ops_bp = { + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, + .unthrottle = hw_breakpoint_pmu_unthrottle +}; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c1660194d115..ba566c261adc 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -166,11 +166,11 @@ int set_irq_data(unsigned int irq, void *data) EXPORT_SYMBOL(set_irq_data); /** - * set_irq_data - set irq type data for an irq + * set_irq_msi - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * - * Set the hardware irq controller data for an irq + * Set the MSI descriptor entry for an irq */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { @@ -590,7 +590,7 @@ out_unlock: } /** - * handle_percpu_IRQ - Per CPU local irq handler + * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq * diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80554db..17c71bb565c6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 692363dd591f..0832145fea97 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -136,7 +136,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, NULL); + return single_open(file, default_affinity_show, PDE(inode)->data); } static const struct file_operations default_affinity_proc_fops = { @@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = { }; #endif -static int irq_spurious_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_spurious_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_to_desc((long) data); - return sprintf(page, "count %u\n" - "unhandled %u\n" - "last_unhandled %u ms\n", - desc->irq_count, - desc->irqs_unhandled, - jiffies_to_msecs(desc->last_unhandled)); + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", + desc->irq_count, desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); + return 0; +} + +static int irq_spurious_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_spurious_proc_show, NULL); } +static const struct file_operations irq_spurious_proc_fops = { + .open = irq_spurious_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - struct proc_dir_entry *entry; if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; @@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); + if (!desc->dir) + return; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ @@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, desc->dir); - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_spurious_read; - } + proc_create_data("spurious", 0444, desc->dir, + &irq_spurious_proc_fops, (void *)(long)irq); } #undef MAX_NAMELEN diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704760fe..22b0a6eedf24 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -104,7 +104,7 @@ static int misrouted_irq(int irq) return ok; } -static void poll_all_shared_irqs(void) +static void poll_spurious_irqs(unsigned long dummy) { struct irq_desc *desc; int i; @@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void) if (!(status & IRQ_SPURIOUS_DISABLED)) continue; + local_irq_disable(); try_one_irq(i, desc); + local_irq_enable(); } -} - -static void poll_spurious_irqs(unsigned long dummy) -{ - poll_all_shared_irqs(); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } -#ifdef CONFIG_DEBUG_SHIRQ -void debug_poll_all_shared_irqs(void) -{ - poll_all_shared_irqs(); -} -#endif - /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8b6b8b697c68..8e5288a8a355 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 9147a3190c9d..7d7014634022 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) /* * All threads that don't have debuggerinfo should be - * in __schedule() sleeping, since all other CPUs + * in schedule() sleeping, since all other CPUs * are in kgdb_wait, and thus have debuggerinfo. */ if (local_debuggerinfo) { diff --git a/kernel/kmod.c b/kernel/kmod.c index 689d20f39305..25b103190364 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; - ret = security_kernel_module_request(); - if (ret) - return ret; - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; + ret = security_kernel_module_request(module_name); + if (ret) + return ret; + /* If modprobe needs a service that is in a module, we get a recursive * loop. Limit the number of running kmod threads to max_threads/2 or * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method @@ -143,7 +143,6 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - enum umh_wait wait = sub_info->wait; int retval; BUG_ON(atomic_read(&sub_info->cred->usage) != 1); @@ -185,14 +184,10 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - if (wait == UMH_WAIT_EXEC) - complete(sub_info->complete); - retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ - if (wait != UMH_WAIT_EXEC) - sub_info->retval = retval; + sub_info->retval = retval; do_exit(0); } @@ -271,14 +266,16 @@ static void __call_usermodehelper(struct work_struct *work) switch (wait) { case UMH_NO_WAIT: - case UMH_WAIT_EXEC: break; case UMH_WAIT_PROC: if (pid > 0) break; sub_info->retval = pid; - break; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: + complete(sub_info->complete); } } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cfadc1291d0b..e5342a344c43 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) */ static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, + {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, {NULL} /* Terminator */ }; @@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = get_kprobe(p->addr); + if (unlikely(!old_p)) + return NULL; + + if (p != old_p) { + list_for_each_entry_rcu(list_p, &old_p->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + goto valid; + return NULL; + } +valid: + return old_p; +} + +/* Return error if the kprobe is being re-registered */ +static inline int check_kprobe_rereg(struct kprobe *p) +{ + int ret = 0; + struct kprobe *old_p; + + mutex_lock(&kprobe_mutex); + old_p = __get_valid_kprobe(p); + if (old_p) + ret = -EINVAL; + mutex_unlock(&kprobe_mutex); + return ret; +} + int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; @@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; p->addr = addr; + ret = check_kprobe_rereg(p); + if (ret) + return ret; + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { @@ -754,26 +795,6 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) -{ - struct kprobe *old_p, *list_p; - - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) - return NULL; - - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return old_p; -} - /* * Unregister a kprobe without a scheduler synchronization. */ @@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * NR_CPUS); + rp->maxactive = max(10, 2 * num_possible_cpus()); #else - rp->maxactive = NR_CPUS; + rp->maxactive = num_possible_cpus(); #endif } spin_lock_init(&rp->lock); @@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +void __kprobes dump_kprobe(struct kprobe *kp) +{ + printk(KERN_WARNING "Dumping kprobe:\n"); + printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + kp->symbol_name, kp->addr, kp->offset); +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -1333,7 +1361,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp) return seq_open(filp, &kprobes_seq_ops); } -static struct file_operations debugfs_kprobes_operations = { +static const struct file_operations debugfs_kprobes_operations = { .open = kprobes_open, .read = seq_read, .llseek = seq_lseek, @@ -1515,7 +1543,7 @@ static ssize_t write_enabled_file_bool(struct file *file, return count; } -static struct file_operations fops_kp = { +static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, }; diff --git a/kernel/kthread.c b/kernel/kthread.c index 5fe709982caa..ab7ae57773e1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3815ac1d58b2..f5dcd36d3151 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,7 +49,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include <trace/events/lockdep.h> +#include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; @@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static inline u64 lockstat_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + static int lock_point(unsigned long points[], unsigned long ip) { int i; @@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip) return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static void lock_time_inc(struct lock_time *lt, u64 time) { if (time > lt->max) lt->max = time; @@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats) static void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; - s64 holdtime; + u64 holdtime; if (!lock_stat) return; - holdtime = sched_clock() - hlock->holdtime_stamp; + holdtime = lockstat_clock() - hlock->holdtime_stamp; stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) @@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); + hlock->holdtime_stamp = lockstat_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) @@ -3322,7 +3327,7 @@ found_it: if (hlock->instance != lock) return; - hlock->waittime_stamp = sched_clock(); + hlock->waittime_stamp = lockstat_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, @@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - u64 now; - s64 waittime = 0; + u64 now, waittime = 0; int i, cpu; depth = curr->lockdep_depth; @@ -3374,7 +3378,7 @@ found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { - now = sched_clock(); + now = lockstat_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } diff --git a/kernel/module.c b/kernel/module.c index e6bc4b28aa62..5842a71cf052 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1187,7 +1187,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC) + if (sechdrs[i].sh_flags & SHF_ALLOC + && sechdrs[i].sh_size) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1207,6 +1208,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; + if (!sechdrs[i].sh_size) + continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, GFP_KERNEL); @@ -1797,6 +1800,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } } +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + #ifdef CONFIG_KALLSYMS /* lookup symbol in given range of kernel_symbols */ @@ -1862,13 +1876,93 @@ static char elf_type(const Elf_Sym *sym, return '?'; } +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +static unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + unsigned long symoffs; + Elf_Shdr *symsect = sechdrs + symindex; + Elf_Shdr *strsect = sechdrs + strindex; + const Elf_Sym *src; + const char *strtab; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + + src = (void *)hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + strtab = (void *)hdr + strsect->sh_offset; + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + unsigned int j = src->st_name; + + while(!__test_and_set_bit(j, strmap) && strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + *pstroffs = mod->core_size; + __set_bit(0, strmap); + mod->core_size += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; +} + static void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + unsigned long *strmap) { - unsigned int i; + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; mod->symtab = (void *)sechdrs[symindex].sh_addr; mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); @@ -1878,13 +1972,46 @@ static void add_kallsyms(struct module *mod, for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + + mod->core_symtab = dst = mod->module_core + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, sechdrs, shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; } #else +static inline unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + return 0; +} + static inline void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + const unsigned long *strmap) { } #endif /* CONFIG_KALLSYMS */ @@ -1959,6 +2086,8 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + unsigned long symoffs, stroffs, *strmap; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2040,11 +2169,6 @@ static noinline struct module *load_module(void __user *umod, /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2080,6 +2204,13 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) + * sizeof(long), GFP_KERNEL); + if (!strmap) { + err = -ENOMEM; + goto free_mod; + } + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2109,6 +2240,8 @@ static noinline struct module *load_module(void __user *umod, this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, hdr, sechdrs, secstrings); + symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, + secstrings, &stroffs, strmap); /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2313,7 +2446,10 @@ static noinline struct module *load_module(void __user *umod, percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, + symoffs, stroffs, secstrings, strmap); + kfree(strmap); + strmap = NULL; if (!mod->taints) { struct _ddebug *debug; @@ -2385,13 +2521,14 @@ static noinline struct module *load_module(void __user *umod, synchronize_sched(); module_arch_cleanup(mod); cleanup: + free_modinfo(mod); kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - free_init: percpu_modfree(mod->refptr); + free_init: #endif module_free(mod, mod->module_init); free_core: @@ -2402,6 +2539,7 @@ static noinline struct module *load_module(void __user *umod, percpu_modfree(percpu); free_mod: kfree(args); + kfree(strmap); free_hdr: vfree(hdr); return ERR_PTR(err); @@ -2491,6 +2629,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, /* Drop initial reference. */ module_put(mod); trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2952,7 +3095,6 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, - struct marker *marker, struct tracepoint *tp) { } diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e5a560..ec815a960b5d 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/poison.h> +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/interrupt.h> diff --git a/kernel/mutex.c b/kernel/mutex.c index 947b3ad551f8..632f04c57d82 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ - !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Optimistic spinning. * diff --git a/kernel/notifier.c b/kernel/notifier.c index 61d5aa5eced3..acd24e7643eb 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace notify_die(enum die_val val, const char *str, +int notrace __kprobes notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 5aa854f9e5ae..2a5dfec8efe0 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) +static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, + struct task_struct *task, bool threadgroup) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (!cgroup_is_descendant(new_cgroup, c)) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index bcdef26e3332..96b45d0b4ba5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + bust_spinlocks(0); + if (!panic_blink) panic_blink = no_blink; @@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - bust_spinlocks(0); } EXPORT_SYMBOL(panic); diff --git a/kernel/params.c b/kernel/params.c index 7f6912ced2ba..d656c276508d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/ctype.h> #if 0 #define DEBUGP printk @@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val) } for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) + if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') @@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (*next == ' ') + while (isspace(*next)) next++; return next; } @@ -138,7 +139,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (*args == ' ') + while (isspace(*args)) args++; while (*args) { @@ -217,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) - kfree(*(char **)kp->arg); - /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) + if (!*(char **)kp->arg) return -ENOMEM; } else *(const char **)kp->arg = val; @@ -303,6 +300,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), + u16 flags, unsigned int *num) { int ret; @@ -312,6 +310,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; + kp.flags = flags; /* No equals sign? */ if (!val) { @@ -357,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); + arr->elemsize, arr->set, kp->flags, + arr->num ?: &temp_num); } int param_array_get(char *buffer, struct kernel_param *kp) @@ -604,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) - kfree(*(char **)params[i].arg); + /* FIXME: This should free kmalloced charp parameters. It doesn't. */ } static void __init kernel_add_sysfs_param(const char *name, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 76ac4db405e9..6b7ddba1dd64 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -20,6 +20,7 @@ #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/vmstat.h> +#include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> #include <linux/uaccess.h> @@ -27,6 +28,8 @@ #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/perf_event.h> +#include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> @@ -243,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -291,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; + update_event_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -444,50 +502,11 @@ retry: * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&event->group_entry)) { + if (!list_empty(&event->group_entry)) list_del_event(event, ctx); - } spin_unlock_irq(&ctx->lock); } -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - event->total_time_enabled = ctx->time - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = ctx->time; - - event->total_time_running = run_end - event->tstamp_running; -} - /* * Update total_time_enabled and total_time_running for all events in a group. */ @@ -1031,12 +1050,8 @@ void __perf_event_sched_out(struct perf_event_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event != event->group_leader) - event_sched_out(event, cpuctx, ctx); - else - group_sched_out(event, cpuctx, ctx); - } + list_for_each_entry(event, &ctx->group_list, group_entry) + group_sched_out(event, cpuctx, ctx); } perf_enable(); out: @@ -1062,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_event_read(void *event); - static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { @@ -1081,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event, */ switch (event->state) { case PERF_EVENT_STATE_ACTIVE: - __perf_event_read(event); - break; + event->pmu->read(event); + /* fall-through */ case PERF_EVENT_STATE_INACTIVE: update_event_times(event); @@ -1121,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, if (!ctx->nr_stat) return; + update_context_time(ctx); + event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); @@ -1164,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task, if (likely(!ctx || !cpuctx->task_ctx)) return; - update_context_time(ctx); - rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp; @@ -1258,12 +1271,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) - event_sched_in(event, cpuctx, ctx, cpu); - else { - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); - } + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); /* * If this pinned group hasn't been scheduled, @@ -1291,15 +1300,9 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != cpu) continue; - if (event != event->group_leader) { - if (event_sched_in(event, cpuctx, ctx, cpu)) + if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; - } else { - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } } perf_enable(); out: @@ -1368,7 +1371,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) u64 interrupts, freq; spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -1528,7 +1531,6 @@ static void __perf_event_read(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - unsigned long flags; /* * If this is a task context, we need to check whether it is @@ -1540,12 +1542,12 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - event->pmu->read(event); + spin_lock(&ctx->lock); + update_context_time(ctx); update_event_times(event); - local_irq_restore(flags); + spin_unlock(&ctx->lock); + + event->pmu->read(event); } static u64 perf_event_read(struct perf_event *event) @@ -1558,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event) smp_call_function_single(event->oncpu, __perf_event_read, event, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); update_event_times(event); + spin_unlock_irqrestore(&ctx->lock, flags); } return atomic64_read(&event->count); @@ -1671,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } +static void perf_event_free_filter(struct perf_event *event); + static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -1678,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head) event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); + perf_event_free_filter(event); kfree(event); } @@ -1709,16 +1720,10 @@ static void free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) +int perf_event_release_kernel(struct perf_event *event) { - struct perf_event *event = file->private_data; struct perf_event_context *ctx = event->ctx; - file->private_data = NULL; - WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); @@ -1733,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + + file->private_data = NULL; + + return perf_event_release_kernel(event); +} static int perf_event_read_size(struct perf_event *event) { @@ -1759,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event) return size; } -static u64 perf_event_read_value(struct perf_event *event) +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); total += perf_event_read(event); - list_for_each_entry(child, &event->child_list, child_list) + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); return total; } - -static int perf_event_read_entry(struct perf_event *event, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} +EXPORT_SYMBOL_GPL(perf_event_read_value); static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_event_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; + goto unlock; - size += err; + ret = size; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_event_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; + n = 0; + + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); + + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } - size += err; + ret += size; } +unlock: + mutex_unlock(&ctx->mutex); - return size; + return ret; } static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { + u64 enabled, running; u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + - atomic64_read(&event->child_total_time_running); - } + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); @@ -1874,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) ret = perf_event_read_group(event, read_format, buf); else ret = perf_event_read_one(event, read_format, buf); - mutex_unlock(&event->child_mutex); return ret; } @@ -1987,7 +2006,8 @@ unlock: return ret; } -int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2015,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: return perf_event_set_output(event, arg); + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + default: return -ENOTTY; } @@ -2105,49 +2128,31 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static unsigned long perf_data_size(struct perf_mmap_data *data) { - struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; + return data->nr_pages << (PAGE_SHIFT + data->data_order); +} - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; +#ifndef CONFIG_PERF_USE_VMALLOC - vmf->page = virt_to_page(data->data_pages[nr]); - } +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > data->nr_pages) + return NULL; - ret = 0; -unlock: - rcu_read_unlock(); + if (pgoff == 0) + return virt_to_page(data->user_page); - return ret; + return virt_to_page(data->data_pages[pgoff - 1]); } -static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { struct perf_mmap_data *data; unsigned long size; @@ -2172,19 +2177,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) goto fail_data_pages; } + data->data_order = 0; data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - if (event->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - event->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - - rcu_assign_pointer(event->data, data); - - return 0; + return data; fail_data_pages: for (i--; i >= 0; i--) @@ -2196,7 +2192,7 @@ fail_user_page: kfree(data); fail: - return -ENOMEM; + return NULL; } static void perf_mmap_free_page(unsigned long addr) @@ -2207,28 +2203,170 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void __perf_mmap_data_free(struct rcu_head *rcu_head) +static void perf_mmap_data_free(struct perf_mmap_data *data) { - struct perf_mmap_data *data; int i; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > (1UL << data->data_order)) + return NULL; + + return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_mmap_data_free_work(struct work_struct *work) +{ + struct perf_mmap_data *data; + void *base; + int i, nr; + + data = container_of(work, struct perf_mmap_data, work); + nr = 1 << data->data_order; + + base = data->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + schedule_work(&data->work); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + void *all_buf; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + INIT_WORK(&data->work, perf_mmap_data_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + data->user_page = all_buf; + data->data_pages[0] = all_buf + PAGE_SIZE; + data->data_order = ilog2(nr_pages); + data->nr_pages = 1; + + return data; + +fail_all_buf: kfree(data); + +fail: + return NULL; +} + +#endif + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(data, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; } -static void perf_mmap_data_free(struct perf_event *event) +static void +perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +{ + long max_size = perf_data_size(data); + + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, max_size, + event->attr.wakeup_watermark); + } + + if (!data->watermark) + data->watermark = max_size / 2; + + + rcu_assign_pointer(event->data, data); +} + +static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + perf_mmap_data_free(data); +} + +static void perf_mmap_data_release(struct perf_event *event) { struct perf_mmap_data *data = event->data; WARN_ON(atomic_read(&event->mmap_count)); rcu_assign_pointer(event->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); + call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2244,16 +2382,17 @@ static void perf_mmap_close(struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->data); struct user_struct *user = current_user(); - atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_free(event); + perf_mmap_data_release(event); mutex_unlock(&event->mmap_mutex); } } -static struct vm_operations_struct perf_mmap_vmops = { +static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, .fault = perf_mmap_fault, @@ -2266,6 +2405,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; + struct perf_mmap_data *data; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2328,10 +2468,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } WARN_ON(event->data); - ret = perf_mmap_data_alloc(event, nr_pages); - if (ret) + + data = perf_mmap_data_alloc(event, nr_pages); + ret = -ENOMEM; + if (!data) goto unlock; + ret = 0; + perf_mmap_data_init(event, data); + atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; @@ -2519,7 +2664,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, if (!data->writable) return true; - mask = (data->nr_pages << PAGE_SHIFT) - 1; + mask = perf_data_size(data) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2558,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle) static void perf_output_lock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cpu; + int cur, cpu = get_cpu(); handle->locked = 0; - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; + for (;;) { + cur = atomic_cmpxchg(&data->lock, -1, cpu); + if (cur == -1) { + handle->locked = 1; + break; + } + if (cur == cpu) + break; - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); - - handle->locked = 1; + } } static void perf_output_unlock(struct perf_output_handle *handle) @@ -2617,14 +2763,14 @@ again: if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: - local_irq_restore(handle->flags); + put_cpu(); } void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { unsigned int pages_mask; - unsigned int offset; + unsigned long offset; unsigned int size; void **pages; @@ -2633,12 +2779,14 @@ void perf_output_copy(struct perf_output_handle *handle, pages = handle->data->data_pages; do { - unsigned int page_offset; + unsigned long page_offset; + unsigned long page_size; int nr; nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); + page_offset = offset & (page_size - 1); + size = min_t(unsigned int, page_size - page_offset, len); memcpy(pages[nr] + page_offset, buf, size); @@ -3126,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_task_match(event)) perf_event_task_output(event, task_event); } - rcu_read_unlock(); } static void perf_event_task_event(struct perf_task_event *task_event) @@ -3142,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) struct perf_cpu_context *cpuctx; struct perf_event_context *ctx = task_event->task_ctx; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); if (!ctx) ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) @@ -3238,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_comm_match(event)) perf_event_comm_output(event, comm_event); } - rcu_read_unlock(); } static void perf_event_comm_event(struct perf_comm_event *comm_event) @@ -3257,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) char comm[TASK_COMM_LEN]; memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3265,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3362,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_mmap_match(event, mmap_event)) perf_event_mmap_output(event, mmap_event); } - rcu_read_unlock(); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -3426,11 +3559,11 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3569,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, perf_event_disable(event); } - perf_event_output(event, nmi, data, regs); + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + return ret; } @@ -3614,16 +3751,16 @@ again: return nr; } -static void perf_swevent_overflow(struct perf_event *event, +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; - u64 overflow; data->period = event->hw.last_period; - overflow = perf_swevent_set_period(event); + if (!overflow) + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; @@ -3656,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, atomic64_add(nr, &event->count); + if (!regs) + return; + if (!hwc->sample_period) return; - if (!regs) + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (atomic64_add_negative(nr, &hwc->period_left)) return; - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swevent_overflow(event, nmi, data, regs); + perf_swevent_overflow(event, 0, nmi, data, regs); } static int perf_swevent_is_counting(struct perf_event *event) @@ -3696,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event) return 1; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event_id, struct pt_regs *regs) + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) { if (!perf_swevent_is_counting(event)) return 0; if (event->attr.type != type) return 0; + if (event->attr.config != event_id) return 0; - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 0; + if (perf_exclude_event(event, regs)) + return 0; - if (event->attr.exclude_kernel && !user_mode(regs)) - return 0; - } + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; return 1; } @@ -3727,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_swevent_match(event, type, event_id, regs)) + if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } - rcu_read_unlock(); } -static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +int perf_swevent_get_recursion_context(void) { + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int rctx; + if (in_nmi()) - return &cpuctx->recursion[3]; + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; - if (in_irq()) - return &cpuctx->recursion[2]; + if (cpuctx->recursion[rctx]) { + put_cpu_var(perf_cpu_context); + return -1; + } - if (in_softirq()) - return &cpuctx->recursion[1]; + cpuctx->recursion[rctx]++; + barrier(); - return &cpuctx->recursion[0]; + return rctx; +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); + +void perf_swevent_put_recursion_context(int rctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + barrier(); + cpuctx->recursion[rctx]--; + put_cpu_var(perf_cpu_context); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - + cpuctx = &__get_cpu_var(perf_cpu_context); + rcu_read_lock(); perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3778,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (ctx) perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - struct perf_sample_data data = { - .addr = addr, - }; + struct perf_sample_data data; + int rctx; - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, - &data, regs); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + data.addr = addr; + data.raw = NULL; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); } static void perf_swevent_read(struct perf_event *event) @@ -3839,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.period = event->hw.last_period; regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt @@ -3849,8 +4022,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); @@ -3859,6 +4033,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) return ret; } +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period; + + if (hwc->remaining) { + if (hwc->remaining < 0) + period = 10000; + else + period = hwc->remaining; + hwc->remaining = 0; + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + hwc->remaining = ktime_to_ns(remaining); + + hrtimer_cancel(&hwc->hrtimer); + } +} + /* * Software event: cpu wall time clock */ @@ -3881,22 +4091,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + perf_swevent_start_hrtimer(event); return 0; } static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); cpu_clock_perf_event_update(event); } @@ -3933,22 +4135,15 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + + perf_swevent_start_hrtimer(event); return 0; } static void task_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); task_clock_perf_event_update(event, event->ctx->time); } @@ -3976,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE + void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { @@ -3994,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); + /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} static void tp_perf_event_destroy(struct perf_event *event) { @@ -4025,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return &perf_ops_generic; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + #else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + int err; + /* + * The breakpoint is already filled if we haven't created the counter + * through perf syscall + * FIXME: manage to get trigerred to NULL if it comes from syscalls + */ + if (!bp->callback) + err = register_perf_hw_breakpoint(bp); + else + err = __register_perf_hw_breakpoint(bp); + if (err) + return ERR_PTR(err); + + bp->destroy = bp_perf_event_destroy; + + return &perf_ops_bp; +} + +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + sample.addr = bp->attr.bp_addr; + + if (!perf_exclude_event(bp, regs)) + perf_swevent_add(bp, 1, 1, &sample, regs); +} +#else +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + return NULL; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ +} #endif atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -4076,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + case PERF_COUNT_SW_ALIGNMENT_FAULTS: + case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; @@ -4096,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, + perf_callback_t callback, gfp_t gfpflags) { const struct pmu *pmu; @@ -4138,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (!callback && parent_event) + callback = parent_event->callback; + + event->callback = callback; + if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4172,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr, pmu = tp_perf_event_init(event); break; + case PERF_TYPE_BREAKPOINT: + pmu = bp_perf_event_init(event); + break; + + default: break; } @@ -4284,7 +4589,7 @@ err_size: goto out; } -int perf_event_set_output(struct perf_event *event, int output_fd) +static int perf_event_set_output(struct perf_event *event, int output_fd) { struct perf_event *output_event = NULL; struct file *output_file = NULL; @@ -4414,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open, } event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); + NULL, NULL, GFP_KERNEL); err = PTR_ERR(event); if (IS_ERR(event)) goto err_put_context; @@ -4462,6 +4767,60 @@ err_put_context: return err; } +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @pid: task to profile + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + pid_t pid, perf_callback_t callback) +{ + struct perf_event *event; + struct perf_event_context *ctx; + int err; + + /* + * Get the target context (task or percpu): + */ + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_exit; + } + + event = perf_event_alloc(attr, cpu, ctx, NULL, + NULL, callback, GFP_KERNEL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_put_context; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + return event; + + err_put_context: + put_ctx(ctx); + err_exit: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + /* * inherit a event from parent task to child task: */ @@ -4487,7 +4846,7 @@ inherit_event(struct perf_event *parent_event, child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child_ctx, group_leader, parent_event, - GFP_KERNEL); + NULL, GFP_KERNEL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -4505,6 +4864,8 @@ inherit_event(struct perf_event *parent_event, if (parent_event->attr.freq) child_event->hw.sample_period = parent_event->hw.sample_period; + child_event->overflow_handler = parent_event->overflow_handler; + /* * Link it up in the child's context: */ @@ -4594,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - update_event_times(child_event); perf_event_remove_from_context(child_event); parent_event = child_event->parent; @@ -4646,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child) * the events from it. */ unclone_ctx(child_ctx); + update_context_time(child_ctx); spin_unlock_irqrestore(&child_ctx->lock, flags); /* @@ -4781,9 +5142,7 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { - if (event != event->group_leader) - continue; + list_for_each_entry(event, &parent_ctx->group_list, group_entry) { if (!event->attr.inherit) { inherited_all = 0; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 821722ae58a7..86b3796b0436 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & CLONE_THREAD) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 04b3a83d686f..04a9e90d248f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,21 +693,22 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; + swsusp_close(FMODE_READ); goto Unlock; } pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto Finish; + goto close_finish; error = usermodehelper_disable(); if (error) - goto Finish; + goto close_finish; error = create_basic_memory_bitmaps(); if (error) - goto Finish; + goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -719,6 +720,7 @@ static int software_resume(void) pr_debug("PM: Reading hibernation image.\n"); error = swsusp_read(&flags); + swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -737,6 +739,9 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; } late_initcall(software_resume); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1acf9c..25596e450ac7 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c86d48..890f6b11b1d3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> @@ -315,7 +314,6 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; @@ -330,26 +328,27 @@ static int save_image(struct swap_map_handle *handle, nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { + while (1) { ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) + if (!ret) + ret = err2; + if (!ret) printk("\b\b\b\bdone\n"); + else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + return ret; } /** @@ -537,7 +536,8 @@ static int load_image(struct swap_map_handle *handle, snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; - } + } else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } @@ -573,8 +573,6 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev, FMODE_READ); - if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -597,7 +595,7 @@ int swsusp_check(void) error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); @@ -605,8 +603,10 @@ int swsusp_check(void) error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) blkdev_put(resume_bdev, FMODE_READ); else diff --git a/kernel/printk.c b/kernel/printk.c index f38b07f78a4e..b5ac4d99c667 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/kexec.h> +#include <linux/ratelimit.h> #include <asm/uaccess.h> @@ -1376,11 +1377,11 @@ late_initcall(disable_boot_consoles); */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); -int printk_ratelimit(void) +int __printk_ratelimit(const char *func) { - return __ratelimit(&printk_ratelimit_state); + return ___ratelimit(&printk_ratelimit_state, func); } -EXPORT_SYMBOL(printk_ratelimit); +EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285af59e..23bd09cd042e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 37ac45483082..9b7fd4723878 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,23 +44,13 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> -#include <linux/kernel_stat.h> -enum rcu_barrier { - RCU_BARRIER_STD, - RCU_BARRIER_BH, - RCU_BARRIER_SCHED, -}; - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; -int rcu_scheduler_active __read_mostly; - -static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); -static struct rcu_head rcu_migrate_head[3]; -static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -73,241 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } - -#ifdef CONFIG_TREE_PREEMPT_RCU - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - if (!rcu_scheduler_active) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. - */ -void synchronize_sched(void) -{ - struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - */ -void synchronize_rcu_bh(void) -{ - struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. - */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - - atomic_inc(&rcu_barrier_cpu_count); - switch ((enum rcu_barrier)type) { - case RCU_BARRIER_STD: - call_rcu(head, rcu_barrier_callback); - break; - case RCU_BARRIER_BH: - call_rcu_bh(head, rcu_barrier_callback); - break; - case RCU_BARRIER_SCHED: - call_rcu_sched(head, rcu_barrier_callback); - break; - } -} - -static inline void wait_migrated_callbacks(void) -{ - wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); - smp_mb(); /* In case we didn't sleep. */ -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(enum rcu_barrier type) -{ - BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. - */ - atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)type, 1); - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); - wait_migrated_callbacks(); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(RCU_BARRIER_STD); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(RCU_BARRIER_BH); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(RCU_BARRIER_SCHED); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -static void rcu_migrate_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_migrate_type_count)) - wake_up(&rcu_migrate_wq); -} - -extern int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); - -static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - rcu_cpu_notify(self, action, hcpu); - if (action == CPU_DYING) { - /* - * preempt_disable() in on_each_cpu() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(), - * and the dead cpu(if it exist) queues rcu_migrate_callback()s. - * - * These callbacks ensure _rcu_barrier() waits for all - * RCU callbacks of the specified type to complete. - */ - atomic_set(&rcu_migrate_type_count, 3); - call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); - call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); - call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_DOWN_PREPARE) { - /* Don't need to wait until next removal operation. */ - /* rcu_migrate_head is protected by cpu_add_remove_lock */ - wait_migrated_callbacks(); - } - - return NOTIFY_OK; -} - -void __init rcu_init(void) -{ - int i; - - __rcu_init(); - cpu_notifier(rcu_barrier_cpu_hotplug, 0); - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - for_each_online_cpu(i) - rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i); -} - -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c new file mode 100644 index 000000000000..9f6d9ff2572c --- /dev/null +++ b/kernel/rcutiny.c @@ -0,0 +1,282 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include <linux/moduleparam.h> +#include <linux/completion.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/cpu.h> + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_ctrlblk = { + .donetail = &rcu_ctrlblk.rcucblist, + .curtail = &rcu_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_NO_HZ + +static long rcu_dynticks_nesting = 1; + +/* + * Enter dynticks-idle mode, which is an extended quiescent state + * if we have fully entered that mode (i.e., if the new value of + * dynticks_nesting is zero). + */ +void rcu_enter_nohz(void) +{ + if (--rcu_dynticks_nesting == 0) + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} + +/* + * Exit dynticks-idle mode, so that we are no longer in an extended + * quiescent state. + */ +void rcu_exit_nohz(void) +{ + rcu_dynticks_nesting++; +} + +#endif /* #ifdef CONFIG_NO_HZ */ + +/* + * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). + * Also disable irqs to avoid confusion due to interrupt handlers + * invoking call_rcu(). + */ +static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcp->rcucblist != NULL && + rcp->donetail != rcp->curtail) { + rcp->donetail = rcp->curtail; + local_irq_restore(flags); + return 1; + } + local_irq_restore(flags); + + return 0; +} + +/* + * Record an rcu quiescent state. And an rcu_bh quiescent state while we + * are at it, given that any rcu quiescent state is also an rcu_bh + * quiescent state. Use "+" instead of "||" to defeat short circuiting. + */ +void rcu_sched_qs(int cpu) +{ + if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Record an rcu_bh quiescent state. + */ +void rcu_bh_qs(int cpu) +{ + if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Check to see if the scheduling-clock interrupt came from an extended + * quiescent state, and, if so, tell RCU about it. + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && + !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) + rcu_sched_qs(cpu); + else if (!in_softirq()) + rcu_bh_qs(cpu); +} + +/* + * Helper function for rcu_process_callbacks() that operates on the + * specified rcu_ctrlkblk structure. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +{ + struct rcu_head *next, *list; + unsigned long flags; + + /* If no RCU callbacks ready to invoke, just return. */ + if (&rcp->rcucblist == rcp->donetail) + return; + + /* Move the ready-to-invoke callbacks to a local list. */ + local_irq_save(flags); + list = rcp->rcucblist; + rcp->rcucblist = *rcp->donetail; + *rcp->donetail = NULL; + if (rcp->curtail == rcp->donetail) + rcp->curtail = &rcp->rcucblist; + rcp->donetail = &rcp->rcucblist; + local_irq_restore(flags); + + /* Invoke the callbacks on the local list. */ + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + } +} + +/* + * Invoke any callbacks whose grace period has completed. + */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +} + +/* + * Wait for a grace period to elapse. But it is illegal to invoke + * synchronize_sched() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_sched() is a quiescent + * state, and so on a UP system, synchronize_sched() need do nothing. + * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the + * benefits of doing might_sleep() to reduce latency.) + * + * Cool, huh? (Due to Josh Triplett.) + * + * But we want to make this a static inline later. + */ +void synchronize_sched(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +void synchronize_rcu_bh(void) +{ + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + +/* + * Helper function for call_rcu() and call_rcu_bh(). + */ +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcp->curtail = head; + rcp->curtail = &head->next; + local_irq_restore(flags); +} + +/* + * Post an RCU callback to be invoked after the end of an RCU grace + * period. But since we have but one CPU, that would be after any + * quiescent state. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Post an RCU bottom-half callback to be invoked after any subsequent + * quiescent state. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +void rcu_barrier_bh(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +void rcu_barrier_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +void __init rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 233768f21f97..a621a67ef4e3 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } +static int rcu_no_completed(void) +{ + return 0; +} + static void rcu_torture_deferred_free(struct rcu_torture *p) { call_rcu(&p->rtort_rcu, rcu_torture_cb); @@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = { .name = "rcu_sync" }; +static struct rcu_torture_ops rcu_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_expedited, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_expedited" +}; + /* * Definitions for rcu_bh torture testing. */ @@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_expedited_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize_expedited, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_expedited" +}; + /* * Definitions for sched torture testing. */ @@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx) preempt_enable(); } -static int sched_torture_completed(void) -{ - return 0; -} - static void rcu_sched_torture_deferred_free(struct rcu_torture *p) { call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); @@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, @@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops sched_ops_sync = { +static struct rcu_torture_ops sched_sync_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, @@ -606,15 +640,13 @@ static struct rcu_torture_ops sched_ops_sync = { .name = "sched_sync" }; -extern int rcu_expedited_torture_stats(char *page); - static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, @@ -650,7 +682,7 @@ rcu_torture_writer(void *arg) old_rp = rcu_torture_current; rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) @@ -1099,9 +1131,10 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &sched_expedited_ops, - &srcu_ops, &sched_ops, &sched_ops_sync, }; + { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &srcu_expedited_ops, + &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1112,8 +1145,12 @@ rcu_torture_init(void) break; } if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", + printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", torture_type); + printk(KERN_ALERT "rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + printk(KERN_ALERT " %s", torture_ops[i]->name); + printk(KERN_ALERT "\n"); mutex_unlock(&fullstop_mutex); return -EINVAL; } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 52b06f6e158c..53ae9598f798 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,30 +46,30 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/time.h> +#include <linux/kernel_stat.h> #include "rcutree.h" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - /* Data structures. */ +static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; + #define RCU_STATE_INITIALIZER(name) { \ .level = { &name.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ + NUM_RCU_LVL_3, \ + NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_SIGNAL_INIT, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .orphan_cbs_list = NULL, \ + .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_qlen = 0, \ .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -81,24 +81,18 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -extern long rcu_batches_completed_sched(void); -static struct rcu_node *rcu_get_root(struct rcu_state *rsp); -static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags); -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags); -#ifdef CONFIG_HOTPLUG_CPU -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void __rcu_process_callbacks(struct rcu_state *rsp, - struct rcu_data *rdp); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp); -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp); -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, - int preemptable); +static int rcu_scheduler_active __read_mostly; -#include "rcutree_plugin.h" + +/* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} /* * Note a quiescent state. Because we do not need to know @@ -110,7 +104,7 @@ void rcu_sched_qs(int cpu) struct rcu_data *rdp; rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; rcu_preempt_note_context_switch(cpu); @@ -121,7 +115,7 @@ void rcu_bh_qs(int cpu) struct rcu_data *rdp; rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } @@ -137,6 +131,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -173,9 +171,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - /* ACCESS_ONCE() because we are accessing outside of lock. */ - return *rdp->nxttail[RCU_DONE_TAIL] && - ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); + return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); } /* @@ -345,31 +341,12 @@ void rcu_irq_exit(void) set_need_resched(); } -/* - * Record the specified "completed" value, which is later used to validate - * dynticks counter manipulations. Specify "rsp->completed - 1" to - * unconditionally invalidate any future dynticks manipulations (which is - * useful at the beginning of a grace period). - */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ - rsp->dynticks_completed = comp; -} - #ifdef CONFIG_SMP /* - * Recall the previously recorded value of the completion for dynticks. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ - return rsp->dynticks_completed; -} - -/* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU - * is already in a quiescent state courtesy of dynticks idle mode. + * is in dynticks idle mode, which is an extended quiescent state. */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { @@ -429,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #else /* #ifdef CONFIG_NO_HZ */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ -} - #ifdef CONFIG_SMP -/* - * If there are no dynticks, then the only way that a CPU can passively - * be in a quiescent state is to be offline. Unlike dynticks idle, which - * is a point in time during the prior (already finished) grace period, - * an offline CPU is always in a quiescent state, and thus can be - * unconditionally applied. So just return the current value of completed. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ - return rsp->completed; -} - static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; @@ -475,30 +436,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) long delta; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; /* Only let one CPU complain about others per time interval. */ spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { rcu_print_task_stall(rnp); - if (rnp_cur->qsmask == 0) + if (rnp->qsmask == 0) continue; - for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) - if (rnp_cur->qsmask & (1UL << cpu)) - printk(" %d", rnp_cur->grplo + cpu); + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); @@ -537,8 +502,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rsp->gpnum != rsp->completed && - delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { /* They had two time units to dump stack, so complain. */ print_other_cpu_stall(rsp); @@ -560,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice - * that someone else started the grace period. + * that someone else started the grace period. The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + * and must have irqs disabled. */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + if (rdp->gpnum != rnp->gpnum) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->gpnum = rnp->gpnum; + } +} + static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->gpnum = rsp->gpnum; + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + local_irq_restore(flags); + return; + } + __note_new_gpnum(rsp, rnp, rdp); + spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -590,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) } /* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. In addition, the corresponding leaf rcu_node structure's + * ->lock must be held by the caller, with irqs disabled. + */ +static void +__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Did another grace period end? */ + if (rdp->completed != rnp->completed) { + + /* Advance callbacks. No harm if list empty. */ + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Remember that we saw this grace-period completion. */ + rdp->completed = rnp->completed; + } +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ + !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + local_irq_restore(flags); + return; + } + __rcu_process_gp_end(rsp, rnp, rdp); + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Do per-CPU grace-period initialization for running CPU. The caller + * must hold the lock of the leaf rcu_node structure corresponding to + * this CPU. + */ +static void +rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Prior grace period ended, so advance callbacks for current CPU. */ + __rcu_process_gp_end(rsp, rnp, rdp); + + /* + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Set state so that this CPU will detect the next quiescent state. */ + __note_new_gpnum(rsp, rnp, rdp); +} + +/* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must @@ -603,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + if (rnp->completed == rsp->completed) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + local_irq_restore(flags); return; } @@ -613,23 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - dyntick_record_completed(rsp, rsp->completed - 1); - note_new_gpnum(rsp, rdp); - - /* - * Because we are first, we know that all our callbacks will - * be covered by this upcoming grace period, even the ones - * that were registered arbitrarily recently. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rcu_start_gp_per_cpu(rsp, rnp, rdp); spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -657,70 +722,51 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * one corresponding to this CPU, due to the fact that we have * irqs disabled. */ - for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_for_each_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + rnp = rcu_get_root(rsp); + spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ spin_unlock_irqrestore(&rsp->onofflock, flags); } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ - long completed_snap; - unsigned long flags; - - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */ - - /* Did another grace period end? */ - if (rdp->completed != completed_snap) { - - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - - /* Remember that we saw this grace-period completion. */ - rdp->completed = completed_snap; - } - local_irq_restore(flags); -} - -/* - * Clean up after the prior grace period and let rcu_start_gp() start up - * the next grace period if one is needed. Note that the caller must - * hold rnp->lock, as required by rcu_start_gp(), which will release it. + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it. */ -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) - __releases(rnp->lock) +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) { - WARN_ON_ONCE(rsp->completed == rsp->gpnum); + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; - rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } /* - * Similar to cpu_quiet(), for which it is a helper function. Allows - * a group of CPUs to be quieted at one go, though all the CPUs in the - * group must be represented by the same leaf rcu_node structure. - * That structure's lock must be held upon entry, and it is released - * before return. + * Similar to rcu_report_qs_rdp(), for which it is a helper function. + * Allows quiescent states for a group of CPUs to be reported at one go + * to the specified rcu_node structure, though all the CPUs in the group + * must be represented by the same rcu_node structure (which need not be + * a leaf rcu_node structure, though it often will be). That structure's + * lock must be held upon entry, and it is released before return. */ static void -cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long flags) +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { struct rcu_node *rnp_c; @@ -756,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Invoke cpu_quiet_msk_finish() + * state for this grace period. Invoke rcu_report_qs_rsp() * to clean up and start the next grace period if one is needed. */ - cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ + rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ } /* - * Record a quiescent state for the specified CPU, which must either be - * the current CPU. The lastcomp argument is used to make sure we are - * still in the grace period of interest. We don't want to end the current - * grace period based on quiescent states detected in an earlier grace - * period! + * Record a quiescent state for the specified CPU to that CPU's rcu_data + * structure. This must be either called from the specified CPU, or + * called when the specified CPU is known to be offline (and when it is + * also known that no other CPU is concurrently trying to help the offline + * CPU). The lastcomp argument is used to make sure we are still in the + * grace period of interest. We don't want to end the current grace period + * based on quiescent states detected in an earlier grace period! */ static void -cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) { unsigned long flags; unsigned long mask; @@ -778,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) rnp = rdp->mynode; spin_lock_irqsave(&rnp->lock, flags); - if (lastcomp != ACCESS_ONCE(rsp->completed)) { + if (lastcomp != rnp->completed) { /* * Someone beat us to it for this grace period, so leave. * The race with GP start is resolved by the fact that we * hold the leaf rcu_node lock, so that the per-CPU bits * cannot yet be initialized -- so we would simply find our - * CPU's bit already cleared in cpu_quiet_msk() if this race - * occurred. + * CPU's bit already cleared in rcu_report_qs_rnp() if this + * race occurred. */ rdp->passed_quiesc = 0; /* try again later! */ spin_unlock_irqrestore(&rnp->lock, flags); @@ -804,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) */ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } } @@ -835,24 +883,73 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) if (!rdp->passed_quiesc) return; - /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ - cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); + /* + * Tell RCU we are done (but rcu_report_qs_rdp() will be the + * judge of that). + */ + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); } #ifdef CONFIG_HOTPLUG_CPU /* + * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the + * specified flavor of RCU. The callbacks will be adopted by the next + * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever + * comes first. Because this is invoked from the CPU_DYING notifier, + * irqs are already disabled. + */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + *rsp->orphan_cbs_tail = rdp->nxtlist; + rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rsp->orphan_qlen += rdp->qlen; + rdp->qlen = 0; + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +} + +/* + * Adopt previously orphaned RCU callbacks. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + spin_lock_irqsave(&rsp->onofflock, flags); + rdp = rsp->rda[smp_processor_id()]; + if (rsp->orphan_cbs_list == NULL) { + spin_unlock_irqrestore(&rsp->onofflock, flags); + return; + } + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; + rdp->qlen += rsp->orphan_qlen; + rsp->orphan_cbs_list = NULL; + rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; + rsp->orphan_qlen = 0; + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { - int i; unsigned long flags; - long lastcomp; unsigned long mask; + int need_report = 0; struct rcu_data *rdp = rsp->rda[cpu]; - struct rcu_data *rdp_me; struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -865,42 +962,34 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (rnp != rdp->mynode) + spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; } while (rnp != NULL); - lastcomp = rsp->completed; - - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ /* - * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiscent, so it is now - * (uncharacteristically) safe to access its rcu_data structure. - * Note also that we must carefully retain the order of the - * outgoing CPU's callbacks in order for rcu_barrier() to work - * correctly. Finally, note that we start all the callbacks - * afresh, even those that have passed through a grace period - * and are therefore ready to invoke. The theory is that hotplug - * events are rare, and that if they are frequent enough to - * indefinitely delay callbacks, you have far worse things to - * be worrying about. + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. */ - rdp_me = rsp->rda[smp_processor_id()]; - if (rdp->nxtlist != NULL) { - *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp_me->qlen += rdp->qlen; - rdp->qlen = 0; - } - local_irq_restore(flags); + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp); + + rcu_adopt_orphan_cbs(rsp); } /* @@ -918,6 +1007,14 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ +} + +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_offline_cpu(int cpu) { } @@ -928,7 +1025,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -981,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1050,33 +1154,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int cpu; unsigned long flags; unsigned long mask; - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + struct rcu_node *rnp; - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp_cur->lock, flags); - if (rsp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_lock_irqsave(&rnp->lock, flags); + if (rnp->completed != lastcomp) { + spin_unlock_irqrestore(&rnp->lock, flags); return 1; } - if (rnp_cur->qsmask == 0) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + if (rnp->qsmask == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); continue; } - cpu = rnp_cur->grplo; + cpu = rnp->grplo; bit = 1; - for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { - if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } - if (mask != 0 && rsp->completed == lastcomp) { + if (mask != 0 && rnp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp_cur->lock. */ - cpu_quiet_msk(mask, rsp, rnp_cur, flags); + /* rcu_report_qs_rnp() releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1091,8 +1194,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) long lastcomp; struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; + u8 forcenow; - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ @@ -1103,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_ret; /* no emergency and done recently. */ rsp->n_force_qs++; spin_lock(&rnp->lock); - lastcomp = rsp->completed; + lastcomp = rsp->gpnum - 1; signaled = rsp->signaled; rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if (lastcomp == rsp->gpnum) { + if(!rcu_gp_in_progress(rsp)) { rsp->n_force_qs_ngp++; spin_unlock(&rnp->lock); goto unlock_ret; /* no GP in progress, time updated. */ } spin_unlock(&rnp->lock); switch (signaled) { + case RCU_GP_IDLE: case RCU_GP_INIT: - break; /* grace period still initializing, ignore. */ + break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: @@ -1126,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) if (rcu_process_dyntick(rsp, lastcomp, dyntick_save_progress_counter)) goto unlock_ret; + /* fall into next case. */ + + case RCU_SAVE_COMPLETED: /* Update state, record completion counter. */ + forcenow = 0; spin_lock(&rnp->lock); - if (lastcomp == rsp->completed) { + if (lastcomp + 1 == rsp->gpnum && + lastcomp == rsp->completed && + rsp->signaled == signaled) { rsp->signaled = RCU_FORCE_QS; - dyntick_record_completed(rsp, lastcomp); + rsp->completed_fqs = lastcomp; + forcenow = signaled == RCU_SAVE_COMPLETED; } spin_unlock(&rnp->lock); - break; + if (!forcenow) + break; + /* fall into next case. */ case RCU_FORCE_QS: /* Check dyntick-idle state, send IPI to laggarts. */ - if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), + if (rcu_process_dyntick(rsp, rsp->completed_fqs, rcu_implicit_dynticks_qs)) goto unlock_ret; @@ -1195,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1251,7 +1365,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Start a new grace period if one not already started. */ - if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + if (!rcu_gp_in_progress(rsp)) { unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -1259,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1286,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1295,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { + struct rcu_node *rnp = rdp->mynode; + rdp->n_rcu_pending++; /* Check for CPU stalls, if enabled. */ @@ -1319,19 +1507,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ rdp->n_rp_gp_completed++; return 1; } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + if (rcu_gp_in_progress(rsp) && ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { rdp->n_rp_need_fqs++; return 1; @@ -1369,6 +1557,97 @@ int rcu_needs_cpu(int cpu) } /* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; + +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. + */ + atomic_set(&rcu_barrier_cpu_count, 1); + preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ + rcu_adopt_orphan_cbs(rsp); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); +} + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(&rcu_bh_state, call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(&rcu_sched_state, call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +/* * Do boot-time initialization of a CPU's per-CPU RCU data. */ static void __init @@ -1403,21 +1682,18 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; - long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ spin_lock_irqsave(&rnp->lock, flags); - lastcomp = rsp->completed; - rdp->completed = lastcomp; - rdp->gpnum = lastcomp; rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; - rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1437,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit |= mask; mask = rnp->grpmask; + if (rnp == rdp->mynode) { + rdp->gpnum = rnp->completed; /* if GP in progress... */ + rdp->completed = rnp->completed; + rdp->passed_quiesc_completed = rnp->completed - 1; + } spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); @@ -1454,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1464,6 +1745,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * preempt_disable() in _rcu_barrier() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(). + * The dying CPU clears its cpu_online_mask bit and + * moves all of its RCU callbacks to ->orphan_cbs_list + * in the context of stop_machine(), so subsequent calls + * to _rcu_barrier() will adopt these callbacks and only + * then queue rcu_barrier_func() on all remaining CPUs. + */ + rcu_send_cbs_to_orphanage(&rcu_bh_state); + rcu_send_cbs_to_orphanage(&rcu_sched_state); + rcu_preempt_send_cbs_to_orphanage(); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -1527,6 +1824,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { spin_lock_init(&rnp->lock); + lockdep_set_class(&rnp->lock, &rcu_node_class[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1547,6 +1845,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blocked_tasks[0]); INIT_LIST_HEAD(&rnp->blocked_tasks[1]); + INIT_LIST_HEAD(&rnp->blocked_tasks[2]); + INIT_LIST_HEAD(&rnp->blocked_tasks[3]); } } } @@ -1558,6 +1858,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + int i; \ + int j; \ + struct rcu_node *rnp; \ + \ rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ @@ -1570,41 +1874,30 @@ do { \ } \ } while (0) -#ifdef CONFIG_TREE_PREEMPT_RCU - -void __init __rcu_init_preempt(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init_preempt(void) +void __init rcu_init(void) { -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - -void __init __rcu_init(void) -{ - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; + int i; rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); +#endif /* #if NUM_RCU_LVL_4 != 0 */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + cpu_notifier(rcu_cpu_notify, 0); + for_each_online_cpu(i) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); } -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +#include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8e8287a983c2..d2a0046f63b2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -34,10 +34,11 @@ * In practice, this has not been tested, so there is probably some * bug somewhere. */ -#define MAX_RCU_LVLS 3 +#define MAX_RCU_LVLS 4 #define RCU_FANOUT (CONFIG_RCU_FANOUT) #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) +#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT # define NUM_RCU_LVLS 1 @@ -45,23 +46,33 @@ # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_SQ # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_CUBE # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_3 NR_CPUS +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_FOURTH +# define NUM_RCU_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) +# define NUM_RCU_LVL_4 NR_CPUS #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) /* @@ -79,24 +90,67 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; + spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + long completed; /* Last grace period completed for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ + unsigned long expmask; /* Groups that have ->blocked_tasks[] */ + /* elements that need to drain to allow the */ + /* current expedited grace period to */ + /* complete (only for TREE_PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initialization for qsmask. */ + /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; - struct list_head blocked_tasks[2]; + struct list_head blocked_tasks[4]; /* Tasks blocked in RCU read-side critsect. */ + /* Grace period number (->gpnum) x blocked */ + /* by tasks on the (x & 0x1) element of the */ + /* blocked_tasks[] array. */ } ____cacheline_internodealigned_in_smp; +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -126,23 +180,30 @@ struct rcu_data { * Any of the partitions might be empty, in which case the * pointer to that partition will be equal to the pointer for * the following partition. When the list is empty, all of - * the nxttail elements point to nxtlist, which is NULL. + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. * - * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP * [nxtlist, *nxttail[RCU_DONE_TAIL]): * Entries that batch # <= ->completed * The grace period for these entries has completed, and * the other grace-period-completed entries may be moved * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ @@ -173,13 +234,15 @@ struct rcu_data { }; /* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ +#define RCU_FORCE_QS 4 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS +#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ @@ -216,10 +279,23 @@ struct rcu_state { /* Force QS state. */ long gpnum; /* Current gp number. */ long completed; /* # of last completed gp. */ + + /* End of fields guarded by root rcu_node's lock. */ + spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. */ + /* starting new GP. Also */ + /* protects the following */ + /* orphan_cbs fields. */ + struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ + /* orphaned by all CPUs in */ + /* a given leaf rcu_node */ + /* going offline. */ + struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ + long orphan_qlen; /* Number of orphaned cbs. */ spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ + long completed_fqs; /* Value of completed @ snap. */ + /* Protected by fqslock. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -234,11 +310,15 @@ struct rcu_state { unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#ifdef CONFIG_NO_HZ - long dynticks_completed; /* Value of completed @ snap. */ -#endif /* #ifdef CONFIG_NO_HZ */ }; +/* Return values for rcu_preempt_offline_tasks(). */ + +#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ + /* GP were moved to root. */ +#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ + /* GP were moved to root. */ + #ifdef RCU_TREE_NONCORE /* @@ -255,5 +335,37 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#endif /* #ifdef RCU_TREE_NONCORE */ +#else /* #ifdef RCU_TREE_NONCORE */ + +/* Forward declarations for rcutree_plugin.h */ +static void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, + unsigned long flags); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_task_stall(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void rcu_preempt_send_cbs_to_orphanage(void); +static void __init __rcu_init_preempt(void); +#endif /* #else #ifdef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 1cee04f627eb..37fbccdf41d5 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -24,16 +24,19 @@ * Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/delay.h> #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static int rcu_preempted_readers_exp(struct rcu_node *rnp); + /* * Tell them what RCU they are running. */ -static inline void rcu_bootup_announce(void) +static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Experimental preemptable hierarchical RCU implementation.\n"); @@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } @@ -150,11 +153,65 @@ void __rcu_read_lock(void) } EXPORT_SYMBOL_GPL(__rcu_read_lock); +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + int phase = rnp->gpnum & 0x1; + + return !list_empty(&rnp->blocked_tasks[phase]) || + !list_empty(&rnp->blocked_tasks[phase + 2]); +} + +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(&rcu_preempt_state, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + spin_lock(&rnp_p->lock); /* irqs already disabled. */ + rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ static void rcu_read_unlock_special(struct task_struct *t) { int empty; + int empty_exp; unsigned long flags; - unsigned long mask; struct rcu_node *rnp; int special; @@ -196,37 +253,31 @@ static void rcu_read_unlock_special(struct task_struct *t) break; spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + empty = !rcu_preempted_readers(rnp); + empty_exp = !rcu_preempted_readers_exp(rnp); + smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() - * drop rnp->lock and restore irq. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ - if (!empty && rnp->qsmask == 0 && - list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { - struct rcu_node *rnp_p; - - if (rnp->parent == NULL) { - /* Only one rcu_node in the tree. */ - cpu_quiet_msk_finish(&rcu_preempt_state, flags); - return; - } - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; + if (empty) spin_unlock_irqrestore(&rnp->lock, flags); - rnp_p = rnp->parent; - spin_lock_irqsave(&rnp_p->lock, flags); - WARN_ON_ONCE(rnp->qsmask); - cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); - return; - } - spin_unlock(&rnp->lock); + else + rcu_report_unblock_qs_rnp(rnp, flags); + + /* + * If this was the last task on the expedited lists, + * then we need to report up the rcu_node hierarchy. + */ + if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + rcu_report_exp_rnp(&rcu_preempt_state, rnp); + } else { + local_irq_restore(flags); } - local_irq_restore(flags); } /* @@ -257,12 +308,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { unsigned long flags; struct list_head *lp; - int phase = rnp->gpnum & 0x1; + int phase; struct task_struct *t; - if (!list_empty(&rnp->blocked_tasks[phase])) { + if (rcu_preempted_readers(rnp)) { spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; /* re-read under lock. */ + phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); @@ -281,20 +332,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])); + WARN_ON_ONCE(rcu_preempted_readers(rnp)); WARN_ON_ONCE(rnp->qsmask); } -/* - * Check for preempted RCU readers for the specified rcu_node structure. - * If the caller needs a reliable answer, it must hold the rcu_node's - * >lock. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); -} - #ifdef CONFIG_HOTPLUG_CPU /* @@ -303,26 +344,34 @@ static int rcu_preempted_readers(struct rcu_node *rnp) * rcu_node. The reason for not just moving them to the immediate * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. + * Returns true if there were tasks blocking the current RCU grace + * period. + * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. */ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]))); + !list_empty(&rnp->blocked_tasks[1]) || + !list_empty(&rnp->blocked_tasks[2]) || + !list_empty(&rnp->blocked_tasks[3]))); /* * Move tasks up to root rcu_node. Rely on the fact that the @@ -330,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, * rcu_nodes in terms of gp_num value. This fact allows us to * move the blocked_tasks[] array directly, element by element. */ - for (i = 0; i < 2; i++) { + if (rcu_preempted_readers(rnp)) + retval |= RCU_OFL_TASKS_NORM_GP; + if (rcu_preempted_readers_exp(rnp)) + retval |= RCU_OFL_TASKS_EXP_GP; + for (i = 0; i < 4; i++) { lp = &rnp->blocked_tasks[i]; lp_root = &rnp_root->blocked_tasks[i]; while (!list_empty(lp)) { @@ -342,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + return retval; } /* @@ -392,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + + if (!rcu_scheduler_active) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[2]) || + !list_empty(&rnp->blocked_tasks[3]); +} + +/* + * return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + ACCESS_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + unsigned long mask; + + spin_lock_irqsave(&rnp->lock, flags); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) + break; + if (rnp->parent == NULL) { + wake_up(&sync_rcu_preempt_exp_wq); + break; + } + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + spin_lock(&rnp->lock); /* irqs already disabled */ + rnp->expmask &= ~mask; + } + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure. If there are no such + * tasks, report it up the rcu_node hierarchy. + * + * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + */ +static void +sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int must_wait; + + spin_lock(&rnp->lock); /* irqs already disabled */ + list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); + list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); + must_wait = rcu_preempted_readers_exp(rnp); + spin_unlock(&rnp->lock); /* irqs remain disabled */ + if (!must_wait) + rcu_report_exp_rnp(rsp, rnp); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blocked_tasks[] lists, move all entries from the first set of + * ->blocked_tasks[] lists to the second set, and finally wait for this + * second set to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_preempt_state; + long snap; + int trycount = 0; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; + smp_mb(); /* Above access cannot bleed into critical section. */ + + /* + * Acquire lock, falling back to synchronize_rcu() if too many + * lock-acquisition failures. Of course, if someone does the + * expedited grace period for us, just leave. + */ + while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_rcu(); + return; + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto mb_ret; /* Others did our work for us. */ + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto unlock_mb_ret; /* Others did our work for us. */ + + /* force all RCU readers onto blocked_tasks[]. */ + synchronize_sched_expedited(); + + spin_lock_irqsave(&rsp->onofflock, flags); + + /* Initialize ->expmask for all non-leaf rcu_node structures. */ + rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->expmask = rnp->qsmaskinit; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + /* Snapshot current state of ->blocked_tasks[] lists. */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init(rsp, rnp); + if (NUM_RCU_NODES > 1) + sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + + spin_unlock_irqrestore(&rsp->onofflock, flags); + + /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + rnp = rcu_get_root(rsp); + wait_event(sync_rcu_preempt_exp_wq, + sync_rcu_preempt_exp_done(rnp)); + + /* Clean up and exit. */ + smp_mb(); /* ensure expedited GP seen before counter increment. */ + ACCESS_ONCE(sync_rcu_preempt_exp_count)++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); +mb_ret: + smp_mb(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + /* * Check to see if there is any immediate preemptable-RCU-related work * to be done. @@ -410,6 +644,15 @@ static int rcu_preempt_needs_cpu(int cpu) return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state, call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Initialize preemptable RCU's per-CPU data. */ @@ -419,6 +662,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* + * Move preemptable RCU's callbacks to ->orphan_cbs_list. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ + rcu_send_cbs_to_orphanage(&rcu_preempt_state); +} + +/* + * Initialize preemptable RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); +} + +/* * Check for a task exiting while in a preemptable-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep @@ -439,7 +698,7 @@ void exit_rcu(void) /* * Tell them what RCU they are running. */ -static inline void rcu_bootup_announce(void) +static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Hierarchical RCU implementation.\n"); } @@ -461,6 +720,25 @@ static void rcu_preempt_note_context_switch(int cpu) { } +/* + * Because preemptable RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempted_readers(struct rcu_node *rnp) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Because preemptible RCU does not exist, no quieting of tasks. */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +{ + spin_unlock_irqrestore(&rnp->lock, flags); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR /* @@ -483,25 +761,19 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -/* - * Because preemptable RCU does not exist, there are never any preempted - * RCU readers. - */ -static int rcu_preempted_readers(struct rcu_node *rnp) -{ - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* @@ -518,7 +790,7 @@ static void rcu_preempt_offline_cpu(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to check. */ -void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(int cpu) { } @@ -526,7 +798,7 @@ void rcu_preempt_check_callbacks(int cpu) * Because preemptable RCU does not exist, it never has any callbacks * to process. */ -void rcu_preempt_process_callbacks(void) +static void rcu_preempt_process_callbacks(void) { } @@ -540,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptable RCU does not exist, there is never any need to + * report on tasks preempted in RCU read-side critical sections during + * expedited RCU grace periods. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + return; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* * Because preemptable RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) @@ -556,6 +852,16 @@ static int rcu_preempt_needs_cpu(int cpu) } /* + * Because preemptable RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* * Because preemptable RCU does not exist, there is no per-CPU * data to initialize. */ @@ -563,4 +869,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { } +/* + * Because there is no preemptable RCU, there are no callbacks to move. + */ +static void rcu_preempt_send_cbs_to_orphanage(void) +{ +} + +/* + * Because preemptable RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c89f5e9fd173..9d2c88423b31 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata, NULL); } -static struct file_operations rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, .open = rcudata_open, .read = seq_read, @@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file) return single_open(file, show_rcudata_csv, NULL); } -static struct file_operations rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, .open = rcudata_csv_open, .read = seq_read, @@ -155,24 +155,32 @@ static struct file_operations rcudata_csv_fops = { static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { + long gpnum; int level = 0; + int phase; struct rcu_node *rnp; + gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, rsp->gpnum, rsp->signaled, + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", + rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->orphan_qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %d:%d ^%d ", + phase = gpnum & 0x1; + seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", rnp->qsmask, rnp->qsmaskinit, + "T."[list_empty(&rnp->blocked_tasks[phase])], + "E."[list_empty(&rnp->blocked_tasks[phase + 2])], + "T."[list_empty(&rnp->blocked_tasks[!phase])], + "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], rnp->grplo, rnp->grphi, rnp->grpnum); } seq_puts(m, "\n"); @@ -196,7 +204,7 @@ static int rcuhier_open(struct inode *inode, struct file *file) return single_open(file, show_rcuhier, NULL); } -static struct file_operations rcuhier_fops = { +static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, @@ -222,7 +230,7 @@ static int rcugp_open(struct inode *inode, struct file *file) return single_open(file, show_rcugp, NULL); } -static struct file_operations rcugp_fops = { +static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, @@ -276,7 +284,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file) return single_open(file, show_rcu_pending, NULL); } -static struct file_operations rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, .open = rcu_pending_open, .read = seq_read, diff --git a/kernel/relay.c b/kernel/relay.c index bc188549788f..760c26209a3c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* * vm_ops for relay file mappings. */ -static struct vm_operations_struct relay_file_mmap_ops = { +static const struct vm_operations_struct relay_file_mmap_ops = { .fault = relay_buf_fault, .close = relay_file_mmap_close, }; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f074314..bcdabf37c40b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; }; BUG(); diff --git a/kernel/sched.c b/kernel/sched.c index 2f76e06bea58..aa31244caa9f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ @@ -534,14 +535,12 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -590,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock @@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -780,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) return single_open(filp, sched_feat_show, NULL); } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, @@ -1563,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1577,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1617,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, static int tg_shares_up(struct task_group *tg, void *data) { unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1626,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; /* * If there are currently no tasks on the cpu pretend there @@ -1651,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -1995,6 +1993,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -2007,7 +2038,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -2048,7 +2079,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2085,6 +2115,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2311,7 +2342,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; + struct rq *rq, *orig_rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2319,7 +2350,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2346,10 +2377,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, task_rq_unlock(rq, &flags); cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); set_task_cpu(p, cpu); - + local_irq_restore(flags); + } rq = task_rq_lock(p, &flags); + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2406,6 +2442,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -2511,26 +2558,22 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + unsigned long flags; __sched_fork(p); /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; - - if (p->normal_prio < DEFAULT_PRIO) - p->prio = DEFAULT_PRIO; + p->normal_prio = p->static_prio; + } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; set_load_weight(p); } @@ -2541,13 +2584,21 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); #endif + local_irq_save(flags); + update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); + local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2581,8 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - p->prio = effective_prio(p); - if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { @@ -2816,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -2986,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq) } /* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - -/* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). */ @@ -3658,6 +3698,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu @@ -4093,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4256,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4396,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4410,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -5013,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* @@ -5129,60 +5182,86 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->utime; + *ut = p->utime; + *st = p->stime; } -cputime_t task_stime(struct task_struct *p) +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->stime; + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; } #else -cputime_t task_utime(struct task_struct *p) + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - temp *= utime; + u64 temp; + + temp = (u64)(rtime * utime); do_div(temp, total); - } - utime = (clock_t)temp; + utime = (cputime_t)temp; + } else + utime = rtime; - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); + + *ut = p->prev_utime; + *st = p->prev_stime; } -cputime_t task_stime(struct task_struct *p) +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t stime; + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + thread_group_cputime(p, &cputime); - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - return p->prev_stime; -} -#endif + if (total) { + u64 temp; -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; + temp = (u64)(rtime * cputime.utime); + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; } +#endif /* * This function gets called by the timer code, with HZ frequency. @@ -5448,7 +5527,7 @@ need_resched_nonpreemptible: } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. @@ -6142,22 +6221,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } @@ -6720,9 +6791,6 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { @@ -6905,7 +6973,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } @@ -7710,6 +7778,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7796,6 +7874,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; @@ -7875,6 +7956,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); @@ -8015,6 +8098,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -8851,7 +8935,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8873,6 +8957,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -8884,12 +8993,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8939,19 +9048,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8959,8 +9068,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -8979,40 +9087,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; @@ -9334,10 +9442,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -9406,6 +9510,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; @@ -9488,6 +9596,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif @@ -9531,13 +9641,15 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); @@ -10312,7 +10424,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10323,7 +10435,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10377,8 +10489,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10388,15 +10499,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if (retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10838,6 +10979,7 @@ void synchronize_sched_expedited(void) spin_unlock_irqrestore(&rq->lock, flags); } rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + synchronize_sched_expedited_count++; mutex_unlock(&rcu_sched_expedited_mutex); put_online_cpus(); if (need_full_sync) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ac2e1dc708bd..479ce5682d7c 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -127,7 +127,7 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) + if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) goto again; return clock; @@ -163,7 +163,7 @@ again: val = remote_clock; } - if (cmpxchg(ptr, old_val, val) != old_val) + if (cmpxchg64(ptr, old_val, val) != old_val) goto again; return val; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index efb84409bc43..6988cf08f705 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P(yld_count); P(sched_switch); P(sched_count); P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif P(ttwu_count); P(ttwu_local); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ecc637a0d591..f61837ad336d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1319,6 +1345,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + + return target; +} + +/* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. @@ -1372,11 +1429,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { + int target = -1; - affine_sd = tmp; - want_affine = 0; + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + target = cpu; + + /* + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + */ + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } + cpu = target; + } } if (!want_sd && !want_affine) @@ -1568,6 +1649,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; update_curr(cfs_rq); @@ -1582,18 +1664,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1639,8 +1710,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1649,21 +1734,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a4d790cddb19..5c5fef378415 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, - const struct cpumask *mask) -{ - int first; - - /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) - return this_cpu; - - first = cpumask_first(mask); - if (first < nr_cpu_ids) - return first; - - return -1; -} - static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task) * Otherwise, we consult the sched_domains span maps to figure * out which cpu is logically closest to our hot cache data. */ - if (this_cpu == cpu) - this_cpu = -1; /* Skip this_cpu opt if the same */ - - if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - cpumask_and(domain_mask, - sched_domain_span(sd), - lowest_mask); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - best_cpu = pick_optimal_cpu(this_cpu, - domain_mask); - - if (best_cpu != -1) { - free_cpumask_var(domain_mask); - return best_cpu; - } - } + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + return this_cpu; + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) + return best_cpu; } - free_cpumask_var(domain_mask); } /* @@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task) * just give the caller *something* to work with from the compatible * locations. */ - return pick_optimal_cpu(this_cpu, lowest_mask); + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; } /* Will lock the rq it finds */ diff --git a/kernel/signal.c b/kernel/signal.c index 64c5deeaca5d..6b982f2cf524 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,12 +22,14 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> +#include <linux/ratelimit.h> #include <linux/tracehook.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> -#include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/signal.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -41,6 +43,8 @@ static struct kmem_cache *sigqueue_cachep; +int print_fatal_signals __read_mostly; + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; @@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; - + s = pending->signal.sig; m = mask->sig; switch (_NSIG_WORDS) { @@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask) sig = ffz(~x) + 1; break; } - + return sig; } +static inline void print_dropped_signal(int sig) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!print_fatal_signals) + return; + + if (!__ratelimit(&ratelimit_state)) + return; + + printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + current->comm, current->pid, sig); +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appopriate lock must be held to stop the target task from exiting */ -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - int override_rlimit) +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; struct user_struct *user; @@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, */ user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } + if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); free_uid(user); @@ -705,7 +728,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from finish_stop() + * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -834,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue *q; int override_rlimit; - trace_sched_signal_send(sig, t); + trace_signal_generate(sig, info, t); assert_spin_locked(&t->sighand->siglock); @@ -869,7 +892,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, + q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); @@ -896,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) - /* - * Queue overflow, abort. We may abort if the signal was rt - * and sent by user using something other than kill(). - */ + if (sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + trace_signal_overflow_fail(sig, group, info); return -EAGAIN; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. + */ + trace_signal_lose_info(sig, group, info); + } } out_set: @@ -925,8 +957,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, return __send_signal(sig, info, t, group, from_ancestor_ns); } -int print_fatal_signals; - static void print_fatal_signal(struct pt_regs *regs, int signr) { printk("%s/%d: potentially unexpected fatal signal %d.\n", @@ -971,6 +1001,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } +int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + bool group) +{ + unsigned long flags; + int ret = -ESRCH; + + if (lock_task_sighand(p, &flags)) { + ret = send_signal(sig, info, p, group); + unlock_task_sighand(p, &flags); + } + + return ret; +} + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1036,12 +1080,6 @@ void zap_other_threads(struct task_struct *p) } } -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1068,18 +1106,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - unsigned long flags; - int ret; - - ret = check_kill_permission(sig, info, p); + int ret = check_kill_permission(sig, info, p); - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } + if (!ret && sig) + ret = do_send_sig_info(sig, info, p, true); return ret; } @@ -1224,15 +1254,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -/* - * The caller must ensure the task can't exit. - */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret; - unsigned long flags; - /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1240,10 +1264,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; + return do_send_sig_info(sig, info, p, false); } #define __si_special(priv) \ @@ -1302,19 +1323,19 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of Posix Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. */ - struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q; + struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); - if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) + if (q) q->flags |= SIGQUEUE_PREALLOC; - return(q); + + return q; } void sigqueue_free(struct sigqueue *q) @@ -1383,15 +1404,6 @@ ret: } /* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * @@ -1673,29 +1685,6 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1705,15 +1694,9 @@ finish_stop(int stop_count) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int stop_count; + int notify; - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { + if (!sig->group_stop_count) { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1725,7 +1708,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - stop_count = 0; + sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1734,19 +1717,44 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - stop_count++; + sig->group_stop_count++; signal_wake_up(t, 0); } - sig->group_stop_count = stop_count; } + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, report + * to the parent. When ptraced, every thread reports itself. + */ + notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; + notify = tracehook_notify_jctl(notify, CLD_STOPPED); + /* + * tracehook_notify_jctl() can drop and reacquire siglock, so + * we keep ->group_stop_count != 0 before the call. If SIGCONT + * or SIGKILL comes in between ->group_stop_count == 0. + */ + if (sig->group_stop_count) { + if (!--sig->group_stop_count) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + do { + schedule(); + } while (try_to_freeze()); + + tracehook_finish_jctl(); + current->exit_code = 0; - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); return 1; } @@ -1815,14 +1823,15 @@ relock: int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; + why = tracehook_notify_jctl(why, CLD_CONTINUED); + spin_unlock_irq(&sighand->siglock); - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); + if (why) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + } goto relock; } @@ -1860,6 +1869,9 @@ relock: ka = &sighand->action[signr-1]; } + /* Trace actually delivered signals. */ + trace_signal_deliver(signr, info, ka); + if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1987,14 +1999,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; + group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { + if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); + do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } } @@ -2290,7 +2302,6 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; - unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2300,14 +2311,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. + */ + if (unlikely(error == -ESRCH)) + error = 0; } } rcu_read_unlock(); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c new file mode 100644 index 000000000000..e45c43645298 --- /dev/null +++ b/kernel/slow-work-debugfs.c @@ -0,0 +1,227 @@ +/* Slow work debugging + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/slow-work.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/seq_file.h> +#include "slow-work.h" + +#define ITERATOR_SHIFT (BITS_PER_LONG - 4) +#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) +#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) + +void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) +{ + seq_puts(m, "Slow-work: New thread"); +} + +/* + * Render the time mark field on a work item into a 5-char time with units plus + * a space + */ +static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) +{ + struct timespec now, diff; + + now = CURRENT_TIME; + diff = timespec_sub(now, work->mark); + + if (diff.tv_sec < 0) + seq_puts(m, " -ve "); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) + seq_printf(m, "%3luns ", diff.tv_nsec); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) + seq_printf(m, "%3luus ", diff.tv_nsec / 1000); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) + seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); + else if (diff.tv_sec <= 1) + seq_puts(m, " 1s "); + else if (diff.tv_sec < 60) + seq_printf(m, "%4lus ", diff.tv_sec); + else if (diff.tv_sec < 60 * 60) + seq_printf(m, "%4lum ", diff.tv_sec / 60); + else if (diff.tv_sec < 60 * 60 * 24) + seq_printf(m, "%4luh ", diff.tv_sec / 3600); + else + seq_puts(m, "exces "); +} + +/* + * Describe a slow work item for debugfs + */ +static int slow_work_runqueue_show(struct seq_file *m, void *v) +{ + struct slow_work *work; + struct list_head *p = v; + unsigned long id; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); + return 0; + case 2: + seq_puts(m, "=== ===== ================ == ===== ==========\n"); + return 0; + + case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: + id = (unsigned long) v - 3; + + read_lock(&slow_work_execs_lock); + work = slow_work_execs[id]; + if (work) { + smp_read_barrier_depends(); + + seq_printf(m, "%3lu %5d %16p %2lx ", + id, slow_work_pids[id], work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + } + read_unlock(&slow_work_execs_lock); + return 0; + + default: + work = list_entry(p, struct slow_work, link); + seq_printf(m, "%3s - %16p %2lx ", + work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", + work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + return 0; + } +} + +/* + * map the iterator to a work item + */ +static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) +{ + struct list_head *p; + unsigned long count, id; + + switch (*_pos >> ITERATOR_SHIFT) { + case 0x0: + if (*_pos == 0) + *_pos = 1; + if (*_pos < 3) + return (void *)(unsigned long) *_pos; + if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) + for (id = *_pos - 3; + id < SLOW_WORK_THREAD_LIMIT; + id++, (*_pos)++) + if (slow_work_execs[id]) + return (void *)(unsigned long) *_pos; + *_pos = 0x1UL << ITERATOR_SHIFT; + + case 0x1: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &slow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + + case 0x2: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &vslow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) +{ + spin_lock_irq(&slow_work_queue_lock); + return slow_work_runqueue_index(m, _pos); +} + +/* + * move to the next line + */ +static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) +{ + struct list_head *p = v; + unsigned long selector = *_pos >> ITERATOR_SHIFT; + + (*_pos)++; + switch (selector) { + case 0x0: + return slow_work_runqueue_index(m, _pos); + + case 0x1: + if (*_pos >> ITERATOR_SHIFT == 0x1) { + p = p->next; + if (p != &slow_work_queue) + return p; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + p = &vslow_work_queue; + + case 0x2: + if (*_pos >> ITERATOR_SHIFT == 0x2) { + p = p->next; + if (p != &vslow_work_queue) + return p; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * clean up after reading + */ +static void slow_work_runqueue_stop(struct seq_file *m, void *v) +{ + spin_unlock_irq(&slow_work_queue_lock); +} + +static const struct seq_operations slow_work_runqueue_ops = { + .start = slow_work_runqueue_start, + .stop = slow_work_runqueue_stop, + .next = slow_work_runqueue_next, + .show = slow_work_runqueue_show, +}; + +/* + * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents + */ +static int slow_work_runqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slow_work_runqueue_ops); +} + +const struct file_operations slow_work_runqueue_fops = { + .owner = THIS_MODULE, + .open = slow_work_runqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519557d3..00889bd3c590 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,20 +16,17 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/wait.h> - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ +#include <linux/debugfs.h> +#include "slow-work.h" static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -46,7 +43,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process #ifdef CONFIG_SYSCTL static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = 255; +static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; static const int slow_work_min_vslow = 1; static const int slow_work_max_vslow = 99; @@ -98,6 +95,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); static struct slow_work slow_work_new_thread; /* new thread starter */ /* + * slow work ID allocation (use slow_work_queue_lock) + */ +static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + +/* + * Unregistration tracking to prevent put_ref() from disappearing during module + * unload + */ +#ifdef CONFIG_MODULES +static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; +static struct module *slow_work_unreg_module; +static struct slow_work *slow_work_unreg_work_item; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); +static DEFINE_MUTEX(slow_work_unreg_sync_lock); + +static void slow_work_set_thread_processing(int id, struct slow_work *work) +{ + if (work) + slow_work_thread_processing[id] = work->owner; +} +static void slow_work_done_thread_processing(int id, struct slow_work *work) +{ + struct module *module = slow_work_thread_processing[id]; + + slow_work_thread_processing[id] = NULL; + smp_mb(); + if (slow_work_unreg_work_item == work || + slow_work_unreg_module == module) + wake_up_all(&slow_work_unreg_wq); +} +static void slow_work_clear_thread_processing(int id) +{ + slow_work_thread_processing[id] = NULL; +} +#else +static void slow_work_set_thread_processing(int id, struct slow_work *work) {} +static void slow_work_done_thread_processing(int id, struct slow_work *work) {} +static void slow_work_clear_thread_processing(int id) {} +#endif + +/* + * Data for tracking currently executing items for indication through /proc + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; +pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; +DEFINE_RWLOCK(slow_work_execs_lock); +#endif + +/* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues * as the number of threads bears no relation to the number of CPUs. @@ -105,9 +152,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */ * There are two queues of work items: one for slow work items, and one for * very slow work items. */ -static LIST_HEAD(slow_work_queue); -static LIST_HEAD(vslow_work_queue); -static DEFINE_SPINLOCK(slow_work_queue_lock); +LIST_HEAD(slow_work_queue); +LIST_HEAD(vslow_work_queue); +DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The following are two wait queues that get pinged when a work item is placed + * on an empty queue. These allow work items that are hogging a thread by + * sleeping in a way that could be deferred to yield their thread and enqueue + * themselves. + */ +static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); +static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); /* * The thread controls. A variable used to signal to the threads that they @@ -126,6 +182,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited); static int slow_work_user_count; static DEFINE_MUTEX(slow_work_user_lock); +static inline int slow_work_get_ref(struct slow_work *work) +{ + if (work->ops->get_ref) + return work->ops->get_ref(work); + + return 0; +} + +static inline void slow_work_put_ref(struct slow_work *work) +{ + if (work->ops->put_ref) + work->ops->put_ref(work); +} + /* * Calculate the maximum number of active threads in the pool that are * permitted to process very slow work items. @@ -149,7 +219,7 @@ static unsigned slow_work_calc_vsmax(void) * Attempt to execute stuff queued on a slow thread. Return true if we managed * it, false if there was nothing to do. */ -static bool slow_work_execute(void) +static noinline bool slow_work_execute(int id) { struct slow_work *work = NULL; unsigned vsmax; @@ -186,6 +256,13 @@ static bool slow_work_execute(void) } else { very_slow = false; /* avoid the compiler warning */ } + + slow_work_set_thread_processing(id, work); + if (work) { + slow_work_mark_time(work); + slow_work_begin_exec(id, work); + } + spin_unlock_irq(&slow_work_queue_lock); if (!work) @@ -194,12 +271,19 @@ static bool slow_work_execute(void) if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) BUG(); - work->ops->execute(work); + /* don't execute if the work is in the process of being cancelled */ + if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) + work->ops->execute(work); if (very_slow) atomic_dec(&vslow_work_executing_count); clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + /* wake up anyone waiting for this work to be complete */ + wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); + + slow_work_end_exec(id, work); + /* if someone tried to enqueue the item whilst we were executing it, * then it'll be left unenqueued to avoid multiple threads trying to * execute it simultaneously @@ -219,7 +303,10 @@ static bool slow_work_execute(void) spin_unlock_irq(&slow_work_queue_lock); } - work->ops->put_ref(work); + /* sort out the race between module unloading and put_ref() */ + slow_work_put_ref(work); + slow_work_done_thread_processing(id, work); + return true; auto_requeue: @@ -227,15 +314,61 @@ auto_requeue: * - we transfer our ref on the item back to the appropriate queue * - don't wake another thread up as we're awake already */ + slow_work_mark_time(work); if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) list_add_tail(&work->link, &vslow_work_queue); else list_add_tail(&work->link, &slow_work_queue); spin_unlock_irq(&slow_work_queue_lock); + slow_work_clear_thread_processing(id); return true; } /** + * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work + * work: The work item under execution that wants to sleep + * _timeout: Scheduler sleep timeout + * + * Allow a requeueable work item to sleep on a slow-work processor thread until + * that thread is needed to do some other work or the sleep is interrupted by + * some other event. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * False is returned if there is nothing on the queue; true is returned if the + * work item should be requeued + */ +bool slow_work_sleep_till_thread_needed(struct slow_work *work, + signed long *_timeout) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + + DEFINE_WAIT(wait); + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + if (!list_empty(queue)) + return true; + + add_wait_queue_exclusive(wfo_wq, &wait); + if (list_empty(queue)) + *_timeout = schedule_timeout(*_timeout); + finish_wait(wfo_wq, &wait); + + return !list_empty(queue); +} +EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); + +/** * slow_work_enqueue - Schedule a slow work item for processing * @work: The work item to queue * @@ -260,16 +393,22 @@ auto_requeue: * allowed to pick items to execute. This ensures that very slow items won't * overly block ones that are just ordinarily slow. * - * Returns 0 if successful, -EAGAIN if not. + * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is + * attempted queued) */ int slow_work_enqueue(struct slow_work *work) { + wait_queue_head_t *wfo_wq; + struct list_head *queue; unsigned long flags; + int ret; + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; BUG_ON(slow_work_user_count <= 0); BUG_ON(!work); BUG_ON(!work->ops); - BUG_ON(!work->ops->get_ref); /* when honouring an enqueue request, we only promise that we will run * the work function in the future; we do not promise to run it once @@ -280,8 +419,19 @@ int slow_work_enqueue(struct slow_work *work) * maintaining our promise */ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) + goto cancelled; + /* we promise that we will not attempt to execute the work * function in more than one thread simultaneously * @@ -299,25 +449,221 @@ int slow_work_enqueue(struct slow_work *work) if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); } else { - if (work->ops->get_ref(work) < 0) - goto cant_get_ref; - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); + ret = slow_work_get_ref(work); + if (ret < 0) + goto failed; + slow_work_mark_time(work); + list_add_tail(&work->link, queue); wake_up(&slow_work_thread_wq); + + /* if someone who could be requeued is sleeping on a + * thread, then ask them to yield their thread */ + if (work->link.prev == queue) + wake_up(wfo_wq); } spin_unlock_irqrestore(&slow_work_queue_lock, flags); } return 0; -cant_get_ref: +cancelled: + ret = -ECANCELED; +failed: spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return -EAGAIN; + return ret; } EXPORT_SYMBOL(slow_work_enqueue); +static int slow_work_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * slow_work_cancel - Cancel a slow work item + * @work: The work item to cancel + * + * This function will cancel a previously enqueued work item. If we cannot + * cancel the work item, it is guarenteed to have run when this function + * returns. + */ +void slow_work_cancel(struct slow_work *work) +{ + bool wait = true, put = false; + + set_bit(SLOW_WORK_CANCELLING, &work->flags); + smp_mb(); + + /* if the work item is a delayed work item with an active timer, we + * need to wait for the timer to finish _before_ getting the spinlock, + * lest we deadlock against the timer routine + * + * the timer routine will leave DELAYED set if it notices the + * CANCELLING flag in time + */ + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + del_timer_sync(&dwork->timer); + } + + spin_lock_irq(&slow_work_queue_lock); + + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + /* the timer routine aborted or never happened, so we are left + * holding the timer's reference on the item and should just + * drop the pending flag and wait for any ongoing execution to + * finish */ + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + + BUG_ON(timer_pending(&dwork->timer)); + BUG_ON(!list_empty(&work->link)); + + clear_bit(SLOW_WORK_DELAYED, &work->flags); + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && + !list_empty(&work->link)) { + /* the link in the pending queue holds a reference on the item + * that we will need to release */ + list_del_init(&work->link); + wait = false; + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { + /* the executor is holding our only reference on the item, so + * we merely need to wait for it to finish executing */ + clear_bit(SLOW_WORK_PENDING, &work->flags); + } + + spin_unlock_irq(&slow_work_queue_lock); + + /* the EXECUTING flag is set by the executor whilst the spinlock is set + * and before the item is dequeued - so assuming the above doesn't + * actually dequeue it, simply waiting for the EXECUTING flag to be + * released here should be sufficient */ + if (wait) + wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, + TASK_UNINTERRUPTIBLE); + + clear_bit(SLOW_WORK_CANCELLING, &work->flags); + if (put) + slow_work_put_ref(work); +} +EXPORT_SYMBOL(slow_work_cancel); + +/* + * Handle expiry of the delay timer, indicating that a delayed slow work item + * should now be queued if not cancelled + */ +static void delayed_slow_work_timer(unsigned long data) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + struct slow_work *work = (struct slow_work *) data; + unsigned long flags; + bool queued = false, put = false, first = false; + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { + clear_bit(SLOW_WORK_DELAYED, &work->flags); + + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + /* we discard the reference the timer was holding in + * favour of the one the executor holds */ + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + put = true; + } else { + slow_work_mark_time(work); + list_add_tail(&work->link, queue); + queued = true; + if (work->link.prev == queue) + first = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + if (put) + slow_work_put_ref(work); + if (first) + wake_up(wfo_wq); + if (queued) + wake_up(&slow_work_thread_wq); +} + +/** + * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing + * @dwork: The delayed work item to queue + * @delay: When to start executing the work, in jiffies from now + * + * This is similar to slow_work_enqueue(), but it adds a delay before the work + * is actually queued for processing. + * + * The item can have delayed processing requested on it whilst it is being + * executed. The delay will begin immediately, and if it expires before the + * item finishes executing, the item will be placed back on the queue when it + * has done executing. + */ +int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, + unsigned long delay) +{ + struct slow_work *work = &dwork->work; + unsigned long flags; + int ret; + + if (delay == 0) + return slow_work_enqueue(&dwork->work); + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; + + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + goto cancelled; + + /* the timer holds a reference whilst it is pending */ + ret = work->ops->get_ref(work); + if (ret < 0) + goto cant_get_ref; + + if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) + BUG(); + dwork->timer.expires = jiffies + delay; + dwork->timer.data = (unsigned long) work; + dwork->timer.function = delayed_slow_work_timer; + add_timer(&dwork->timer); + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + + return 0; + +cancelled: + ret = -ECANCELED; +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return ret; +} +EXPORT_SYMBOL(delayed_slow_work_enqueue); + /* * Schedule a cull of the thread pool at some time in the near future */ @@ -368,13 +714,23 @@ static inline bool slow_work_available(int vsmax) */ static int slow_work_thread(void *_data) { - int vsmax; + int vsmax, id; DEFINE_WAIT(wait); set_freezable(); set_user_nice(current, -5); + /* allocate ourselves an ID */ + spin_lock_irq(&slow_work_queue_lock); + id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); + __set_bit(id, slow_work_ids); + slow_work_set_thread_pid(id, current->pid); + spin_unlock_irq(&slow_work_queue_lock); + + sprintf(current->comm, "kslowd%03u", id); + for (;;) { vsmax = vslow_work_proportion; vsmax *= atomic_read(&slow_work_thread_count); @@ -395,7 +751,7 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - if (slow_work_available(vsmax) && slow_work_execute()) { + if (slow_work_available(vsmax) && slow_work_execute(id)) { cond_resched(); if (list_empty(&slow_work_queue) && list_empty(&vslow_work_queue) && @@ -412,6 +768,11 @@ static int slow_work_thread(void *_data) break; } + spin_lock_irq(&slow_work_queue_lock); + slow_work_set_thread_pid(id, 0); + __clear_bit(id, slow_work_ids); + spin_unlock_irq(&slow_work_queue_lock); + if (atomic_dec_and_test(&slow_work_thread_count)) complete_and_exit(&slow_work_last_thread_exited, 0); return 0; @@ -427,21 +788,6 @@ static void slow_work_cull_timeout(unsigned long data) } /* - * Get a reference on slow work thread starter - */ -static int slow_work_new_thread_get_ref(struct slow_work *work) -{ - return 0; -} - -/* - * Drop a reference on slow work thread starter - */ -static void slow_work_new_thread_put_ref(struct slow_work *work) -{ -} - -/* * Start a new slow work thread */ static void slow_work_new_thread_execute(struct slow_work *work) @@ -475,9 +821,11 @@ static void slow_work_new_thread_execute(struct slow_work *work) } static const struct slow_work_ops slow_work_new_thread_ops = { - .get_ref = slow_work_new_thread_get_ref, - .put_ref = slow_work_new_thread_put_ref, + .owner = THIS_MODULE, .execute = slow_work_new_thread_execute, +#ifdef CONFIG_SLOW_WORK_DEBUG + .desc = slow_work_new_thread_desc, +#endif }; /* @@ -493,10 +841,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +869,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -546,12 +894,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, /** * slow_work_register_user - Register a user of the facility + * @module: The module about to make use of the facility * * Register a user of the facility, starting up the initial threads if there * aren't any other users at this point. This will return 0 if successful, or * an error if not. */ -int slow_work_register_user(void) +int slow_work_register_user(struct module *module) { struct task_struct *p; int loop; @@ -598,14 +947,81 @@ error: } EXPORT_SYMBOL(slow_work_register_user); +/* + * wait for all outstanding items from the calling module to complete + * - note that more items may be queued whilst we're waiting + */ +static void slow_work_wait_for_items(struct module *module) +{ +#ifdef CONFIG_MODULES + DECLARE_WAITQUEUE(myself, current); + struct slow_work *work; + int loop; + + mutex_lock(&slow_work_unreg_sync_lock); + add_wait_queue(&slow_work_unreg_wq, &myself); + + for (;;) { + spin_lock_irq(&slow_work_queue_lock); + + /* first of all, we wait for the last queued item in each list + * to be processed */ + list_for_each_entry_reverse(work, &vslow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + list_for_each_entry_reverse(work, &slow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + + /* then we wait for the items being processed to finish */ + slow_work_unreg_module = module; + smp_mb(); + for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { + if (slow_work_thread_processing[loop] == module) + goto do_wait; + } + spin_unlock_irq(&slow_work_queue_lock); + break; /* okay, we're done */ + + do_wait: + spin_unlock_irq(&slow_work_queue_lock); + schedule(); + slow_work_unreg_work_item = NULL; + slow_work_unreg_module = NULL; + } + + remove_wait_queue(&slow_work_unreg_wq, &myself); + mutex_unlock(&slow_work_unreg_sync_lock); +#endif /* CONFIG_MODULES */ +} + /** * slow_work_unregister_user - Unregister a user of the facility + * @module: The module whose items should be cleared * * Unregister a user of the facility, killing all the threads if this was the * last one. + * + * This waits for all the work items belonging to the nominated module to go + * away before proceeding. */ -void slow_work_unregister_user(void) +void slow_work_unregister_user(struct module *module) { + /* first of all, wait for all outstanding items from the calling module + * to complete */ + if (module) + slow_work_wait_for_items(module); + + /* then we can actually go about shutting down the facility if need + * be */ mutex_lock(&slow_work_user_lock); BUG_ON(slow_work_user_count <= 0); @@ -639,6 +1055,16 @@ static int __init init_slow_work(void) if (slow_work_max_max_threads < nr_cpus * 2) slow_work_max_max_threads = nr_cpus * 2; #endif +#ifdef CONFIG_SLOW_WORK_DEBUG + { + struct dentry *dbdir; + + dbdir = debugfs_create_dir("slow_work", NULL); + if (dbdir && !IS_ERR(dbdir)) + debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, + NULL, &slow_work_runqueue_fops); + } +#endif return 0; } diff --git a/kernel/slow-work.h b/kernel/slow-work.h new file mode 100644 index 000000000000..321f3c59d732 --- /dev/null +++ b/kernel/slow-work.h @@ -0,0 +1,72 @@ +/* Slow work private definitions + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of + * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after + * OOM */ + +#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ + +/* + * slow-work.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern struct slow_work *slow_work_execs[]; +extern pid_t slow_work_pids[]; +extern rwlock_t slow_work_execs_lock; +#endif + +extern struct list_head slow_work_queue; +extern struct list_head vslow_work_queue; +extern spinlock_t slow_work_queue_lock; + +/* + * slow-work-debugfs.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern const struct file_operations slow_work_runqueue_fops; + +extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); +#endif + +/* + * Helper functions + */ +static inline void slow_work_set_thread_pid(int id, pid_t pid) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_pids[id] = pid; +#endif +} + +static inline void slow_work_mark_time(struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + work->mark = CURRENT_TIME; +#endif +} + +static inline void slow_work_begin_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_execs[id] = work; +#endif +} + +static inline void slow_work_end_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + write_lock(&slow_work_execs_lock); + slow_work_execs[id] = NULL; + write_unlock(&slow_work_execs_lock); +#endif +} diff --git a/kernel/smp.c b/kernel/smp.c index fd47a256a24e..a8c76069cf50 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) @@ -321,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +/* + * smp_call_function_any - Run a function on any of the given cpus + * @mask: The mask of cpus it can run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed. + * + * Returns 0 on success, else a negative status code (if no cpus were online). + * Note that @wait will be implicitly turned on in case of allocation failures, + * since we fall back to on-stack allocation. + * + * Selection preference: + * 1) current cpu if in @mask + * 2) any cpu of current node if in @mask + * 3) any other online cpu in @mask + */ +int smp_call_function_any(const struct cpumask *mask, + void (*func)(void *info), void *info, int wait) +{ + unsigned int cpu; + const struct cpumask *nodemask; + int ret; + + /* Try for same CPU (cheapest) */ + cpu = get_cpu(); + if (cpumask_test_cpu(cpu, mask)) + goto call; + + /* Try for same node. */ + nodemask = cpumask_of_node(cpu); + for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; + cpu = cpumask_next_and(cpu, nodemask, mask)) { + if (cpu_online(cpu)) + goto call; + } + + /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ + cpu = cpumask_any_and(mask, cpu_online_mask); +call: + ret = smp_call_function_single(cpu, func, info, wait); + put_cpu(); + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_function_any); + /** * __smp_call_function_single(): Run a function on another CPU * @cpu: The CPU to run on. @@ -347,13 +390,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -362,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * @wait: If true, wait (atomically) until function has completed * on other CPUs. * - * If @wait is true, then returns once @func has returned. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption @@ -450,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many); * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. In case of allocation - * failure, @wait will be implicitly turned on. + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. diff --git a/kernel/softirq.c b/kernel/softirq.c index f8749e5216e0..21939d9e830e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -302,9 +302,9 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ - rcu_irq_exit(); if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_stop_sched_tick(0); #endif diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c330838..81324d12eb35 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5ddab730cb2f..41e042219ff6 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,145 +21,28 @@ #include <linux/debug_locks.h> #include <linux/module.h> -#ifndef _spin_trylock -int __lockfunc _spin_trylock(spinlock_t *lock) -{ - return __spin_trylock(lock); -} -EXPORT_SYMBOL(_spin_trylock); -#endif - -#ifndef _read_trylock -int __lockfunc _read_trylock(rwlock_t *lock) -{ - return __read_trylock(lock); -} -EXPORT_SYMBOL(_read_trylock); -#endif - -#ifndef _write_trylock -int __lockfunc _write_trylock(rwlock_t *lock) -{ - return __write_trylock(lock); -} -EXPORT_SYMBOL(_write_trylock); -#endif - /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) - -#ifndef _read_lock -void __lockfunc _read_lock(rwlock_t *lock) -{ - __read_lock(lock); -} -EXPORT_SYMBOL(_read_lock); -#endif - -#ifndef _spin_lock_irqsave -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) -{ - return __spin_lock_irqsave(lock); -} -EXPORT_SYMBOL(_spin_lock_irqsave); -#endif - -#ifndef _spin_lock_irq -void __lockfunc _spin_lock_irq(spinlock_t *lock) -{ - __spin_lock_irq(lock); -} -EXPORT_SYMBOL(_spin_lock_irq); -#endif - -#ifndef _spin_lock_bh -void __lockfunc _spin_lock_bh(spinlock_t *lock) -{ - __spin_lock_bh(lock); -} -EXPORT_SYMBOL(_spin_lock_bh); -#endif - -#ifndef _read_lock_irqsave -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) -{ - return __read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_read_lock_irqsave); -#endif - -#ifndef _read_lock_irq -void __lockfunc _read_lock_irq(rwlock_t *lock) -{ - __read_lock_irq(lock); -} -EXPORT_SYMBOL(_read_lock_irq); -#endif - -#ifndef _read_lock_bh -void __lockfunc _read_lock_bh(rwlock_t *lock) -{ - __read_lock_bh(lock); -} -EXPORT_SYMBOL(_read_lock_bh); -#endif - -#ifndef _write_lock_irqsave -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) -{ - return __write_lock_irqsave(lock); -} -EXPORT_SYMBOL(_write_lock_irqsave); -#endif - -#ifndef _write_lock_irq -void __lockfunc _write_lock_irq(rwlock_t *lock) -{ - __write_lock_irq(lock); -} -EXPORT_SYMBOL(_write_lock_irq); -#endif - -#ifndef _write_lock_bh -void __lockfunc _write_lock_bh(rwlock_t *lock) -{ - __write_lock_bh(lock); -} -EXPORT_SYMBOL(_write_lock_bh); -#endif - -#ifndef _spin_lock -void __lockfunc _spin_lock(spinlock_t *lock) -{ - __spin_lock(lock); -} -EXPORT_SYMBOL(_spin_lock); -#endif - -#ifndef _write_lock -void __lockfunc _write_lock(rwlock_t *lock) -{ - __write_lock(lock); -} -EXPORT_SYMBOL(_write_lock); -#endif - -#else /* CONFIG_PREEMPT: */ - /* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +/* + * We build the __lock_function inlines here. They are too large for + * inlining all over the place, but here is only one user per function + * which embedds them into the calling _lock_function below. + * * This could be a long-held lock. We both prepare to spin for a long * time (making _this_ CPU preemptable if possible), and we also signal * towards that other CPU that it should break the lock ASAP. - * - * (We do this in a function because inlining it would be excessive.) */ - #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc __##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -175,9 +58,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \ (lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ - \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -198,16 +79,12 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ - \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ { \ _##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ - \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -220,23 +97,21 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ local_bh_disable(); \ local_irq_restore(flags); \ } \ - \ -EXPORT_SYMBOL(_##op##_lock_bh) /* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, spinlock); BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); -#endif /* CONFIG_PREEMPT */ +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -248,7 +123,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_spin_lock_nested); -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, + int subclass) { unsigned long flags; @@ -272,7 +148,127 @@ EXPORT_SYMBOL(_spin_lock_nest_lock); #endif -#ifndef _spin_unlock +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _spin_trylock(spinlock_t *lock) +{ + return __spin_trylock(lock); +} +EXPORT_SYMBOL(_spin_trylock); +#endif + +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _read_trylock(rwlock_t *lock) +{ + return __read_trylock(lock); +} +EXPORT_SYMBOL(_read_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _write_trylock(rwlock_t *lock) +{ + return __write_trylock(lock); +} +EXPORT_SYMBOL(_write_trylock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _read_lock(rwlock_t *lock) +{ + __read_lock(lock); +} +EXPORT_SYMBOL(_read_lock); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +{ + return __spin_lock_irqsave(lock); +} +EXPORT_SYMBOL(_spin_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _spin_lock_irq(spinlock_t *lock) +{ + __spin_lock_irq(lock); +} +EXPORT_SYMBOL(_spin_lock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _spin_lock_bh(spinlock_t *lock) +{ + __spin_lock_bh(lock); +} +EXPORT_SYMBOL(_spin_lock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +{ + return __read_lock_irqsave(lock); +} +EXPORT_SYMBOL(_read_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _read_lock_irq(rwlock_t *lock) +{ + __read_lock_irq(lock); +} +EXPORT_SYMBOL(_read_lock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _read_lock_bh(rwlock_t *lock) +{ + __read_lock_bh(lock); +} +EXPORT_SYMBOL(_read_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +{ + return __write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _write_lock_irq(rwlock_t *lock) +{ + __write_lock_irq(lock); +} +EXPORT_SYMBOL(_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _write_lock_bh(rwlock_t *lock) +{ + __write_lock_bh(lock); +} +EXPORT_SYMBOL(_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _spin_lock(spinlock_t *lock) +{ + __spin_lock(lock); +} +EXPORT_SYMBOL(_spin_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _write_lock(rwlock_t *lock) +{ + __write_lock(lock); +} +EXPORT_SYMBOL(_write_lock); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK void __lockfunc _spin_unlock(spinlock_t *lock) { __spin_unlock(lock); @@ -280,7 +276,7 @@ void __lockfunc _spin_unlock(spinlock_t *lock) EXPORT_SYMBOL(_spin_unlock); #endif -#ifndef _write_unlock +#ifndef CONFIG_INLINE_WRITE_UNLOCK void __lockfunc _write_unlock(rwlock_t *lock) { __write_unlock(lock); @@ -288,7 +284,7 @@ void __lockfunc _write_unlock(rwlock_t *lock) EXPORT_SYMBOL(_write_unlock); #endif -#ifndef _read_unlock +#ifndef CONFIG_INLINE_READ_UNLOCK void __lockfunc _read_unlock(rwlock_t *lock) { __read_unlock(lock); @@ -296,7 +292,7 @@ void __lockfunc _read_unlock(rwlock_t *lock) EXPORT_SYMBOL(_read_unlock); #endif -#ifndef _spin_unlock_irqrestore +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { __spin_unlock_irqrestore(lock, flags); @@ -304,7 +300,7 @@ void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) EXPORT_SYMBOL(_spin_unlock_irqrestore); #endif -#ifndef _spin_unlock_irq +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ void __lockfunc _spin_unlock_irq(spinlock_t *lock) { __spin_unlock_irq(lock); @@ -312,7 +308,7 @@ void __lockfunc _spin_unlock_irq(spinlock_t *lock) EXPORT_SYMBOL(_spin_unlock_irq); #endif -#ifndef _spin_unlock_bh +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH void __lockfunc _spin_unlock_bh(spinlock_t *lock) { __spin_unlock_bh(lock); @@ -320,7 +316,7 @@ void __lockfunc _spin_unlock_bh(spinlock_t *lock) EXPORT_SYMBOL(_spin_unlock_bh); #endif -#ifndef _read_unlock_irqrestore +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __read_unlock_irqrestore(lock, flags); @@ -328,7 +324,7 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) EXPORT_SYMBOL(_read_unlock_irqrestore); #endif -#ifndef _read_unlock_irq +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ void __lockfunc _read_unlock_irq(rwlock_t *lock) { __read_unlock_irq(lock); @@ -336,7 +332,7 @@ void __lockfunc _read_unlock_irq(rwlock_t *lock) EXPORT_SYMBOL(_read_unlock_irq); #endif -#ifndef _read_unlock_bh +#ifndef CONFIG_INLINE_READ_UNLOCK_BH void __lockfunc _read_unlock_bh(rwlock_t *lock) { __read_unlock_bh(lock); @@ -344,7 +340,7 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_read_unlock_bh); #endif -#ifndef _write_unlock_irqrestore +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __write_unlock_irqrestore(lock, flags); @@ -352,7 +348,7 @@ void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) EXPORT_SYMBOL(_write_unlock_irqrestore); #endif -#ifndef _write_unlock_irq +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ void __lockfunc _write_unlock_irq(rwlock_t *lock) { __write_unlock_irq(lock); @@ -360,7 +356,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock) EXPORT_SYMBOL(_write_unlock_irq); #endif -#ifndef _write_unlock_bh +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH void __lockfunc _write_unlock_bh(rwlock_t *lock) { __write_unlock_bh(lock); @@ -368,7 +364,7 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_write_unlock_bh); #endif -#ifndef _spin_trylock_bh +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH int __lockfunc _spin_trylock_bh(spinlock_t *lock) { return __spin_trylock_bh(lock); diff --git a/kernel/srcu.c b/kernel/srcu.c index b0aeeaf22ce4..818d7d9aa03c 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp) sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); return (sp->per_cpu_ref ? 0 : -ENOMEM); } +EXPORT_SYMBOL_GPL(init_srcu_struct); /* * srcu_readers_active_idx -- returns approximate number of readers @@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /** * srcu_read_lock - register a new reader for an SRCU-protected structure. @@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp) preempt_enable(); return idx; } +EXPORT_SYMBOL_GPL(srcu_read_lock); /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. @@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; preempt_enable(); } +EXPORT_SYMBOL_GPL(srcu_read_unlock); -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. - * - * Note that it is illegal to call synchornize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -void synchronize_srcu(struct srcu_struct *sp) +void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; @@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp) return; } - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * The preceding synchronize_sched() ensures that any CPU that @@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp) idx = sp->completed & 0x1; sp->completed++; - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * At this point, because of the preceding synchronize_sched(), @@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp) while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * The preceding synchronize_sched() forces all srcu_read_unlock() @@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp) } /** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; doing so + * will result in deadlock. However, it is perfectly legal to call + * synchronize_srcu_expedited() on one srcu_struct from some other + * srcu_struct's read-side critical section. + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched_expedited); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** * srcu_batches_completed - return batches completed. * @sp: srcu_struct on which to report batch completion. * @@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } - -EXPORT_SYMBOL_GPL(init_srcu_struct); -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -EXPORT_SYMBOL_GPL(srcu_read_lock); -EXPORT_SYMBOL_GPL(srcu_read_unlock); -EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); diff --git a/kernel/sys.c b/kernel/sys.c index ebcb15611728..9968c5fb55b9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -911,16 +911,15 @@ change_okay: void do_sys_times(struct tms *tms) { - struct task_cputime cputime; - cputime_t cutime, cstime; + cputime_t tgutime, tgstime, cutime, cstime; - thread_group_cputime(current, &cputime); spin_lock_irq(¤t->sighand->siglock); + thread_group_times(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); - tms->tms_utime = cputime_to_clock_t(cputime.utime); - tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_utime = cputime_to_clock_t(tgutime); + tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); tms->tms_cstime = cputime_to_clock_t(cstime); } @@ -1110,6 +1109,8 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); + if (err > 0) + proc_sid_connector(group_leader); return err; } @@ -1336,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t utime, stime; - struct task_cputime cputime; + cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - utime = task_utime(current); - stime = task_stime(current); + task_times(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1371,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_cputime(p, &cputime); - utime = cputime_add(utime, cputime.utime); - stime = cputime_add(stime, cputime.stime); + thread_group_times(p, &tgutime, &tgstime); + utime = cputime_add(utime, tgutime); + stime = cputime_add(stime, tgstime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1542,6 +1541,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; + break; + default: + return -EINVAL; + } + error = 0; + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 515bc230ac2a..e06d0b8d1951 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg); cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recvfrom); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0dfaa47d7cb6..4dbf93a52ee9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/utsname.h> #include <linux/kmemcheck.h> #include <linux/smp_lock.h> #include <linux/fs.h> @@ -37,6 +36,7 @@ #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> +#include <linux/ratelimit.h> #include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> @@ -77,6 +77,7 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; +extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -158,14 +159,16 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +extern struct ratelimit_state printk_ratelimit_state; + #ifdef CONFIG_RT_MUTEXES extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -424,6 +427,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1390,6 +1401,31 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, +#ifdef CONFIG_MEMORY_FAILURE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2218,7 +2254,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2279,7 +2315,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2293,10 +2328,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2321,7 +2356,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2428,13 +2463,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2442,7 +2477,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2452,10 +2486,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2463,7 +2497,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2475,7 +2509,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2527,7 +2561,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2540,19 +2573,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2657,21 +2689,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2684,17 +2714,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2709,11 +2738,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2789,7 +2817,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2801,10 +2828,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2812,7 +2839,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2824,10 +2850,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2835,7 +2861,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2848,14 +2873,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2864,7 +2889,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2879,50 +2904,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423ca711a..b6e7aaea4604 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->ctl_name && table->strategy) set_fail(&fail, table, "Strategy without ctl_name"); #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif diff --git a/kernel/time.c b/kernel/time.c index 2e2e469a7fec..804798005d19 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x) #endif } +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 0b0a6366c9d4..ee266620b06c 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 09113347d328..5e18c6ab2c6a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -394,15 +394,11 @@ void clocksource_resume(void) { struct clocksource *cs; - mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); clocksource_resume_watchdog(); - - mutex_unlock(&clocksource_mutex); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a21c061..89aed5933ed4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle) if (!inidle && !ts->inidle) goto end; + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(ts); /* @@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - ts->inidle = 1; - if (need_resched()) goto end; diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 000000000000..86628e755f38 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ecd..c3a4e2907eaa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/sysdev.h> #include <linux/clocksource.h> #include <linux/jiffies.h> diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fddd69d16e03..1b5b7aa2fdfd 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp) return single_open(filp, timer_list_show, NULL); } -static struct file_operations timer_list_fops = { +static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 4cde8b9c716f..ee5681f8d7ec 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp) return single_open(filp, tstats_show, NULL); } -static struct file_operations tstats_fops = { +static const struct file_operations tstats_fops = { .open = tstats_open, .read = seq_read, .write = tstats_write, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b416512ad17f..d006554888dc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -339,6 +339,27 @@ config POWER_TRACER power management decisions, specifically the C-state and P-state behavior. +config KSYM_TRACER + bool "Trace read and write access on kernel memory locations" + depends on HAVE_HW_BREAKPOINT + select TRACING + help + This tracer helps find read and write operations on any given kernel + symbol i.e. /proc/kallsyms. + +config PROFILE_KSYM_TRACER + bool "Profile all kernel memory accesses on 'watched' variables" + depends on KSYM_TRACER + help + This tracer profiles kernel accesses on variables watched through the + ksym tracer ftrace plugin. Depending upon the hardware, all read + and write operations on kernel variables can be monitored for + accesses. + + The results will be displayed in: + /debugfs/tracing/profile_ksym + + Say N if unsure. config STACK_TRACER bool "Trace max stack" @@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_EVENT + depends on KPROBES + depends on X86 + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) on the fly + via the ftrace interface. See Documentation/trace/kprobetrace.txt + for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. If + you want to use perf tools, this option is strongly recommended. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 26f03ac07c2b..cd9ecd89ec77 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3eb159c277c8..d9d6206e0b14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, } /** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for * @rq: io request @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); @@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 23df7771c937..e51a1bcb7bed 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -60,6 +60,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void) if (ftrace_trace_function == ftrace_stub) return; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST func = ftrace_trace_function; +#else + func = __ftrace_trace_function; +#endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -736,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable) failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; } } while_for_each_ftrace_rec(); } @@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -1621,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!ret) { struct seq_file *m = file->private_data; m->private = iter; - } else + } else { + trace_parser_put(&iter->parser); kfree(iter); + } } else file->private_data = iter; mutex_unlock(&ftrace_regex_lock); @@ -1655,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -1757,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable) int not; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1825,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1990,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2067,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2202,7 +2180,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, struct trace_parser *parser; ssize_t ret, read; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&ftrace_regex_lock); @@ -2216,20 +2194,20 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); - if (trace_parser_loaded(parser) && + if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { ret = ftrace_process_regex(parser->buffer, parser->idx, enable); if (ret) - goto out; + goto out_unlock; trace_parser_clear(parser); } ret = read; - +out_unlock: mutex_unlock(&ftrace_regex_lock); -out: + return ret; } @@ -2311,6 +2289,32 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2327,6 +2331,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int @@ -2512,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (not) return -EINVAL; @@ -2552,8 +2560,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - size_t read = 0; - ssize_t ret; + ssize_t read, ret; if (!cnt || cnt < 0) return 0; @@ -2562,29 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { ret = -EBUSY; - goto out; + goto out_unlock; } if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; - goto out; + goto out_unlock; } read = trace_get_user(&parser, ubuf, cnt, ppos); - if (trace_parser_loaded((&parser))) { + if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, parser.buffer); if (ret) - goto out; + goto out_free; } ret = read; - out: + +out_free: trace_parser_put(&parser); +out_unlock: mutex_unlock(&graph_lock); return ret; @@ -2622,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2655,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod, } #ifdef CONFIG_MODULES -void ftrace_release(void *start, void *end) +void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = (unsigned long)end; - if (ftrace_disabled || !start || start == end) + if (ftrace_disabled) return; mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { + if (within_module_core(rec->ip, mod)) { /* * rec->ip is changed in ftrace_free_rec() * It should not between s and e if record was freed. @@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2699,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self, mod->num_ftrace_callsites); break; case MODULE_STATE_GOING: - ftrace_release(mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + ftrace_release_mod(mod); break; } @@ -2747,7 +2752,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -2780,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2847,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2865,74 +2851,184 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } -static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_pid_add(int p) { struct pid *pid; - char buf[64]; - long val; - int ret; + struct ftrace_pid *fpid; + int ret = -EINVAL; - if (cnt >= sizeof(buf)) - return -EINVAL; + mutex_lock(&ftrace_lock); - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); - buf[cnt] = 0; + if (!pid) + goto out; - ret = strict_strtol(buf, 10, &val); - if (ret < 0) - return ret; + ret = 0; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; - clear_ftrace_pid_task(&ftrace_pid_trace); + ret = -ENOMEM; - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); + set_ftrace_pid_task(pid); - if (!pid) - goto out; + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; - ftrace_pid_trace = pid; + clear_ftrace_pid_task(pid); - set_ftrace_pid_task(ftrace_pid_trace); + list_del(&fpid->list); + kfree(fpid); } - /* update the function call */ ftrace_update_pid_func(); ftrace_startup_enable(0); - out: mutex_unlock(&ftrace_lock); +} - return cnt; +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64], *tmp; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + tmp = strstrip(buf); + if (strlen(tmp) == 0) + return 1; + + ret = strict_strtol(tmp, 10, &val); + if (ret < 0) + return ret; + + ret = ftrace_pid_add(val); + + return ret ? ret : cnt; +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) @@ -3015,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3025,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; @@ -3295,4 +3391,3 @@ void ftrace_graph_stop(void) ftrace_stop(); } #endif - diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 81b1645c8549..a91da69f153a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void) return 1; } - if (!register_tracer(&kmem_tracer)) { + if (register_tracer(&kmem_tracer) != 0) { pr_warning("Warning: could not register the kmem tracer\n"); return 1; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff01970547..a1ca4956ab5e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s) int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } @@ -483,7 +486,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -1193,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1207,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -1785,9 +1790,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { + struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; @@ -1868,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -1890,13 +1895,10 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *commit_page; + struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1907,7 +1909,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (write > BUF_PAGE_SIZE) return rb_move_tail(cpu_buffer, length, tail, - commit_page, tail_page, ts); + tail_page, ts); /* We reserved something on the buffer */ @@ -2111,7 +2113,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. @@ -2681,7 +2683,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 573d3cc762c3..b2477caf09c2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -35,6 +35,28 @@ static int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run producer"); +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = 19; +static int consumer_nice = 19; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + static int read_events; static int kill_test; @@ -208,15 +230,18 @@ static void ring_buffer_producer(void) do { struct ring_buffer_event *event; int *entry; - - event = ring_buffer_lock_reserve(buffer, 10); - if (!event) { - missed++; - } else { - hit++; - entry = ring_buffer_event_data(event); - *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } } do_gettimeofday(&end_tv); @@ -263,6 +288,27 @@ static void ring_buffer_producer(void) if (kill_test) trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == 19 && consumer_nice == 19) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + trace_printk("Time: %lld (usecs)\n", time); trace_printk("Overruns: %lld\n", overruns); if (disable_reader) @@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void) if (IS_ERR(producer)) goto out_kill; + /* + * Run them as low-prio background tasks by default: + */ + if (!disable_reader) { + if (consumer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(consumer, SCHED_FIFO, ¶m); + } else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(producer, SCHED_FIFO, ¶m); + } else + set_user_nice(producer, producer_nice); + return 0; out_kill: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c0f6a8a22eb..874f2893cff0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; @@ -137,7 +137,7 @@ static int __init set_ftrace(char *str) ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -415,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* read the non-space input */ while (cnt && !isspace(ch)) { - if (parser->idx < parser->size) + if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; @@ -1361,10 +1361,11 @@ int trace_array_vprintk(struct trace_array *tr, pause_graph_tracing(); raw_local_irq_save(irq_flags); __raw_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; + if (args == NULL) { + strncpy(trace_buf, fmt, TRACE_BUF_SIZE); + len = strlen(trace_buf); + } else + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; buffer = tr->buffer; @@ -1373,10 +1374,10 @@ int trace_array_vprintk(struct trace_array *tr, if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - entry->ip = ip; + entry->ip = ip; memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; + entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); @@ -1393,7 +1394,7 @@ int trace_array_vprintk(struct trace_array *tr, int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_printk(&global_trace, ip, fmt, args); + return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -1984,11 +1985,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -2442,7 +2441,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2584,7 +2583,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2766,7 +2765,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -3301,7 +3300,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) @@ -3321,22 +3320,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } -static int mark_printk(const char *fmt, ...) -{ - int ret; - va_list args; - va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); - va_end(args); - return ret; -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; - char *end; if (tracing_disabled) return -EINVAL; @@ -3344,7 +3332,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 1, GFP_KERNEL); + buf = kmalloc(cnt + 2, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -3352,14 +3340,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, kfree(buf); return -EFAULT; } + if (buf[cnt-1] != '\n') { + buf[cnt] = '\n'; + buf[cnt+1] = '\0'; + } else + buf[cnt] = '\0'; - /* Cut from the first nil or newline. */ - buf[cnt] = '\0'; - end = strchr(buf, '\n'); - if (end) - *end = '\0'; - - cnt = mark_printk("%s\n", buf); + cnt = trace_vprintk(0, buf, NULL); kfree(buf); *fpos += cnt; @@ -3732,7 +3719,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) - return ENOMEM; + return -ENOMEM; trace_seq_init(s); @@ -4389,7 +4376,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4400,7 +4387,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 405cb850b75d..1d7f4830a80d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> +#include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -37,6 +38,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, + TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -98,9 +100,32 @@ struct syscall_trace_enter { struct syscall_trace_exit { struct trace_entry ent; int nr; - unsigned long ret; + long ret; }; +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kretprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -364,6 +390,8 @@ int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); #ifdef CONFIG_TRACER_MAX_TRACE @@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_ksym(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -483,10 +513,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; @@ -500,12 +526,12 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); @@ -687,7 +713,6 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; - bool no_reset; }; struct event_subsystem { @@ -699,22 +724,40 @@ struct event_subsystem { }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; }; +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, @@ -730,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd249a9..4a194f08f88c 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; + struct ring_buffer *buffer; unsigned long flags; int cpu, pc; const char *p; @@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto out; @@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 20c5f92e28a8..878c03f386ba 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -20,6 +20,8 @@ #include <linux/ktime.h> #include <linux/trace_clock.h> +#include "trace.h" + /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -28,17 +30,17 @@ */ u64 notrace trace_clock_local(void) { - unsigned long flags; u64 clock; + int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - raw_local_irq_save(flags); + resched = ftrace_preempt_disable(); clock = sched_clock(); - raw_local_irq_restore(flags); + ftrace_preempt_enable(resched); return clock; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ead3d724599d..c16a08f399df 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, F_printk("type:%u call_site:%lx ptr:%p", __entry->type_id, __entry->call_site, __entry->ptr) ); + +FTRACE_ENTRY(ksym_trace, ksym_trace_entry, + + TRACE_KSYM, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned char, type ) + __array( char , cmd, TASK_COMM_LEN ) + __field( unsigned long, addr ) + ), + + F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", + (void *)__entry->ip, (unsigned int)__entry->type, + (void *)__entry->addr, __entry->cmd) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index dd44b8768867..d9c60f80aa0d 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,17 +8,14 @@ #include <linux/module.h> #include "trace.h" -/* - * We can't use a size but a type in alloc_percpu() - * So let's create a dummy type that matches the desired size - */ -typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; -char *trace_profile_buf; -EXPORT_SYMBOL_GPL(trace_profile_buf); +char *perf_trace_buf; +EXPORT_SYMBOL_GPL(perf_trace_buf); + +char *perf_trace_buf_nmi; +EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); -char *trace_profile_buf_nmi; -EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); +typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; /* Count the events in use (per event id, not per instance) */ static int total_profile_count; @@ -31,29 +28,34 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) if (atomic_inc_return(&event->profile_count)) return 0; - if (!total_profile_count++) { - buf = (char *)alloc_percpu(profile_buf_t); + if (!total_profile_count) { + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; - rcu_assign_pointer(trace_profile_buf, buf); + rcu_assign_pointer(perf_trace_buf, buf); - buf = (char *)alloc_percpu(profile_buf_t); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(); - if (!ret) + ret = event->profile_enable(event); + if (!ret) { + total_profile_count++; return 0; + } - kfree(trace_profile_buf_nmi); fail_buf_nmi: - kfree(trace_profile_buf); + if (!total_profile_count) { + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; + } fail_buf: - total_profile_count--; atomic_dec(&event->profile_count); return ret; @@ -84,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) if (!atomic_add_negative(-1, &event->profile_count)) return; - event->profile_disable(); + event->profile_disable(event); if (!--total_profile_count) { - buf = trace_profile_buf; - rcu_assign_pointer(trace_profile_buf, NULL); + buf = perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); - nmi_buf = trace_profile_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, NULL); + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); /* * Ensure every events in profiling have finished before diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6f03c8a1105e..1d18315dc836 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_define_common_fields); -#ifdef CONFIG_MODULES - -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; @@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(call->data); + call->unregfunc(call); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call->data); + call->regfunc(call); } break; } @@ -232,10 +228,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - size_t read = 0; - ssize_t ret; + ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; ret = tracing_update_buffers(); @@ -247,7 +242,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, read = trace_get_user(&parser, ubuf, cnt, ppos); - if (trace_parser_loaded((&parser))) { + if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') @@ -508,7 +503,7 @@ extern char *__bad_type_size(void); #define FIELD(type, name) \ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + sizeof(field.name), is_signed_type(type) static int trace_write_header(struct trace_seq *s) { @@ -516,17 +511,17 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\n", + FIELD(unsigned short, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t @@ -879,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -893,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; int ret; /* @@ -911,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, } if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + trace_create_file("enable", 0644, call->dir, call, + enable); if (call->id && call->profile_enable) - entry = trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(call); @@ -925,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) +static int __trace_add_event_call(struct ftrace_event_call *call) +{ + struct dentry *d_events; + int ret; -#ifdef CONFIG_MODULES + if (!call->name) + return -EINVAL; -static LIST_HEAD(ftrace_module_file_list); + if (call->raw_init) { + ret = call->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "events/%s\n", call->name); + return ret; + } + } -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call); + mutex_unlock(&event_mutex); + return ret; +} static void remove_subsystem_dir(const char *name) { @@ -987,6 +1000,53 @@ static void remove_subsystem_dir(const char *name) } } +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event) + __unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1044,7 +1104,7 @@ static void trace_module_add_events(struct module *mod) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1062,10 +1122,11 @@ static void trace_module_add_events(struct module *mod) return; } call->mod = mod; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + ret = event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (!ret) + list_add(&call->list, &ftrace_events); } } @@ -1079,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->system); + __trace_remove_event_call(call); } } @@ -1204,7 +1258,7 @@ static __init int event_trace_init(void) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1212,10 +1266,12 @@ static __init int event_trace_init(void) continue; } } - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); } while (true) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 23245785927f..50504cb228de 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,11 +18,10 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ -#include <linux/debugfs.h> -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -31,6 +30,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -48,16 +48,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, char **addr = (char **)(event + pred->offset); int cmp, match; - cmp = strncmp(*addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* Basic regex callbacks */ +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strstr(str, r->pattern)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + char *ptr = strstr(str, r->pattern); + + if (ptr && (ptr[r->len] == 0)) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} + +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); - return -ENOMEM; + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system) return 0; } -enum { - FILTER_DISABLE_ALL, - FILTER_INIT_NO_RESET, - FILTER_SKIP_NO_RESET, -}; - -static void filter_free_subsystem_preds(struct event_subsystem *system, - int flag) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (strcmp(call->system, system->name) != 0) continue; - if (flag == FILTER_INIT_NO_RESET) { - call->filter->no_reset = false; - continue; - } - - if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) - continue; - filter_disable_preds(call); remove_filter_string(call->filter); } @@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field) static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - pred->str_len = field->size; + filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else if (field->filter_type == FILTER_DYN_STRING) + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; else { fn = filter_pred_pchar; - pred->str_len = strlen(pred->str_val); + pred->regex.field_len = strlen(pred->regex.pattern); } } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); - return 0; -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string, - bool dry_run) -{ - struct ftrace_event_call *call; - int err = 0; - bool fail = true; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - if (call->filter->no_reset) - continue; - - err = filter_add_pred(ps, call, pred, dry_run); - if (err) - call->filter->no_reset = true; - else - fail = false; - - if (!dry_run) - replace_filter_string(call->filter, filter_string); - } - - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; - } + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -933,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps) while (!list_empty(&ps->postfix)) { elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - kfree(elt->operand); list_del(&elt->list); + kfree(elt->operand); + kfree(elt); } } @@ -1044,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; @@ -1089,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1137,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system, add_pred: if (!pred) return -ENOMEM; - if (call) - err = filter_add_pred(ps, call, pred, false); - else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1152,10 +1226,50 @@ add_pred: return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1167,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1186,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1204,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1214,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1234,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); - - /* try to see the filter can be applied to which events */ - err = replace_preds(system, NULL, ps, filter_string, true); - if (err) { + err = replace_system_preds(system, ps, filter_string); + if (err) append_filter_err(ps, system->filter); - goto out; + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; } - filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + err = -EINVAL; + if (!call) + goto out_unlock; - /* really apply the filter to the events */ - err = replace_preds(system, NULL, ps, filter_string, false); - if (err) { - append_filter_err(ps, system->filter); - filter_free_subsystem_preds(system, 2); + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; } -out: + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + out_unlock: mutex_unlock(&event_mutex); return err; } +#endif /* CONFIG_EVENT_PROFILE */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9753fcc61bc5..dff8c84ddf17 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -48,11 +48,11 @@ struct ____ftrace_##name { \ tstruct \ }; \ -static void __used ____ftrace_check_##name(void) \ +static void __always_unused ____ftrace_check_##name(void) \ { \ struct ____ftrace_##name *__entry = NULL; \ \ - /* force cmpile-time check on F_printk() */ \ + /* force compile-time check on F_printk() */ \ printk(print); \ } @@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \ #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __field_desc #define __field_desc(type, container, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __array_desc #define __array_desc(type, container, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __dynamic_array #define __dynamic_array(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ + "offset:%zu;\tsize:0;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + is_signed_type(type)); \ if (!ret) \ return 0; @@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ #include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" +static int ftrace_raw_init_event(struct ftrace_event_call *call) +{ + INIT_LIST_HEAD(&call->fields); + return 0; +} #undef __field #define __field(type, item) @@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ -static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ @@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = ftrace_raw_init_event, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ #include "trace_entries.h" diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 23b63859130e..69543a905cd5 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to) struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; + struct ring_buffer *buf; struct hw_branch_entry *entry; unsigned long irq1; int cpu; @@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + buf = tr->buffer; + event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, sizeof(*entry), 0, 0); if (!event) goto out; @@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buf, event)) + trace_buffer_unlock_commit(buf, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 000000000000..aff5f80b59b8 --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1523 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define KPROBE_EVENT_SYSTEM "kprobes" + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_NARGS "__probe_nargs" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + "common_lock_depth", + FIELD_STRING_IP, + FIELD_STRING_NARGS, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return regs_get_kernel_stack_nth(regs, + (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, + void *dummy) +{ + return kernel_stack_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * Kprobe event core functions + */ + +struct probe_arg { + struct fetch_func fetch; + const char *name; +}; + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +struct trace_probe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + const char *symbol; /* symbol name */ + struct ftrace_event_call call; + struct trace_event event; + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct probe_arg) * (n))) + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return tp->rp.handler != NULL; +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = regs_query_register_name((unsigned int)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + if (sc->offset) + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, + sc->offset); + else + ret = snprintf(buf, n, "@%s", sc->symbol); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "$retval"); + else if (ff->func == fetch_stack_address) + ret = snprintf(buf, n, "$stack"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + size_t l = 0; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret >= n) + goto end; + l += ret; + ret = probe_arg_string(buf + l, n - l, &id->orig); + if (ret < 0) + goto end; + l += ret; + ret = snprintf(buf + l, n - l, ")"); + ret += l; + } +end: + if (ret >= n) + return -ENOSPC; + return ret; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, int is_return) +{ + struct trace_probe *tp; + + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + tp->rp.kp.symbol_name = tp->symbol; + tp->rp.kp.offset = offs; + } else + tp->rp.kp.addr = addr; + + if (is_return) + tp->rp.handler = kretprobe_dispatcher; + else + tp->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event) + goto error; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + + if (!group) + goto error; + tp->call.system = kstrdup(group, GFP_KERNEL); + if (!tp->call.system) + goto error; + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(-ENOMEM); +} + +static void free_probe_arg(struct probe_arg *arg) +{ + if (arg->fetch.func == fetch_symbol) + free_symbol_cache(arg->fetch.data); + else if (arg->fetch.func == fetch_indirect) + free_indirect_fetch_data(arg->fetch.data); + kfree(arg->name); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + free_probe_arg(&tp->args[i]); + + kfree(tp->call.system); + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event, + const char *group) +{ + struct trace_probe *tp; + + list_for_each_entry(tp, &probe_list, list) + if (strcmp(tp->call.name, event) == 0 && + strcmp(tp->call.system, group) == 0) + return tp; + return NULL; +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + list_del(&tp->list); + unregister_probe_event(tp); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + /* register as an event */ + old_tp = find_probe_event(tp->call.name, tp->call.system); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + goto end; + } + + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->rp.kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + unregister_probe_event(tp); + } else + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) { + ff->func = fetch_retvalue; + ff->data = NULL; + } else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + ff->func = fetch_stack_address; + ff->data = NULL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { + ret = strict_strtoul(arg + 3, 10, ¶m); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + return ret; +} + +/* Recursive argument parser */ +static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, ff, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = -EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = __parse_probe_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +/* String length checking wrapper */ +static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + return __parse_probe_arg(arg, ff, is_return); +} + +/* Return 1 if name is reserved or already used by another argument */ +static int conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + return 0; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * Fetch args: + * $argN : fetch Nth of function argument. (N:0-) + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + */ + struct trace_probe *tp; + int i, ret = 0; + int is_return = 0; + char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else { + pr_info("Probe definition must be started with 'p' or 'r'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specifiled\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specifiled\n"); + return -EINVAL; + } + } + + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tp)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tp)); + return PTR_ERR(tp); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) + *arg++ = '\0'; + else + arg = argv[i]; + + if (conflict_field_name(argv[i], tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + if (!tp->args[i].name) { + pr_info("Failed to allocate argument%d name '%s'.\n", + i, argv[i]); + ret = -ENOMEM; + goto error; + } + + /* Parse fetch argument */ + ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + if (ret) { + pr_info("Parse error at argument%d. (%d)\n", i, ret); + kfree(tp->args[i].name); + goto error; + } + + tp->nr_args++; + } + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + + if (!tp->symbol) + seq_printf(m, " 0x%p", tp->rp.kp.addr); + else if (tp->rp.kp.offset) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " %s", probe_symbol(tp)); + + for (i = 0; i < tp->nr_args; i++) { + ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); + if (ret < 0) { + pr_warning("Argument%d decoding error(%d).\n", i, ret); + return ret; + } + seq_printf(m, " %s=%s", tp->args[i].name, buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + tp->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + tp->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kretprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_TRACE; + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_TRACE; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + +static int probe_event_raw_init(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + + return 0; +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (!ret) + return ret; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (!ret) + return ret; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int __probe_event_show_format(struct trace_seq *s, + struct trace_probe *tp, const char *fmt, + const char *arg) +{ + int i; + + /* Show format */ + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) + return 0; + + if (!trace_seq_printf(s, "\", %s", arg)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) + return 0; + + return trace_seq_puts(s, "\n"); +} + +#undef SHOW_FIELD +#define SHOW_FIELD(type, item, name) \ + do { \ + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ + "offset:%u;\tsize:%u;\n", name, \ + (unsigned int)offsetof(typeof(field), item),\ + (unsigned int)sizeof(type)); \ + if (!ret) \ + return 0; \ + } while (0) + +static int kprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx)", + "REC->" FIELD_STRING_IP); +} + +static int kretprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kretprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); + SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx <- %lx)", + "REC->" FIELD_STRING_FUNC + ", REC->" FIELD_STRING_RETIP); +} + +#ifdef CONFIG_EVENT_PROFILE + +/* Kprobe profile handler */ +static __kprobes int kprobe_profile_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +/* Kretprobe profile handler */ +static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kretprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ret_ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +static int probe_profile_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_profile_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_EVENT_PROFILE */ + + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) + kprobe_trace_func(kp, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kprobe_profile_func(kp, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kretprobe_profile_func(ri, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + if (probe_is_return(tp)) { + tp->event.trace = print_kretprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kretprobe_event_show_format; + call->define_fields = kretprobe_event_define_fields; + } else { + tp->event.trace = print_kprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kprobe_event_show_format; + call->define_fields = kprobe_event_define_fields; + } + call->event = &tp->event; + call->id = register_ftrace_event(&tp->event); + if (!call->id) + return -ENODEV; + call->enabled = 0; + call->regfunc = probe_event_enable; + call->unregfunc = probe_event_disable; + +#ifdef CONFIG_EVENT_PROFILE + atomic_set(&call->profile_count, -1); + call->profile_enable = probe_profile_enable; + call->profile_disable = probe_profile_disable; +#endif + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + unregister_ftrace_event(&tp->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c new file mode 100644 index 000000000000..ddfa0fd43bc0 --- /dev/null +++ b/kernel/trace/trace_ksym.c @@ -0,0 +1,550 @@ +/* + * trace_ksym.c - Kernel Symbol Tracer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/fs.h> + +#include "trace_output.h" +#include "trace_stat.h" +#include "trace.h" + +#include <linux/hw_breakpoint.h> +#include <asm/hw_breakpoint.h> + +/* + * For now, let us restrict the no. of symbols traced simultaneously to number + * of available hardware breakpoint registers. + */ +#define KSYM_TRACER_MAX HBP_NUM + +#define KSYM_TRACER_OP_LEN 3 /* rw- */ + +struct trace_ksym { + struct perf_event **ksym_hbp; + struct perf_event_attr attr; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; +}; + +static struct trace_array *ksym_trace_array; + +static unsigned int ksym_filter_entry_count; +static unsigned int ksym_tracing_enabled; + +static HLIST_HEAD(ksym_filter_head); + +static DEFINE_MUTEX(ksym_tracer_mutex); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + +#define MAX_UL_INT 0xffffffff + +void ksym_collect_stats(unsigned long hbp_hit_addr) +{ + struct hlist_node *node; + struct trace_ksym *entry; + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { + if ((entry->attr.bp_addr == hbp_hit_addr) && + (entry->counter <= MAX_UL_INT)) { + entry->counter++; + break; + } + } + rcu_read_unlock(); +} +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + +void ksym_hbp_handler(struct perf_event *hbp, void *data) +{ + struct ring_buffer_event *event; + struct ksym_trace_entry *entry; + struct pt_regs *regs = data; + struct ring_buffer *buffer; + int pc; + + if (!ksym_tracing_enabled) + return; + + buffer = ksym_trace_array->buffer; + + pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, + sizeof(*entry), 0, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = instruction_pointer(regs); + entry->type = hw_breakpoint_type(hbp); + entry->addr = hw_breakpoint_addr(hbp); + strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + ksym_collect_stats(hw_breakpoint_addr(hbp)); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +/* Valid access types are represented as + * + * rw- : Set Read/Write Access Breakpoint + * -w- : Set Write Access Breakpoint + * --- : Clear Breakpoints + * --x : Set Execution Break points (Not available yet) + * + */ +static int ksym_trace_get_access_type(char *str) +{ + int access = 0; + + if (str[0] == 'r') + access |= HW_BREAKPOINT_R; + + if (str[1] == 'w') + access |= HW_BREAKPOINT_W; + + if (str[2] == 'x') + access |= HW_BREAKPOINT_X; + + switch (access) { + case HW_BREAKPOINT_R: + case HW_BREAKPOINT_W: + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + return access; + default: + return -EINVAL; + } +} + +/* + * There can be several possible malformed requests and we attempt to capture + * all of them. We enumerate some of the rules + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. + * i.e. multiple ':' symbols disallowed. Possible uses are of the form + * <module>:<ksym_name>:<op>. + * 2. No delimiter symbol ':' in the input string + * 3. Spurious operator symbols or symbols not in their respective positions + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file + * 5. Kernel symbol not a part of /proc/kallsyms + * 6. Duplicate requests + */ +static int parse_ksym_trace_str(char *input_string, char **ksymname, + unsigned long *addr) +{ + int ret; + + *ksymname = strsep(&input_string, ":"); + *addr = kallsyms_lookup_name(*ksymname); + + /* Check for malformed request: (2), (1) and (5) */ + if ((!input_string) || + (strlen(input_string) != KSYM_TRACER_OP_LEN) || + (*addr == 0)) + return -EINVAL;; + + ret = ksym_trace_get_access_type(input_string); + + return ret; +} + +int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) +{ + struct trace_ksym *entry; + int ret = -ENOMEM; + + if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { + printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" + " new requests for tracing can be accepted now.\n", + KSYM_TRACER_MAX); + return -ENOSPC; + } + + entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + hw_breakpoint_init(&entry->attr); + + entry->attr.bp_type = op; + entry->attr.bp_addr = addr; + entry->attr.bp_len = HW_BREAKPOINT_LEN_4; + + ret = -EAGAIN; + entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + + if (IS_ERR(entry->ksym_hbp)) { + ret = PTR_ERR(entry->ksym_hbp); + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + goto err; + } + + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); + ksym_filter_entry_count++; + + return 0; + +err: + kfree(entry); + + return ret; +} + +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + struct trace_seq *s; + ssize_t cnt = 0; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + trace_seq_init(s); + + mutex_lock(&ksym_tracer_mutex); + + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); + if (entry->attr.bp_type == HW_BREAKPOINT_R) + ret = trace_seq_puts(s, "r--\n"); + else if (entry->attr.bp_type == HW_BREAKPOINT_W) + ret = trace_seq_puts(s, "-w-\n"); + else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) + ret = trace_seq_puts(s, "rw-\n"); + WARN_ON_ONCE(!ret); + } + + cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + mutex_unlock(&ksym_tracer_mutex); + + kfree(s); + + return cnt; +} + +static void __ksym_trace_reset(void) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + +static ssize_t ksym_trace_filter_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char *input_string, *ksymname = NULL; + unsigned long ksym_addr = 0; + int ret, op, changed = 0; + + input_string = kzalloc(count + 1, GFP_KERNEL); + if (!input_string) + return -ENOMEM; + + if (copy_from_user(input_string, buffer, count)) { + kfree(input_string); + return -EFAULT; + } + input_string[count] = '\0'; + + strstrip(input_string); + + /* + * Clear all breakpoints if: + * 1: echo > ksym_trace_filter + * 2: echo 0 > ksym_trace_filter + * 3: echo "*:---" > ksym_trace_filter + */ + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { + __ksym_trace_reset(); + kfree(input_string); + return count; + } + + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); + if (ret < 0) { + kfree(input_string); + return ret; + } + + mutex_lock(&ksym_tracer_mutex); + + ret = -EINVAL; + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + if (entry->attr.bp_addr == ksym_addr) { + /* Check for malformed request: (6) */ + if (entry->attr.bp_type != op) + changed = 1; + else + goto out; + break; + } + } + if (changed) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + entry->attr.bp_type = op; + ret = 0; + if (op > 0) { + entry->ksym_hbp = + register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + if (IS_ERR(entry->ksym_hbp)) + ret = PTR_ERR(entry->ksym_hbp); + else + goto out; + } + /* Error or "symbol:---" case: drop it */ + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + goto out; + } else { + /* Check for malformed request: (4) */ + if (op == 0) + goto out; + ret = process_new_ksym_entry(ksymname, op, ksym_addr); + } +out: + mutex_unlock(&ksym_tracer_mutex); + + kfree(input_string); + + if (!ret) + ret = count; + return ret; +} + +static const struct file_operations ksym_tracing_fops = { + .open = tracing_open_generic, + .read = ksym_trace_filter_read, + .write = ksym_trace_filter_write, +}; + +static void ksym_trace_reset(struct trace_array *tr) +{ + ksym_tracing_enabled = 0; + __ksym_trace_reset(); +} + +static int ksym_trace_init(struct trace_array *tr) +{ + int cpu, ret = 0; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + ksym_tracing_enabled = 1; + ksym_trace_array = tr; + + return ret; +} + +static void ksym_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# TASK-PID CPU# Symbol " + "Type Function\n"); + seq_puts(m, + "# | | | " + " | |\n"); +} + +static enum print_line_t ksym_trace_output(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct ksym_trace_entry *field; + char str[KSYM_SYMBOL_LEN]; + int ret; + + if (entry->type != TRACE_KSYM) + return TRACE_TYPE_UNHANDLED; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, + entry->pid, iter->cpu, (char *)field->addr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (field->type) { + case HW_BREAKPOINT_R: + ret = trace_seq_printf(s, " R "); + break; + case HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " RW "); + break; + default: + return TRACE_TYPE_PARTIAL_LINE; + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + sprint_symbol(str, field->ip); + ret = trace_seq_printf(s, "%s\n", str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +struct tracer ksym_tracer __read_mostly = +{ + .name = "ksym_tracer", + .init = ksym_trace_init, + .reset = ksym_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_ksym, +#endif + .print_header = ksym_trace_print_header, + .print_line = ksym_trace_output +}; + +__init static int init_ksym_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + ksym_filter_entry_count = 0; + + entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ksym_trace_filter' file\n"); + + return register_tracer(&ksym_tracer); +} +device_initcall(init_ksym_trace); + + +#ifdef CONFIG_PROFILE_KSYM_TRACER +static int ksym_tracer_stat_headers(struct seq_file *m) +{ + seq_puts(m, " Access Type "); + seq_puts(m, " Symbol Counter\n"); + seq_puts(m, " ----------- "); + seq_puts(m, " ------ -------\n"); + return 0; +} + +static int ksym_tracer_stat_show(struct seq_file *m, void *v) +{ + struct hlist_node *stat = v; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + + entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + + access_type = entry->attr.bp_type; + + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } + + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", "<NA>"); + seq_printf(m, " %15lu\n", entry->counter); + + return 0; +} + +static void *ksym_tracer_stat_start(struct tracer_stat *trace) +{ + return ksym_filter_head.first; +} + +static void * +ksym_tracer_stat_next(void *v, int idx) +{ + struct hlist_node *stat = v; + + return stat->next; +} + +static struct tracer_stat ksym_tracer_stats = { + .name = "ksym_tracer", + .stat_start = ksym_tracer_stat_start, + .stat_next = ksym_tracer_stat_next, + .stat_headers = ksym_tracer_stat_headers, + .stat_show = ksym_tracer_stat_show +}; + +__init static int ksym_tracer_stat_init(void) +{ + int ret; + + ret = register_stat_tracer(&ksym_tracer_stats); + if (ret) { + printk(KERN_WARNING "Warning: could not register " + "ksym tracer stats\n"); + return 1; + } + + return 0; +} +fs_initcall(ksym_tracer_stat_init); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f572f44c6e1e..b6c12c6a1bcd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); @@ -486,16 +489,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) hardirq ? 'h' : softirq ? 's' : '.')) return 0; - if (entry->lock_depth < 0) - ret = trace_seq_putc(s, '.'); + if (entry->preempt_count) + ret = trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_printf(s, "%d", entry->lock_depth); + ret = trace_seq_putc(s, '.'); + if (!ret) return 0; - if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_putc(s, '.'); + if (entry->lock_depth < 0) + return trace_seq_putc(s, '.'); + + return trace_seq_printf(s, "%d", entry->lock_depth); } static int @@ -883,7 +888,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, @@ -918,7 +923,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index d2cdbabb4ead..dc98309e839a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: case TRACE_HW_BRANCHES: + case TRACE_KSYM: return 1; } return 0; @@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace, return ret; } #endif /* CONFIG_HW_BRANCH_TRACER */ + +#ifdef CONFIG_KSYM_TRACER +static int ksym_selftest_dummy; + +int +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + ksym_selftest_dummy = 0; + /* Register the read-write tracing request */ + + ret = process_new_ksym_entry("ksym_selftest_dummy", + HW_BREAKPOINT_R | HW_BREAKPOINT_W, + (unsigned long)(&ksym_selftest_dummy)); + + if (ret < 0) { + printk(KERN_CONT "ksym_trace read-write startup test failed\n"); + goto ret_path; + } + /* Perform a read and a write operation over the dummy variable to + * trigger the tracer + */ + if (ksym_selftest_dummy == 0) + ksym_selftest_dummy++; + + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + /* read & write operations - one each is performed on the dummy variable + * triggering two entries in the trace buffer + */ + if (!ret && count != 2) { + printk(KERN_CONT "Ksym tracer startup test failed"); + ret = -1; + } + +ret_path: + return ret; +} +#endif /* CONFIG_KSYM_TRACER */ + diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb050a1..8504ac71e4e8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9fbce6c9d2e1..57501d90096a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -14,6 +14,43 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_id != ent->type) { + if (entry->enter_event->id != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_id != ent->type) { + if (entry->exit_event->id != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -103,24 +140,19 @@ extern char *__bad_type_size(void); #define SYSCALL_FIELD(type, name) \ sizeof(type) != sizeof(trace.name) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { int i; - int nr; int ret; - struct syscall_metadata *entry; + struct syscall_metadata *entry = call->data; struct syscall_trace_enter trace; int offset = offsetof(struct syscall_trace_enter, args); - nr = syscall_name_to_nr(call->data); - entry = syscall_nr_to_meta(nr); - - if (!entry) - return 0; - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr)); if (!ret) return 0; @@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, - sizeof(unsigned long)); + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" + "\tsigned:%u;\n", offset, + sizeof(unsigned long), + is_signed_type(unsigned long)); if (!ret) return 0; offset += sizeof(unsigned long); @@ -163,10 +197,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) struct syscall_trace_exit trace; ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(unsigned long, ret)); + SYSCALL_FIELD(long, ret)); if (!ret) return 0; @@ -176,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; - struct syscall_metadata *meta; + struct syscall_metadata *meta = call->data; int ret; - int nr; int i; int offset = offsetof(typeof(trace), args); - nr = syscall_name_to_nr(call->data); - meta = syscall_nr_to_meta(nr); - - if (!meta) - return 0; - ret = trace_define_common_fields(call); if (ret) return ret; + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, @@ -212,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), FILTER_OTHER); return ret; @@ -239,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, - size, 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->enter_event->id, size, 0, 0); if (!event) return; @@ -271,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, - sizeof(*entry), 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->exit_event->id, sizeof(*entry), 0, 0); if (!event) return; @@ -285,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(void *ptr) +int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -309,13 +344,11 @@ int reg_event_syscall_enter(void *ptr) return ret; } -void unreg_event_syscall_enter(void *ptr) +void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -326,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(void *ptr) +int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -350,13 +381,11 @@ int reg_event_syscall_exit(void *ptr) return ret; } -void unreg_event_syscall_exit(void *ptr) +void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -367,13 +396,44 @@ void unreg_event_syscall_exit(void *ptr) mutex_unlock(&syscall_trace_lock); } -struct trace_event event_syscall_enter = { - .trace = print_syscall_enter, -}; +int init_syscall_trace(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + return 0; +} + +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; + syscalls_metadata[i] = meta; + } -struct trace_event event_syscall_exit = { - .trace = print_syscall_exit, -}; + return 0; +} +core_initcall(init_ftrace_syscalls); #ifdef CONFIG_EVENT_PROFILE @@ -387,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; + char *trace_buf; char *raw_data; int syscall_nr; + int rctx; int size; int cpu; @@ -412,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; rec = (struct syscall_trace_enter *) raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; + rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_enter(char *name) +int prof_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) @@ -462,13 +525,11 @@ int reg_prof_syscall_enter(char *name) return ret; } -void unreg_prof_syscall_enter(char *name) +void prof_sysenter_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_enter--; @@ -484,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; + char *trace_buf; char *raw_data; + int rctx; int size; int cpu; @@ -510,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -528,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec = (struct syscall_trace_exit *)raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_id; + rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_exit(char *name) +int prof_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) @@ -561,13 +626,11 @@ int reg_prof_syscall_exit(char *name) return ret; } -void unreg_prof_syscall_exit(char *name) +void prof_sysexit_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_exit--; diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501688b9..419209893d87 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/mman.h> #include <linux/notifier.h> #include <linux/reboot.h> diff --git a/kernel/user.c b/kernel/user.c index 2c000e7132ac..46d0165ca70c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -330,9 +330,9 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - spin_unlock_irqrestore(&uidhash_lock, flags); INIT_DELAYED_WORK(&up->work, cleanup_user_struct); schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); + spin_unlock_irqrestore(&uidhash_lock, flags); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc747a7..69eae358a726 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2df93b1..67e526b6ae81 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done @@ -667,6 +685,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; + int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); @@ -674,14 +693,28 @@ int schedule_on_each_cpu(work_func_t func) return -ENOMEM; get_online_cpus(); + + /* + * When running in keventd don't schedule a work item on + * itself. Can just call directly because the work queue is + * already bound. This also is faster. + */ + if (current_is_keventd()) + orig = raw_smp_processor_id(); + for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - schedule_work_on(cpu, work); + if (cpu != orig) + schedule_work_on(cpu, work); } + if (orig >= 0) + func(per_cpu_ptr(works, orig)); + for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); free_percpu(works); return 0; |