From 9f2d1e68cf4d641def734adaccfc3823d3575e6c Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Tue, 5 Jun 2018 10:22:52 +0200 Subject: module: exclude SHN_UNDEF symbols from kallsyms api Livepatch modules are special in that we preserve their entire symbol tables in order to be able to apply relocations after module load. The unwanted side effect of this is that undefined (SHN_UNDEF) symbols of livepatch modules are accessible via the kallsyms api and this can confuse symbol resolution in livepatch (klp_find_object_symbol()) and cause subtle bugs in livepatch. Have the module kallsyms api skip over SHN_UNDEF symbols. These symbols are usually not available for normal modules anyway as we cut down their symbol tables to just the core (non-undefined) symbols, so this should really just affect livepatch modules. Note that this patch doesn't affect the display of undefined symbols in /proc/kallsyms. Reported-by: Josh Poimboeuf Tested-by: Josh Poimboeuf Reviewed-by: Josh Poimboeuf Signed-off-by: Jessica Yu --- kernel/module.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f475f30eed8c..4a6b9c6d5f2c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4067,7 +4067,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) for (i = 0; i < kallsyms->num_symtab; i++) if (strcmp(name, symname(kallsyms, i)) == 0 && - kallsyms->symtab[i].st_info != 'U') + kallsyms->symtab[i].st_shndx != SHN_UNDEF) return kallsyms->symtab[i].st_value; return 0; } @@ -4113,6 +4113,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < kallsyms->num_symtab; i++) { + + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + continue; + ret = fn(data, symname(kallsyms, i), mod, kallsyms->symtab[i].st_value); if (ret != 0) -- cgit From 4a5f4d2f891bcff7285b5f7490ed5a7a5d516049 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Jun 2018 05:56:12 -0700 Subject: genirq: Use rcu in kstat_irqs_usr() Jeremy Dorfman identified mutex contention when multiple threads parse /proc/stat concurrently. Since commit 425a5072dcd1 ("genirq: Free irq_desc with rcu"), kstat_irqs_usr() can be switched to rcu locking, which removes this mutex contention. show_interrupts() case will be handled in a separate patch. Reported-by: Jeremy Dorfman Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Eric Dumazet Cc: Willem de Bruijn Link: https://lkml.kernel.org/r/20180618125612.155057-1-edumazet@google.com --- kernel/irq/irqdesc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index afc7f902d74a..578d0e5f1b5b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -443,6 +443,7 @@ static void free_desc(unsigned int irq) * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. + * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } @@ -928,17 +929,17 @@ unsigned int kstat_irqs(unsigned int irq) * kstat_irqs_usr - Get the statistics for an interrupt * @irq: The interrupt number * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. Contrary to kstat_irqs() this can be called from any - * preemptible context. It's protected against concurrent removal of - * an interrupt descriptor when sparse irqs are enabled. + * Returns the sum of interrupt counts on all cpus since boot for @irq. + * Contrary to kstat_irqs() this can be called from any context. + * It uses rcu since a concurrent removal of an interrupt descriptor is + * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; - irq_lock_sparse(); + rcu_read_lock(); sum = kstat_irqs(irq); - irq_unlock_sparse(); + rcu_read_unlock(); return sum; } -- cgit From 0a13ec0bbc42bddf90ab6a444df8aaa67c148b16 Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Sun, 17 Jun 2018 14:40:18 +0200 Subject: genirq: Fix editing error in a comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the comment was reflowed to a wider format, the "*" snuck in. Fixes: ae88a23b32fa ("irq: refactor and clean up the free_irq() code flow") Signed-off-by: Jonathan Neuschäfer Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180617124018.25539-1-j.neuschaefer@gmx.net --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index daeabd791d58..591cfe901162 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1638,7 +1638,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a - * 'real' IRQ doesn't run in * parallel with our fake. ) + * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); -- cgit From 29c1372d6a9b872acf479ba2744e4e7f043981c0 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnáček Date: Wed, 30 May 2018 10:45:24 +0200 Subject: audit: allow other filter list types for AUDIT_EXE This patch removes the restriction of the AUDIT_EXE field to only SYSCALL filter and teaches audit_filter to recognize this field. This makes it possible to write rule lists such as: auditctl -a exit,always [some general rule] # Filter out events with executable name /bin/exe1 or /bin/exe2: auditctl -a exclude,always -F exe=/bin/exe1 auditctl -a exclude,always -F exe=/bin/exe2 See: https://github.com/linux-audit/audit-kernel/issues/54 Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eaa320148d97..6db9847ca031 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -428,8 +428,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXE: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; - if (entry->rule.listnr != AUDIT_FILTER_EXIT) - return -EINVAL; break; } return 0; @@ -1360,6 +1358,11 @@ int audit_filter(int msgtype, unsigned int listtype) f->type, f->op, f->lsm_rule, NULL); } break; + case AUDIT_EXE: + result = audit_exe_compare(current, e->rule.exe); + if (f->op == Audit_not_equal) + result = !result; + break; default: goto unlock_and_return; } -- cgit From 9b8753fffe7b3642688135f28aa8a0a0f45fd9ab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 31 May 2018 16:27:24 -0400 Subject: audit: tie SECCOMP records to syscall Since seccomp events are triggered by user activity, tie the SECCOMP record to the syscall record to collect all records from the same event. See: https://github.com/linux-audit/audit-kernel/issues/87 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ceb1c4596c51..fefb9e215cd0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2485,7 +2485,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP); if (unlikely(!ab)) return; audit_log_task(ab); -- cgit From d87de4a878e110d0061fb22726d37a54a281285d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 31 May 2018 16:28:12 -0400 Subject: audit: tie ANOM_ABEND records to syscall Since core dump events are triggered by user activity, tie the ANOM_ABEND record to the syscall record to collect all records from the same event. See: https://github.com/linux-audit/audit-kernel/issues/88 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fefb9e215cd0..5f0bd5ece578 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2461,7 +2461,7 @@ void audit_core_dumps(long signr) if (signr == SIGQUIT) /* don't care for those */ return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND); if (unlikely(!ab)) return; audit_log_task(ab); -- cgit From af85d1772e31fed34165a1b3decef340cf4080c0 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnáček Date: Tue, 5 Jun 2018 11:00:10 +0200 Subject: audit: Fix extended comparison of GID/EGID The audit_filter_rules() function in auditsc.c used the in_[e]group_p() functions to check GID/EGID match, but these functions use the current task's credentials, while the comparison should use the credentials of the task given to audit_filter_rules() as a parameter (tsk). Note that we can use group_search(cred->group_info, ...) as a replacement for both in_group_p and in_egroup_p as these functions only compare the parameter to cred->fsgid/egid and then call group_search. In fact, the usage of in_group_p was even more incorrect: it compares to cred->fsgid (which is usually equal to cred->egid) and not cred->gid. GitHub issue: https://github.com/linux-audit/audit-kernel/issues/82 Fixes: 37eebe39c973 ("audit: improve GID/EGID comparation logic") Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- kernel/auditsc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5f0bd5ece578..d762e0b8160e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -494,20 +494,20 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_gid_comparator(cred->gid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) - result = in_group_p(f->gid); + result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) - result = !in_group_p(f->gid); + result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) - result = in_egroup_p(f->gid); + result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) - result = !in_egroup_p(f->gid); + result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_SGID: -- cgit From d904ac0320d3c4ff4e9d80e4294ca5dde803696f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 5 Jun 2018 11:45:07 -0400 Subject: audit: rename FILTER_TYPE to FILTER_EXCLUDE The AUDIT_FILTER_TYPE name is vague and misleading due to not describing where or when the filter is applied and obsolete due to its available filter fields having been expanded. Userspace has already renamed it from AUDIT_FILTER_TYPE to AUDIT_FILTER_EXCLUDE without checking if it already exists. The userspace maintainer assures that as long as it is set to the same value it will not be a problem since the userspace code does not treat compiler warnings as errors. If this policy changes then checks if it already exists can be added at the same time. See: https://github.com/linux-audit/audit-kernel/issues/89 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 3 ++- kernel/audit.c | 2 +- kernel/auditfilter.c | 10 +++++----- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c35aee9ad4a6..4e3eaba84175 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -157,7 +157,8 @@ #define AUDIT_FILTER_ENTRY 0x02 /* Apply rule at syscall entry */ #define AUDIT_FILTER_WATCH 0x03 /* Apply rule to file system watches */ #define AUDIT_FILTER_EXIT 0x04 /* Apply rule at syscall exit */ -#define AUDIT_FILTER_TYPE 0x05 /* Apply rule at audit_log_start */ +#define AUDIT_FILTER_EXCLUDE 0x05 /* Apply rule before record creation */ +#define AUDIT_FILTER_TYPE AUDIT_FILTER_EXCLUDE /* obsolete misleading naming */ #define AUDIT_FILTER_FS 0x06 /* Apply rule at __audit_inode_child */ #define AUDIT_NR_FILTERS 7 diff --git a/kernel/audit.c b/kernel/audit.c index e7478cb58079..5c0a1d7b0c7b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1754,7 +1754,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (audit_initialized != AUDIT_INITIALIZED) return NULL; - if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) + if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6db9847ca031..bf309f2592c4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -264,7 +264,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * case AUDIT_FILTER_TASK: #endif case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: ; } @@ -337,7 +337,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { switch(f->type) { case AUDIT_MSGTYPE: - if (entry->rule.listnr != AUDIT_FILTER_TYPE && + if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; @@ -929,7 +929,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* If any of these, don't count towards total */ switch(entry->rule.listnr) { case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } @@ -1011,7 +1011,7 @@ int audit_del_rule(struct audit_entry *entry) /* If any of these, don't count towards total */ switch(entry->rule.listnr) { case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } @@ -1372,7 +1372,7 @@ int audit_filter(int msgtype, unsigned int listtype) break; } if (result > 0) { - if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) + if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE) ret = 0; break; } -- cgit From f7859590d97614815b35a755c8213dfb8f2766bd Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 5 Jun 2018 19:20:39 -0400 Subject: audit: eliminate audit_enabled magic number comparison Remove comparison of audit_enabled to magic numbers outside of audit. Related: https://github.com/linux-audit/audit-kernel/issues/86 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- drivers/tty/tty_audit.c | 2 +- include/linux/audit.h | 5 ++++- include/net/xfrm.h | 2 +- kernel/audit.c | 3 --- net/netfilter/xt_AUDIT.c | 2 +- net/netlabel/netlabel_user.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index e30aa6bf9ff9..50f567b6a66e 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -92,7 +92,7 @@ static void tty_audit_buf_push(struct tty_audit_buf *buf) { if (buf->valid == 0) return; - if (audit_enabled == 0) { + if (audit_enabled == AUDIT_OFF) { buf->valid = 0; return; } diff --git a/include/linux/audit.h b/include/linux/audit.h index 69c78477590b..9334fbef7bae 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -117,6 +117,9 @@ struct filename; extern void audit_log_session_info(struct audit_buffer *ab); +#define AUDIT_OFF 0 +#define AUDIT_ON 1 +#define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ @@ -202,7 +205,7 @@ static inline int audit_log_task_context(struct audit_buffer *ab) static inline void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { } -#define audit_enabled 0 +#define audit_enabled AUDIT_OFF #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 557122846e0e..f7f297727ed8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -735,7 +735,7 @@ static inline struct audit_buffer *xfrm_audit_start(const char *op) { struct audit_buffer *audit_buf = NULL; - if (audit_enabled == 0) + if (audit_enabled == AUDIT_OFF) return NULL; audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_MAC_IPSEC_EVENT); diff --git a/kernel/audit.c b/kernel/audit.c index 5c0a1d7b0c7b..0f3222e4edde 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -83,9 +83,6 @@ #define AUDIT_INITIALIZED 1 static int audit_initialized; -#define AUDIT_OFF 0 -#define AUDIT_ON 1 -#define AUDIT_LOCKED 2 u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index f368ee6741db..af883f1b64f9 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -72,7 +72,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) struct audit_buffer *ab; int fam = -1; - if (audit_enabled == 0) + if (audit_enabled == AUDIT_OFF) goto errout; ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (ab == NULL) diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 2f328af91a52..4676f5bb16ae 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -101,7 +101,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, char *secctx; u32 secctx_len; - if (audit_enabled == 0) + if (audit_enabled == AUDIT_OFF) return NULL; audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, type); -- cgit From 6519750210d9d3330e85d12e0128652edde448d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jun 2018 10:34:50 +0200 Subject: sched/swait: Remove __prepare_to_swait There is no public user of this API, remove it. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: bigeasy@linutronix.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180612083909.157076812@infradead.org --- include/linux/swait.h | 1 - kernel/sched/swait.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/swait.h b/include/linux/swait.h index bf8cb0dee23c..d6a5e949e4ca 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -161,7 +161,6 @@ extern void swake_up(struct swait_queue_head *q); extern void swake_up_all(struct swait_queue_head *q); extern void swake_up_locked(struct swait_queue_head *q); -extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b6fb2c3b3ff7..e68bb1398b05 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -69,7 +69,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) -- cgit From 0abf17bc7790dd0467ed0e38522242f23c5da1c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jun 2018 10:34:51 +0200 Subject: sched/swait: Switch to full exclusive mode Linus noted that swait basically implements exclusive mode -- because swake_up() only wakes a single waiter. And because of that it should take care to properly deal with the interruptible case. In short, the problem is that swake_up() can race with a signal. In this this case it is possible the swake_up() 'wakes' the waiter that is already on the way out because it just got a signal and the wakeup gets lost. The normal wait code is very careful and avoids this situation, make sure we do too. Copy the exact exclusive semantics from wait. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: bigeasy@linutronix.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180612083909.209762413@infradead.org --- include/linux/swait.h | 11 ++++++----- kernel/sched/swait.c | 22 +++++++++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/swait.h b/include/linux/swait.h index d6a5e949e4ca..dd032061112d 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -38,8 +38,8 @@ * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right * sleeper state. * - * - the exclusive mode; because this requires preserving the list order - * and this is hard. + * - the !exclusive mode; because that leads to O(n) wakeups, everything is + * exclusive. * * - custom wake callback functions; because you cannot give any guarantees * about random code. This also allows swait to be used in RT, such that @@ -167,9 +167,10 @@ extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queu extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); -/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ +/* as per ___wait_event() but for swait, therefore "exclusive == 1" */ #define ___swait_event(wq, condition, state, ret, cmd) \ ({ \ + __label__ __out; \ struct swait_queue __wait; \ long __ret = ret; \ \ @@ -182,13 +183,13 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ - break; \ + goto __out; \ } \ \ cmd; \ } \ finish_swait(&wq, &__wait); \ - __ret; \ +__out: __ret; \ }) #define __swait_event(wq, condition) \ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e68bb1398b05..66890de93ee5 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -73,7 +73,7 @@ static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w { wait->task = current; if (list_empty(&wait->task_list)) - list_add(&wait->task_list, &q->task_list); + list_add_tail(&wait->task_list, &q->task_list); } void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) @@ -89,12 +89,24 @@ EXPORT_SYMBOL(prepare_to_swait); long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) { - if (signal_pending_state(state, current)) - return -ERESTARTSYS; + unsigned long flags; + long ret = 0; - prepare_to_swait(q, wait, state); + raw_spin_lock_irqsave(&q->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * See prepare_to_wait_event(). TL;DR, subsequent swake_up() + * must not see us. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + __prepare_to_swait(q, wait); + set_current_state(state); + } + raw_spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_swait_event); -- cgit From b3dae109fa89d67334bf3349babab3ad9b6f233f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jun 2018 10:34:52 +0200 Subject: sched/swait: Rename to exclusive Since swait basically implemented exclusive waits only, make sure the API reflects that. $ git grep -l -e "\" -e "\" | while read file; do sed -i -e 's/\/&_one/g' -e 's/\/&_exclusive/g' $file; done With a few manual touch-ups. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: bigeasy@linutronix.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180612083909.261946548@infradead.org --- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 6 +++--- arch/s390/kvm/interrupt.c | 2 +- arch/x86/kernel/kvm.c | 4 ++-- arch/x86/kvm/lapic.c | 2 +- include/linux/swait.h | 24 ++++++++++++------------ kernel/power/suspend.c | 4 ++-- kernel/rcu/srcutiny.c | 4 ++-- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree_exp.h | 4 ++-- kernel/rcu/tree_plugin.h | 12 ++++++------ kernel/sched/swait.c | 10 +++++----- virt/kvm/arm/arm.c | 4 ++-- virt/kvm/arm/psci.c | 2 +- virt/kvm/async_pf.c | 2 +- virt/kvm/kvm_main.c | 4 ++-- 16 files changed, 48 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7cd76f93a438..f7ea8e21656b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; if (swq_has_sleeper(&dvcpu->wq)) - swake_up(&dvcpu->wq); + swake_up_one(&dvcpu->wq); return 0; } @@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data) vcpu->arch.wait = 0; if (swq_has_sleeper(&vcpu->wq)) - swake_up(&vcpu->wq); + swake_up_one(&vcpu->wq); } /* low level hrtimer wake routine */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index de686b340f4a..ee4a8854985e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) wqp = kvm_arch_vcpu_wq(vcpu); if (swq_has_sleeper(wqp)) { - swake_up(wqp); + swake_up_one(wqp); ++vcpu->stat.halt_wakeup; } @@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } } - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE); if (kvmppc_vcore_check_block(vc)) { finish_swait(&vc->wq, &wait); @@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - swake_up(&vc->wq); + swake_up_one(&vc->wq); } } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index daa09f89ca2d..fcb55b02990e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) * yield-candidate. */ vcpu->preempted = true; - swake_up(&vcpu->wq); + swake_up_one(&vcpu->wq); vcpu->stat.halt_wakeup++; } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..a37bda38d205 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) for (;;) { if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) if (n->halted) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) - swake_up(&n->wq); + swake_up_one(&n->wq); } static void apf_task_wake_all(void) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5cd8465d44f..d536d457517b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) * using swait_active() is safe. */ if (swait_active(q)) - swake_up(q); + swake_up_one(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; diff --git a/include/linux/swait.h b/include/linux/swait.h index dd032061112d..73e06e9986d4 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -16,7 +16,7 @@ * wait-queues, but the semantics are actually completely different, and * every single user we have ever had has been buggy (or pointless). * - * A "swake_up()" only wakes up _one_ waiter, which is not at all what + * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what * "wake_up()" does, and has led to problems. In other cases, it has * been fine, because there's only ever one waiter (kvm), but in that * case gthe whole "simple" wait-queue is just pointless to begin with, @@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name * CPU0 - waker CPU1 - waiter * * for (;;) { - * @cond = true; prepare_to_swait(&wq_head, &wait, state); + * @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (swait_active(wq_head)) if (@cond) * wake_up(wq_head); break; @@ -157,11 +157,11 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq) return swait_active(wq); } -extern void swake_up(struct swait_queue_head *q); +extern void swake_up_one(struct swait_queue_head *q); extern void swake_up_all(struct swait_queue_head *q); extern void swake_up_locked(struct swait_queue_head *q); -extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); +extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state); extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -196,7 +196,7 @@ __out: __ret; \ (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ schedule()) -#define swait_event(wq, condition) \ +#define swait_event_exclusive(wq, condition) \ do { \ if (condition) \ break; \ @@ -208,7 +208,7 @@ do { \ TASK_UNINTERRUPTIBLE, timeout, \ __ret = schedule_timeout(__ret)) -#define swait_event_timeout(wq, condition, timeout) \ +#define swait_event_timeout_exclusive(wq, condition, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ @@ -220,7 +220,7 @@ do { \ ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ schedule()) -#define swait_event_interruptible(wq, condition) \ +#define swait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ if (!(condition)) \ @@ -233,7 +233,7 @@ do { \ TASK_INTERRUPTIBLE, timeout, \ __ret = schedule_timeout(__ret)) -#define swait_event_interruptible_timeout(wq, condition, timeout) \ +#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ @@ -246,7 +246,7 @@ do { \ (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) /** - * swait_event_idle - wait without system load contribution + * swait_event_idle_exclusive - wait without system load contribution * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @@ -257,7 +257,7 @@ do { \ * condition and doesn't want to contribute to system load. Signals are * ignored. */ -#define swait_event_idle(wq, condition) \ +#define swait_event_idle_exclusive(wq, condition) \ do { \ if (condition) \ break; \ @@ -270,7 +270,7 @@ do { \ __ret = schedule_timeout(__ret)) /** - * swait_event_idle_timeout - wait up to timeout without load contribution + * swait_event_idle_timeout_exclusive - wait up to timeout without load contribution * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout at which we'll give up in jiffies @@ -288,7 +288,7 @@ do { \ * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ -#define swait_event_idle_timeout(wq, condition, timeout) \ +#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 87331565e505..70178f6ffdc4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -92,7 +92,7 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - swait_event(s2idle_wait_head, + swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); @@ -160,7 +160,7 @@ void s2idle_wake(void) raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - swake_up(&s2idle_wait_head); + swake_up_one(&s2idle_wait_head); } raw_spin_unlock_irqrestore(&s2idle_lock, flags); } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 622792abe41a..04fc2ed71af8 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up(&sp->srcu_wq); + swake_up_one(&sp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp) idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aa7cade1b9f3..91f888d3b23a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1727,7 +1727,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - swake_up(&rsp->gp_wq); + swake_up_one(&rsp->gp_wq); } /* @@ -2002,7 +2002,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for swait_event_idle() wakeup at force-quiescent-state + * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) @@ -2144,7 +2144,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ @@ -2176,7 +2176,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d40708e8c5d6..d428cc1064c8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); + swake_up_one(&rsp->expedited_wq); } break; } @@ -518,7 +518,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = swait_event_timeout( + ret = swait_event_timeout_exclusive( rsp->expedited_wq, sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fd12039e512..ad53d133f709 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1854,8 +1854,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ - swake_up(&rdp_leader->nocb_wq); + smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ + swake_up_one(&rdp_leader->nocb_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } @@ -2082,7 +2082,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - swait_event_interruptible( + swait_event_interruptible_exclusive( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2111,7 +2111,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible_exclusive(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = true; @@ -2176,7 +2176,7 @@ wait_again: raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* List was empty, so wake up the follower. */ - swake_up(&rdp->nocb_wq); + swake_up_one(&rdp->nocb_wq); } } @@ -2193,7 +2193,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible_exclusive(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 66890de93ee5..66b59ac77c22 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); -void swake_up(struct swait_queue_head *q) +void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q) swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(swake_up); +EXPORT_SYMBOL(swake_up_one); /* * Does not allow usage from IRQ disabled, since we must be able to @@ -76,7 +76,7 @@ static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w list_add_tail(&wait->task_list, &q->task_list); } -void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state) { unsigned long flags; @@ -85,7 +85,7 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int set_current_state(state); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(prepare_to_swait); +EXPORT_SYMBOL(prepare_to_swait_exclusive); long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) { @@ -95,7 +95,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait raw_spin_lock_irqsave(&q->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* - * See prepare_to_wait_event(). TL;DR, subsequent swake_up() + * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() * must not see us. */ list_del_init(&wait->task_list); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 04e554cae3a2..108250e4d376 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.pause = false; - swake_up(kvm_arch_vcpu_wq(vcpu)); + swake_up_one(kvm_arch_vcpu_wq(vcpu)); } } @@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu) { struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); - swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && + swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && (!vcpu->arch.pause))); if (vcpu->arch.power_off || vcpu->arch.pause) { diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index c95ab4c5a475..9b73d3ad918a 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); - swake_up(wq); + swake_up_one(wq); return PSCI_RET_SUCCESS; } diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 57bcb27dcf30..23c2519c5b32 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); if (swq_has_sleeper(&vcpu->wq)) - swake_up(&vcpu->wq); + swake_up_one(&vcpu->wq); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ada21f47f22b..940a4aed5b2d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2167,7 +2167,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) kvm_arch_vcpu_blocking(vcpu); for (;;) { - prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; @@ -2209,7 +2209,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) wqp = kvm_arch_vcpu_wq(vcpu); if (swq_has_sleeper(wqp)) { - swake_up(wqp); + swake_up_one(wqp); ++vcpu->stat.halt_wakeup; return true; } -- cgit From 5a6cf77f5e35e7af35d36a1e7dc21a42f6412e4f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 20 Jun 2018 01:05:07 +0900 Subject: kprobes: Remove jprobe API implementation Remove functionally empty jprobe API implementations and test cases. Signed-off-by: Masami Hiramatsu Acked-by: Thomas Gleixner Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/lkml/152942430705.15209.2307050500995264322.stgit@devbox Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 3 -- kernel/kprobes.c | 78 +--------------------------------------- kernel/test_kprobes.c | 94 ------------------------------------------------- lib/Kconfig.debug | 2 +- 4 files changed, 2 insertions(+), 175 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9440a2fc8893..b520baa65682 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -389,9 +389,6 @@ int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); int register_kprobes(struct kprobe **kps, int num); void unregister_kprobes(struct kprobe **kps, int num); -int setjmp_pre_handler(struct kprobe *, struct pt_regs *); -int longjmp_break_handler(struct kprobe *, struct pt_regs *); -void jprobe_return(void); unsigned long arch_deref_entry_point(void *); int register_kretprobe(struct kretprobe *rp); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ea619021d901..69de130595f7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1272,7 +1272,7 @@ NOKPROBE_SYMBOL(cleanup_rp_inst); /* * Add the new probe to ap->list. Fail if this is the -* second jprobe at the address - two jprobes can't coexist +* second break_handler at the address */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { @@ -1812,77 +1812,6 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -#if 0 -int register_jprobes(struct jprobe **jps, int num) -{ - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - - for (i = 0; i < num; i++) { - ret = register_jprobe(jps[i]); - - if (ret < 0) { - if (i > 0) - unregister_jprobes(jps, i); - break; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(register_jprobes); - -int register_jprobe(struct jprobe *jp) -{ - unsigned long addr, offset; - struct kprobe *kp = &jp->kp; - - /* - * Verify probepoint as well as the jprobe handler are - * valid function entry points. - */ - addr = arch_deref_entry_point(jp->entry); - - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && - kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { - kp->pre_handler = setjmp_pre_handler; - kp->break_handler = longjmp_break_handler; - return register_kprobe(kp); - } - - return -EINVAL; -} -EXPORT_SYMBOL_GPL(register_jprobe); - -void unregister_jprobe(struct jprobe *jp) -{ - unregister_jprobes(&jp, 1); -} -EXPORT_SYMBOL_GPL(unregister_jprobe); - -void unregister_jprobes(struct jprobe **jps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&jps[i]->kp) < 0) - jps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (jps[i]->kp.addr) - __unregister_kprobe_bottom(&jps[i]->kp); - } -} -EXPORT_SYMBOL_GPL(unregister_jprobes); -#endif - #ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe @@ -2329,8 +2258,6 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; - else if (p->pre_handler == setjmp_pre_handler) - kprobe_type = "j"; else kprobe_type = "k"; @@ -2637,6 +2564,3 @@ late_initcall(debugfs_kprobe_init); #endif /* CONFIG_DEBUG_FS */ module_init(init_kprobes); - -/* defined in arch/.../kernel/kprobes.c */ -EXPORT_SYMBOL_GPL(jprobe_return); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index dd53e354f630..7bca480151b0 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -162,90 +162,6 @@ static int test_kprobes(void) } -#if 0 -static u32 jph_val; - -static u32 j_kprobe_target(u32 value) -{ - if (preemptible()) { - handler_errors++; - pr_err("jprobe-handler is preemptible\n"); - } - if (value != rand1) { - handler_errors++; - pr_err("incorrect value in jprobe handler\n"); - } - - jph_val = rand1; - jprobe_return(); - return 0; -} - -static struct jprobe jp = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target" -}; - -static int test_jprobe(void) -{ - int ret; - - ret = register_jprobe(&jp); - if (ret < 0) { - pr_err("register_jprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); - unregister_jprobe(&jp); - if (jph_val == 0) { - pr_err("jprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - -static struct jprobe jp2 = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target2" -}; - -static int test_jprobes(void) -{ - int ret; - struct jprobe *jps[2] = {&jp, &jp2}; - - /* addr and flags should be cleard for reusing kprobe. */ - jp.kp.addr = NULL; - jp.kp.flags = 0; - ret = register_jprobes(jps, 2); - if (ret < 0) { - pr_err("register_jprobes returned %d\n", ret); - return ret; - } - - jph_val = 0; - ret = target(rand1); - if (jph_val == 0) { - pr_err("jprobe handler not called\n"); - handler_errors++; - } - - jph_val = 0; - ret = target2(rand1); - if (jph_val == 0) { - pr_err("jprobe handler2 not called\n"); - handler_errors++; - } - unregister_jprobes(jps, 2); - - return 0; -} -#else -#define test_jprobe() (0) -#define test_jprobes() (0) -#endif #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -383,16 +299,6 @@ int init_test_probes(void) if (ret < 0) errors++; - num_tests++; - ret = test_jprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobes(); - if (ret < 0) - errors++; - #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8838d1158d19..0b066b3c9284 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1718,7 +1718,7 @@ config KPROBES_SANITY_TEST default n help This option provides for testing basic kprobes functionality on - boot. A sample kprobe, jprobe and kretprobe are inserted and + boot. Samples of kprobe and kretprobe are inserted and verified for functionality. Say N if you are unsure. -- cgit From 059053a275b56eff1db11605c68988a6e9818561 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 20 Jun 2018 01:10:27 +0900 Subject: kprobes: Don't check the ->break_handler() in generic kprobes code Don't check the ->break_handler() from the core kprobes code, because it was only used by jprobes which got removed. ( In followup patches we'll remove the remaining calls in low level arch handlers as well and remove the callback altogether. ) Signed-off-by: Masami Hiramatsu Acked-by: Thomas Gleixner Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/lkml/152942462686.15209.6324404940493598980.stgit@devbox Signed-off-by: Ingo Molnar --- Documentation/kprobes.txt | 2 +- kernel/kprobes.c | 39 +++++---------------------------------- 2 files changed, 6 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 33dd6a81fb64..ab2c7307f123 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -262,7 +262,7 @@ is optimized, that modification is ignored. Thus, if you want to tweak the kernel's execution path, you need to suppress optimization, using one of the following techniques: -- Specify an empty function for the kprobe's post_handler or break_handler. +- Specify an empty function for the kprobe's post_handler. or diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 69de130595f7..536ab451e96d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -627,8 +627,8 @@ static void optimize_kprobe(struct kprobe *p) (kprobe_disabled(p) || kprobes_all_disarmed)) return; - /* Both of break_handler and post_handler are not supported. */ - if (p->break_handler || p->post_handler) + /* kprobes with post_handler can not be optimized */ + if (p->post_handler) return; op = container_of(p, struct optimized_kprobe, kp); @@ -1116,20 +1116,6 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(aggr_fault_handler); -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *cur = __this_cpu_read(kprobe_instance); - int ret = 0; - - if (cur && cur->break_handler) { - if (cur->break_handler(cur, regs)) - ret = 1; - } - reset_kprobe_instance(); - return ret; -} -NOKPROBE_SYMBOL(aggr_break_handler); - /* Walks the list and increments nmissed count for multiprobe case */ void kprobes_inc_nmissed_count(struct kprobe *p) { @@ -1270,24 +1256,15 @@ static void cleanup_rp_inst(struct kretprobe *rp) } NOKPROBE_SYMBOL(cleanup_rp_inst); -/* -* Add the new probe to ap->list. Fail if this is the -* second break_handler at the address -*/ +/* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); - if (p->break_handler || p->post_handler) + if (p->post_handler) unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ - if (p->break_handler) { - if (ap->break_handler) - return -EEXIST; - list_add_tail_rcu(&p->list, &ap->list); - ap->break_handler = aggr_break_handler; - } else - list_add_rcu(&p->list, &ap->list); + list_add_rcu(&p->list, &ap->list); if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; @@ -1310,8 +1287,6 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) /* We don't care the kprobe which has gone. */ if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); INIT_HLIST_NODE(&ap->hlist); @@ -1706,8 +1681,6 @@ static int __unregister_kprobe_top(struct kprobe *p) goto disarmed; else { /* If disabling probe has special handlers, update aggrprobe */ - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = NULL; if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) @@ -1911,7 +1884,6 @@ int register_kretprobe(struct kretprobe *rp) rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; rp->kp.fault_handler = NULL; - rp->kp.break_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { @@ -2034,7 +2006,6 @@ static void kill_kprobe(struct kprobe *p) list_for_each_entry_rcu(kp, &p->list, list) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; - p->break_handler = NULL; kill_optimized_kprobe(p); } /* -- cgit From cce188bd58cfbd603b904dbce75f34de2eff959a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 20 Jun 2018 01:15:45 +0900 Subject: bpf/error-inject/kprobes: Clear current_kprobe and enable preempt in kprobe Clear current_kprobe and enable preemption in kprobe even if pre_handler returns !0. This simplifies function override using kprobes. Jprobe used to require to keep the preemption disabled and keep current_kprobe until it returned to original function entry. For this reason kprobe_int3_handler() and similar arch dependent kprobe handers checks pre_handler result and exit without enabling preemption if the result is !0. After removing the jprobe, Kprobes does not need to keep preempt disabled even if user handler returns !0 anymore. But since the function override handler in error-inject and bpf is also returns !0 if it overrides a function, to balancing the preempt count, it enables preemption and reset current kprobe by itself. That is a bad design that is very buggy. This fixes such unbalanced preempt-count and current_kprobes setting in kprobes, bpf and error-inject. Note: for powerpc and x86, this removes all preempt_disable from kprobe_ftrace_handler because ftrace callbacks are called under preempt disabled. Signed-off-by: Masami Hiramatsu Acked-by: Thomas Gleixner Acked-by: Naveen N. Rao Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: David S. Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: James Hogan Cc: Josef Bacik Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Steven Rostedt Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-ia64@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Link: https://lore.kernel.org/lkml/152942494574.15209.12323837825873032258.stgit@devbox Signed-off-by: Ingo Molnar --- arch/arc/kernel/kprobes.c | 5 +++-- arch/arm/probes/kprobes/core.c | 10 +++++----- arch/arm64/kernel/probes/kprobes.c | 10 +++++----- arch/ia64/kernel/kprobes.c | 13 ++++--------- arch/mips/kernel/kprobes.c | 4 ++-- arch/powerpc/kernel/kprobes-ftrace.c | 19 ++++++------------- arch/powerpc/kernel/kprobes.c | 7 +++++-- arch/s390/kernel/kprobes.c | 7 ++++--- arch/sh/kernel/kprobes.c | 7 ++++--- arch/sparc/kernel/kprobes.c | 7 ++++--- arch/x86/kernel/kprobes/core.c | 4 ++++ arch/x86/kernel/kprobes/ftrace.c | 9 +++------ kernel/fail_function.c | 3 --- kernel/trace/trace_kprobe.c | 11 +++-------- 14 files changed, 52 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/kprobes.c b/arch/arc/kernel/kprobes.c index 465365696c91..df35d4c0b0b8 100644 --- a/arch/arc/kernel/kprobes.c +++ b/arch/arc/kernel/kprobes.c @@ -231,6 +231,9 @@ int __kprobes arc_kprobe_handler(unsigned long addr, struct pt_regs *regs) if (!p->pre_handler || !p->pre_handler(p, regs)) { setup_singlestep(p, regs); kcb->kprobe_status = KPROBE_HIT_SS; + } else { + reset_current_kprobe(); + preempt_enable_no_resched(); } return 1; @@ -442,9 +445,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, kretprobe_assert(ri, orig_ret_address, trampoline_address); regs->ret = orig_ret_address; - reset_current_kprobe(); kretprobe_hash_unlock(current, &flags); - preempt_enable_no_resched(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 3192350f389d..8d37601fdb20 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -300,10 +300,10 @@ void __kprobes kprobe_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we - * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry, - * so get out doing nothing more here. + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it will + * modify the execution path and no need to single + * stepping. Let's just reset current kprobe and exit. */ if (!p->pre_handler || !p->pre_handler(p, regs)) { kcb->kprobe_status = KPROBE_HIT_SS; @@ -312,8 +312,8 @@ void __kprobes kprobe_handler(struct pt_regs *regs) kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - reset_current_kprobe(); } + reset_current_kprobe(); } } else { /* diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 076c3c0775a6..5daf3d721cb7 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -395,9 +395,9 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry, - * so get out doing nothing more here. + * pre-handler and it returned non-zero, it will + * modify the execution path and no need to single + * stepping. Let's just reset current kprobe and exit. * * pre_handler can hit a breakpoint and can step thru * before return, keep PSTATE D-flag enabled until @@ -405,8 +405,8 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) */ if (!p->pre_handler || !p->pre_handler(p, regs)) { setup_singlestep(p, regs, kcb, 0); - return; - } + } else + reset_current_kprobe(); } } /* diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 74c8524e6309..aa41bd5cf9b7 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -478,12 +478,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) */ break; } - kretprobe_assert(ri, orig_ret_address, trampoline_address); - reset_current_kprobe(); kretprobe_hash_unlock(current, &flags); - preempt_enable_no_resched(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); @@ -851,13 +848,11 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) set_current_kprobe(p, kcb); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (p->pre_handler && p->pre_handler(p, regs)) - /* - * Our pre-handler is specifically requesting that we just - * do a return. This is used for both the jprobe pre-handler - * and the kretprobe trampoline - */ + if (p->pre_handler && p->pre_handler(p, regs)) { + reset_current_kprobe(); + preempt_enable_no_resched(); return 1; + } #if !defined(CONFIG_PREEMPT) if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) { diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index 7fd277bc59b9..54cd675c5d1d 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -358,6 +358,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) if (p->pre_handler && p->pre_handler(p, regs)) { /* handler has already set things up, so skip ss setup */ + reset_current_kprobe(); + preempt_enable_no_resched(); return 1; } @@ -543,9 +545,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, kretprobe_assert(ri, orig_ret_address, trampoline_address); instruction_pointer(regs) = orig_ret_address; - reset_current_kprobe(); kretprobe_hash_unlock(current, &flags); - preempt_enable_no_resched(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 070d1d862444..e4a49c051325 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -32,11 +32,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct kprobe *p; struct kprobe_ctlblk *kcb; - preempt_disable(); - p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) - goto end; + return; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -60,18 +58,13 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - __this_cpu_write(current_kprobe, NULL); - } else { - /* - * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. In this case, we should not - * re-enable preemption. - */ - return; } + /* + * If pre_handler returns !0, it changes regs->nip. We have to + * skip emulating post_handler. + */ + __this_cpu_write(current_kprobe, NULL); } -end: - preempt_enable_no_resched(); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index f06747e2e70d..5c60bb0f927f 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -358,9 +358,12 @@ int kprobe_handler(struct pt_regs *regs) kcb->kprobe_status = KPROBE_HIT_ACTIVE; set_current_kprobe(p, regs, kcb); - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ + if (p->pre_handler && p->pre_handler(p, regs)) { + /* handler changed execution path, so skip ss setup */ + reset_current_kprobe(); + preempt_enable_no_resched(); return 1; + } if (p->ainsn.boostable >= 0) { ret = try_to_emulate(p, regs); diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 3e34018960b5..7c0a095e9c5f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -326,8 +326,11 @@ static int kprobe_handler(struct pt_regs *regs) */ push_kprobe(kcb, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (p->pre_handler && p->pre_handler(p, regs)) + if (p->pre_handler && p->pre_handler(p, regs)) { + pop_kprobe(kcb); + preempt_enable_no_resched(); return 1; + } kcb->kprobe_status = KPROBE_HIT_SS; } enable_singlestep(kcb, regs, (unsigned long) p->ainsn.insn); @@ -431,9 +434,7 @@ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) regs->psw.addr = orig_ret_address; - pop_kprobe(get_kprobe_ctlblk()); kretprobe_hash_unlock(current, &flags); - preempt_enable_no_resched(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 4fafe0cd12c6..241e903dd3ee 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -272,9 +272,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) set_current_kprobe(p, regs, kcb); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (p->pre_handler && p->pre_handler(p, regs)) + if (p->pre_handler && p->pre_handler(p, regs)) { /* handler has already set things up, so skip ss setup */ + reset_current_kprobe(); + preempt_enable_no_resched(); return 1; + } prepare_singlestep(p, regs); kcb->kprobe_status = KPROBE_HIT_SS; @@ -352,8 +355,6 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) regs->pc = orig_ret_address; kretprobe_hash_unlock(current, &flags); - preempt_enable_no_resched(); - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index c684c96ef2e9..dfbca2470536 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -175,8 +175,11 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) set_current_kprobe(p, regs, kcb); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (p->pre_handler && p->pre_handler(p, regs)) + if (p->pre_handler && p->pre_handler(p, regs)) { + reset_current_kprobe(); + preempt_enable_no_resched(); return 1; + } prepare_singlestep(p, regs, kcb); kcb->kprobe_status = KPROBE_HIT_SS; @@ -508,9 +511,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, regs->tpc = orig_ret_address; regs->tnpc = orig_ret_address + 4; - reset_current_kprobe(); kretprobe_hash_unlock(current, &flags); - preempt_enable_no_resched(); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0ac16a0d93e5..814e26b7c8a2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -694,6 +694,10 @@ int kprobe_int3_handler(struct pt_regs *regs) */ if (!p->pre_handler || !p->pre_handler(p, regs)) setup_singlestep(p, regs, kcb, 0); + else { + reset_current_kprobe(); + preempt_enable_no_resched(); + } return 1; } } else if (*addr != BREAKPOINT_INSTRUCTION) { diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 02a6dd1b6bd0..ef819e19650b 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -45,8 +45,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); - /* To emulate trap based kprobes, preempt_disable here */ - preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { @@ -60,13 +58,12 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, p->post_handler(p, regs, 0); } regs->ip = orig_ip; - __this_cpu_write(current_kprobe, NULL); - preempt_enable_no_resched(); } /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe, and keep preempt count +1. + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. */ + __this_cpu_write(current_kprobe, NULL); } } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 5349c91c2298..bc80a4e268c0 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -184,9 +184,6 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) if (should_fail(&fei_fault_attr, 1)) { regs_set_return_value(regs, attr->retval); override_function_with_return(regs); - /* Kprobe specific fixup */ - reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..7e3b944b6ac1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1217,16 +1217,11 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) /* * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the single stepping. - * The ftrace kprobe handler leaves it up to us to re-enable - * preemption here before returning if we've modified the ip. + * pt_regs, and if so return 1 so that we don't do the + * single stepping. */ - if (orig_ip != instruction_pointer(regs)) { - reset_current_kprobe(); - preempt_enable_no_resched(); + if (orig_ip != instruction_pointer(regs)) return 1; - } if (!ret) return 0; } -- cgit From ba2591a5993eabcc8e874e30f361d8ffbb10d6d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 May 2018 16:43:46 +0200 Subject: sched/smt: Update sched_smt_present at runtime The static key sched_smt_present is only updated at boot time when SMT siblings have been detected. Booting with maxcpus=1 and bringing the siblings online after boot rebuilds the scheduling domains correctly but does not update the static key, so the SMT code is not enabled. Let the key be updated in the scheduler CPU hotplug code to fix this. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/sched/core.c | 30 ++++++++++++------------------ kernel/sched/fair.c | 1 + 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..0f79bb5ffb5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5777,6 +5777,18 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; +#ifdef CONFIG_SCHED_SMT + /* + * The sched_smt_present static key needs to be evaluated on every + * hotplug event because at boot time SMT might be disabled when + * the number of booted CPUs is limited. + * + * If then later a sibling gets hotplugged, then the key would stay + * off and SMT scheduling would never be functional. + */ + if (cpumask_weight(cpu_smt_mask(cpu)) > 1) + static_branch_enable_cpuslocked(&sched_smt_present); +#endif set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -5874,22 +5886,6 @@ int sched_cpu_dying(unsigned int cpu) } #endif -#ifdef CONFIG_SCHED_SMT -DEFINE_STATIC_KEY_FALSE(sched_smt_present); - -static void sched_init_smt(void) -{ - /* - * We've enumerated all CPUs and will assume that if any CPU - * has SMT siblings, CPU0 will too. - */ - if (cpumask_weight(cpu_smt_mask(0)) > 1) - static_branch_enable(&sched_smt_present); -} -#else -static inline void sched_init_smt(void) { } -#endif - void __init sched_init_smp(void) { sched_init_numa(); @@ -5911,8 +5907,6 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); - sched_init_smt(); - sched_smp_initialized = true; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..cacdbef99b95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6238,6 +6238,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } #ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); static inline void set_idle_cores(int cpu, int val) { -- cgit From c4de65696d865c225fda3b9913b31284ea65ea96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 May 2018 19:05:25 +0200 Subject: cpu/hotplug: Make bringup/teardown of smp threads symmetric The asymmetry caused a warning to trigger if the bootup was stopped in state CPUHP_AP_ONLINE_IDLE. The warning no longer triggers as kthread_park() can now be invoked on already or still parked threads. But there is still no reason to have this be asymmetric. Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/cpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..61396b3e9058 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -754,7 +754,6 @@ static int takedown_cpu(unsigned int cpu) /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); - smpboot_park_threads(cpu); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -1332,7 +1331,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, - .teardown.single = NULL, + .teardown.single = smpboot_park_threads, }, [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { .name = "irq/affinity:online", -- cgit From cc1fe215e1efa406b03aa4389e6269b61342dec5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 May 2018 17:49:05 +0200 Subject: cpu/hotplug: Split do_cpu_down() Split out the inner workings of do_cpu_down() to allow reuse of that function for the upcoming SMT disabling mechanism. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/cpu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 61396b3e9058..e266cb529c1a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -906,20 +906,19 @@ out: return ret; } +static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) +{ + if (cpu_hotplug_disabled) + return -EBUSY; + return _cpu_down(cpu, 0, target); +} + static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - err = _cpu_down(cpu, 0, target); - -out: + err = cpu_down_maps_locked(cpu, target); cpu_maps_update_done(); return err; } -- cgit From 05736e4ac13c08a4a9b1ef2de26dd31a32cbee57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 May 2018 17:48:27 +0200 Subject: cpu/hotplug: Provide knobs to control SMT Provide a command line and a sysfs knob to control SMT. The command line options are: 'nosmt': Enumerate secondary threads, but do not online them 'nosmt=force': Ignore secondary threads completely during enumeration via MP table and ACPI/MADT. The sysfs control file has the following states (read/write): 'on': SMT is enabled. Secondary threads can be freely onlined 'off': SMT is disabled. Secondary threads, even if enumerated cannot be onlined 'forceoff': SMT is permanentely disabled. Writes to the control file are rejected. 'notsupported': SMT is not supported by the CPU The command line option 'nosmt' sets the sysfs control to 'off'. This can be changed to 'on' to reenable SMT during runtime. The command line option 'nosmt=force' sets the sysfs control to 'forceoff'. This cannot be changed during runtime. When SMT is 'on' and the control file is changed to 'off' then all online secondary threads are offlined and attempts to online a secondary thread later on are rejected. When SMT is 'off' and the control file is changed to 'on' then secondary threads can be onlined again. The 'off' -> 'on' transition does not automatically online the secondary threads. When the control file is set to 'forceoff', the behaviour is the same as setting it to 'off', but the operation is irreversible and later writes to the control file are rejected. When the control status is 'notsupported' then writes to the control file are rejected. Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- Documentation/ABI/testing/sysfs-devices-system-cpu | 20 +++ Documentation/admin-guide/kernel-parameters.txt | 8 + arch/Kconfig | 3 + arch/x86/Kconfig | 1 + include/linux/cpu.h | 13 ++ kernel/cpu.c | 170 +++++++++++++++++++++ 6 files changed, 215 insertions(+) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9c5e7732d249..65d9b844ecfd 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -487,3 +487,23 @@ Description: Information about CPU vulnerabilities "Not affected" CPU is not affected by the vulnerability "Vulnerable" CPU is affected and no mitigation in effect "Mitigation: $M" CPU is affected and mitigation $M is in effect + +What: /sys/devices/system/cpu/smt + /sys/devices/system/cpu/smt/active + /sys/devices/system/cpu/smt/control +Date: June 2018 +Contact: Linux kernel mailing list +Description: Control Symetric Multi Threading (SMT) + + active: Tells whether SMT is active (enabled and siblings online) + + control: Read/write interface to control SMT. Possible + values: + + "on" SMT is enabled + "off" SMT is disabled + "forceoff" SMT is force disabled. Cannot be changed. + "notsupported" SMT is not supported by the CPU + + If control status is "forceoff" or "notsupported" writes + are rejected. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index efc7aa7a0670..8e29c4b6756f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2687,6 +2687,14 @@ nosmt [KNL,S390] Disable symmetric multithreading (SMT). Equivalent to smt=1. + [KNL,x86] Disable symmetric multithreading (SMT). + nosmt=force: Force disable SMT, similar to disabling + it in the BIOS except that some of the + resource partitioning effects which are + caused by having SMT enabled in the BIOS + cannot be undone. Depending on the CPU + type this might have a performance impact. + nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2 (indirect branch prediction) vulnerability. System may allow data leaks with this option, which is equivalent diff --git a/arch/Kconfig b/arch/Kconfig index 1aa59063f1fd..d1f2ed462ac8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -13,6 +13,9 @@ config KEXEC_CORE config HAVE_IMA_KEXEC bool +config HOTPLUG_SMT + bool + config OPROFILE tristate "OProfile system profiling" depends on PROFILING diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f1dbb4ee19d7..7a34fdf8daf0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -187,6 +187,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER + select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_SG_DMA_LENGTH select PCI_LOCKLESS_CONFIG diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d3da5184aa1c..7532cbf27b1d 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -168,4 +168,17 @@ void cpuhp_report_idle_dead(void); static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +enum cpuhp_smt_control { + CPU_SMT_ENABLED, + CPU_SMT_DISABLED, + CPU_SMT_FORCE_DISABLED, + CPU_SMT_NOT_SUPPORTED, +}; + +#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) +extern enum cpuhp_smt_control cpu_smt_control; +#else +# define cpu_smt_control (CPU_SMT_ENABLED) +#endif + #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index e266cb529c1a..d29fdd7e57bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -933,6 +933,29 @@ EXPORT_SYMBOL(cpu_down); #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ +#ifdef CONFIG_HOTPLUG_SMT +enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_control = CPU_SMT_DISABLED; + if (str && !strcmp(str, "force")) { + pr_info("SMT: Force disabled\n"); + cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } + return 0; +} +early_param("nosmt", smt_cmdline_disable); + +static inline bool cpu_smt_allowed(unsigned int cpu) +{ + return cpu_smt_control == CPU_SMT_ENABLED || + topology_is_primary_thread(cpu); +} +#else +static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +#endif + /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started @@ -1056,6 +1079,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) err = -EBUSY; goto out; } + if (!cpu_smt_allowed(cpu)) { + err = -EPERM; + goto out; + } err = _cpu_up(cpu, 0, target); out: @@ -1904,10 +1931,153 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { NULL }; +#ifdef CONFIG_HOTPLUG_SMT + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); +} + +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. + */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +static void cpuhp_smt_enable(void) +{ + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + cpu_maps_update_done(); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int ctrlval, ret; + + if (sysfs_streq(buf, "on")) + ctrlval = CPU_SMT_ENABLED; + else if (sysfs_streq(buf, "off")) + ctrlval = CPU_SMT_DISABLED; + else if (sysfs_streq(buf, "forceoff")) + ctrlval = CPU_SMT_FORCE_DISABLED; + else + return -EINVAL; + + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) + return -EPERM; + + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return -ENODEV; + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + if (ctrlval != cpu_smt_control) { + switch (ctrlval) { + case CPU_SMT_ENABLED: + cpuhp_smt_enable(); + break; + case CPU_SMT_DISABLED: + case CPU_SMT_FORCE_DISABLED: + ret = cpuhp_smt_disable(ctrlval); + break; + } + } + + unlock_device_hotplug(); + return ret ? ret : count; +} +static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); + +static ssize_t +show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) +{ + bool active = topology_max_smt_threads() > 1; + + return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); +} +static DEVICE_ATTR(active, 0444, show_smt_active, NULL); + +static struct attribute *cpuhp_smt_attrs[] = { + &dev_attr_control.attr, + &dev_attr_active.attr, + NULL +}; + +static const struct attribute_group cpuhp_smt_attr_group = { + .attrs = cpuhp_smt_attrs, + .name = "smt", + NULL +}; + +static int __init cpu_smt_state_init(void) +{ + if (!topology_smt_supported()) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + + return sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_smt_attr_group); +} + +#else +static inline int cpu_smt_state_init(void) { return 0; } +#endif + static int __init cpuhp_sysfs_init(void) { int cpu, ret; + ret = cpu_smt_state_init(); + if (ret) + return ret; + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_cpu_root_attr_group); if (ret) -- cgit From bfc18e389c7a09fbbbed6bf4032396685b14246e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 21 Jun 2018 13:13:04 +0100 Subject: atomics/treewide: Rename __atomic_add_unless() => atomic_fetch_add_unless() While __atomic_add_unless() was originally intended as a building-block for atomic_add_unless(), it's now used in a number of places around the kernel. It's the only common atomic operation named __atomic*(), rather than atomic_*(), and for consistency it would be better named atomic_fetch_add_unless(). This lack of consistency is slightly confusing, and gets in the way of scripting atomics. Given that, let's clean things up and promote it to an official part of the atomics API, in the form of atomic_fetch_add_unless(). This patch converts definitions and invocations over to the new name, including the instrumented version, using the following script: ---- git grep -w __atomic_add_unless | while read line; do sed -i '{s/\<__atomic_add_unless\>/atomic_fetch_add_unless/}' "${line%%:*}"; done git grep -w __arch_atomic_add_unless | while read line; do sed -i '{s/\<__arch_atomic_add_unless\>/arch_atomic_fetch_add_unless/}' "${line%%:*}"; done ---- Note that we do not have atomic{64,_long}_fetch_add_unless(), which will be introduced by later patches. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Palmer Dabbelt Cc: Boqun Feng Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/20180621121321.4761-2-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/atomic.h | 4 ++-- arch/arc/include/asm/atomic.h | 4 ++-- arch/arm/include/asm/atomic.h | 4 ++-- arch/arm64/include/asm/atomic.h | 2 +- arch/h8300/include/asm/atomic.h | 2 +- arch/hexagon/include/asm/atomic.h | 4 ++-- arch/ia64/include/asm/atomic.h | 2 +- arch/m68k/include/asm/atomic.h | 2 +- arch/mips/include/asm/atomic.h | 4 ++-- arch/openrisc/include/asm/atomic.h | 4 ++-- arch/parisc/include/asm/atomic.h | 4 ++-- arch/powerpc/include/asm/atomic.h | 8 ++++---- arch/riscv/include/asm/atomic.h | 4 ++-- arch/s390/include/asm/atomic.h | 2 +- arch/sh/include/asm/atomic.h | 4 ++-- arch/sparc/include/asm/atomic_32.h | 2 +- arch/sparc/include/asm/atomic_64.h | 2 +- arch/sparc/lib/atomic32.c | 4 ++-- arch/x86/include/asm/atomic.h | 4 ++-- arch/xtensa/include/asm/atomic.h | 4 ++-- drivers/block/rbd.c | 2 +- drivers/infiniband/core/rdma_core.c | 2 +- fs/afs/rxrpc.c | 2 +- include/asm-generic/atomic-instrumented.h | 4 ++-- include/asm-generic/atomic.h | 4 ++-- include/linux/atomic.h | 2 +- kernel/bpf/syscall.c | 4 ++-- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_object.c | 4 ++-- net/rxrpc/local_object.c | 2 +- net/rxrpc/peer_object.c | 2 +- 31 files changed, 50 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 767bfdd42992..392b15a4dd4f 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -206,7 +206,7 @@ ATOMIC_OPS(xor, xor) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -214,7 +214,7 @@ ATOMIC_OPS(xor, xor) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, new, old; smp_mb(); diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 11859287c52a..67121b5ff3a3 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -309,7 +309,7 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3) #undef ATOMIC_OP /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -317,7 +317,7 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v */ -#define __atomic_add_unless(v, a, u) \ +#define atomic_fetch_add_unless(v, a, u) \ ({ \ int c, old; \ \ diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 66d0e215a773..9d56d0727c9b 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -130,7 +130,7 @@ static inline int atomic_cmpxchg_relaxed(atomic_t *ptr, int old, int new) } #define atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int oldval, newval; unsigned long tmp; @@ -215,7 +215,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return ret; } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index c0235e0ff849..264d20339f74 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -125,7 +125,7 @@ #define atomic_dec_and_test(v) (atomic_dec_return(v) == 0) #define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0) #define atomic_add_negative(i, v) (atomic_add_return((i), (v)) < 0) -#define __atomic_add_unless(v, a, u) ___atomic_add_unless(v, a, u,) +#define atomic_fetch_add_unless(v, a, u) ___atomic_add_unless(v, a, u,) #define atomic_andnot atomic_andnot /* diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index b174dec099bf..5c856887fdf2 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -94,7 +94,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return ret; } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int ret; h8300flags flags; diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index fb3dfb2a667e..287aa9f394f3 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -164,7 +164,7 @@ ATOMIC_OPS(xor) #undef ATOMIC_OP /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer to value * @a: amount to add * @u: unless value is equal to u @@ -173,7 +173,7 @@ ATOMIC_OPS(xor) * */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int __oldval; register int tmp; diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 2524fb60fbc2..9d2ddde5f9d5 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -215,7 +215,7 @@ ATOMIC64_FETCH_OP(xor, ^) (cmpxchg(&((v)->counter), old, new)) #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index e993e2860ee1..8022d9ea1213 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -211,7 +211,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) return c != 0; } -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 0ab176bdb8e8..02fc1553cf9b 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -275,7 +275,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v) #define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -283,7 +283,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/openrisc/include/asm/atomic.h b/arch/openrisc/include/asm/atomic.h index 146e1660f00e..b589fac39b92 100644 --- a/arch/openrisc/include/asm/atomic.h +++ b/arch/openrisc/include/asm/atomic.h @@ -100,7 +100,7 @@ ATOMIC_OP(xor) * * This is often used through atomic_inc_not_zero() */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int old, tmp; @@ -119,7 +119,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return old; } -#define __atomic_add_unless __atomic_add_unless +#define atomic_fetch_add_unless atomic_fetch_add_unless #include diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 88bae6676c9b..7748abced766 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -78,7 +78,7 @@ static __inline__ int atomic_read(const atomic_t *v) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -86,7 +86,7 @@ static __inline__ int atomic_read(const atomic_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 682b3e6a1e21..1483261080a1 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -218,7 +218,7 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v) #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -226,13 +226,13 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int t; __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER -"1: lwarx %0,0,%1 # __atomic_add_unless\n\ +"1: lwarx %0,0,%1 # atomic_fetch_add_unless\n\ cmpw 0,%0,%3 \n\ beq 2f \n\ add %0,%2,%0 \n" @@ -538,7 +538,7 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER -"1: ldarx %0,0,%1 # __atomic_add_unless\n\ +"1: ldarx %0,0,%1 # atomic_fetch_add_unless\n\ cmpd 0,%0,%3 \n\ beq 2f \n\ add %0,%2,%0 \n" diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 855115ace98c..739e810c857e 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -332,7 +332,7 @@ ATOMIC_OP(dec_and_test, dec, ==, 0, 64) #undef ATOMIC_OP /* This is required to provide a full barrier on success. */ -static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int prev, rc; @@ -381,7 +381,7 @@ static __always_inline int atomic64_add_unless(atomic64_t *v, long a, long u) */ static __always_inline int atomic_inc_not_zero(atomic_t *v) { - return __atomic_add_unless(v, 1, 0); + return atomic_fetch_add_unless(v, 1, 0); } #ifndef CONFIG_GENERIC_ATOMIC64 diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 4b55532f15c4..c2858cdd8c29 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -90,7 +90,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return __atomic_cmpxchg(&v->counter, old, new); } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index 0fd0099f43cc..ef45931ebac5 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -46,7 +46,7 @@ #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -54,7 +54,7 @@ * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index d13ce517f4b9..a58f4b43bcc7 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -27,7 +27,7 @@ int atomic_fetch_or(int, atomic_t *); int atomic_fetch_xor(int, atomic_t *); int atomic_cmpxchg(atomic_t *, int, int); int atomic_xchg(atomic_t *, int); -int __atomic_add_unless(atomic_t *, int, int); +int atomic_fetch_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); #define atomic_set_release(v, i) atomic_set((v), (i)) diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 28db058d471b..f416fd3d2708 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -89,7 +89,7 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index 465a901a0ada..281fa634bb1a 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -95,7 +95,7 @@ int atomic_cmpxchg(atomic_t *v, int old, int new) } EXPORT_SYMBOL(atomic_cmpxchg); -int __atomic_add_unless(atomic_t *v, int a, int u) +int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int ret; unsigned long flags; @@ -107,7 +107,7 @@ int __atomic_add_unless(atomic_t *v, int a, int u) spin_unlock_irqrestore(ATOMIC_HASH(v), flags); return ret; } -EXPORT_SYMBOL(__atomic_add_unless); +EXPORT_SYMBOL(atomic_fetch_add_unless); /* Atomic operations are already serializing */ void atomic_set(atomic_t *v, int i) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 0db6bec95489..84ed0bd76aef 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -254,7 +254,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) } /** - * __arch_atomic_add_unless - add unless the number is already a given value + * arch_atomic_fetch_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -262,7 +262,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. */ -static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c = arch_atomic_read(v); diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index e7a23f2a519a..4188e56c06c9 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -275,7 +275,7 @@ ATOMIC_OPS(xor) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -283,7 +283,7 @@ ATOMIC_OPS(xor) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fa0729c1e776..d81c653b9bf6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -61,7 +61,7 @@ static int atomic_inc_return_safe(atomic_t *v) { unsigned int counter; - counter = (unsigned int)__atomic_add_unless(v, 1, 0); + counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); if (counter <= (unsigned int)INT_MAX) return (int)counter; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index a6e904973ba8..475910ffbcb6 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -121,7 +121,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive) * this lock. */ if (!exclusive) - return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ? + return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ? -EBUSY : 0; /* lock is either WRITE or DESTROY - should be exclusive */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a1b18082991b..183cc5418722 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -648,7 +648,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, trace_afs_notify_call(rxcall, call); call->need_attention = true; - u = __atomic_add_unless(&call->usage, 1, 0); + u = atomic_fetch_add_unless(&call->usage, 1, 0); if (u != 0) { trace_afs_call(call, afs_call_trace_wake, u, atomic_read(&call->net->nr_outstanding_calls), diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h index ec07f23678ea..b8b14cc2df6c 100644 --- a/include/asm-generic/atomic-instrumented.h +++ b/include/asm-generic/atomic-instrumented.h @@ -84,10 +84,10 @@ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 ne } #endif -static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { kasan_check_write(v, sizeof(*v)); - return __arch_atomic_add_unless(v, a, u); + return arch_atomic_fetch_add_unless(v, a, u); } diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index abe6dd9ca2a8..10051ed6d088 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -221,8 +221,8 @@ static inline void atomic_dec(atomic_t *v) #define atomic_xchg(ptr, v) (xchg(&(ptr)->counter, (v))) #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#ifndef __atomic_add_unless -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +#ifndef atomic_fetch_add_unless +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 01ce3997cb42..9cc982936675 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -530,7 +530,7 @@ */ static inline int atomic_add_unless(atomic_t *v, int a, int u) { - return __atomic_add_unless(v, a, u) != u; + return atomic_fetch_add_unless(v, a, u) != u; } /** diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..f12db70d3bf3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -575,7 +575,7 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, { int refold; - refold = __atomic_add_unless(&map->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_map_put(map, false); @@ -1142,7 +1142,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_prog_put(prog, false); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f6734d8cb01a..9486293fef5c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -415,7 +415,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, bool rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); - int n = __atomic_add_unless(&call->usage, 1, 0); + int n = atomic_fetch_add_unless(&call->usage, 1, 0); if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 4c77a78a252a..77440a356b14 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -266,7 +266,7 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) bool rxrpc_queue_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); - int n = __atomic_add_unless(&conn->usage, 1, 0); + int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n == 0) return false; if (rxrpc_queue_work(&conn->processor)) @@ -309,7 +309,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) const void *here = __builtin_return_address(0); if (conn) { - int n = __atomic_add_unless(&conn->usage, 1, 0); + int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n > 0) trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); else diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index b493e6b62740..777c3ed4cfc0 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -305,7 +305,7 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) const void *here = __builtin_return_address(0); if (local) { - int n = __atomic_add_unless(&local->usage, 1, 0); + int n = atomic_fetch_add_unless(&local->usage, 1, 0); if (n > 0) trace_rxrpc_local(local, rxrpc_local_got, n + 1, here); else diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 1b7e8107b3ae..1cf3b408017a 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -406,7 +406,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) const void *here = __builtin_return_address(0); if (peer) { - int n = __atomic_add_unless(&peer->usage, 1, 0); + int n = atomic_fetch_add_unless(&peer->usage, 1, 0); if (n > 0) trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); else -- cgit From 8f894bf47dc9e8b77166125a084a7217693a28cd Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Thu, 31 May 2018 19:11:19 +0800 Subject: sched/debug: Use match_string() helper instead of open-coded logic match_string() returns the index of an array for a matching string, which can be used instead of the open coded variant. Signed-off-by: Yisheng Xie Reviewed-by: Andy Shevchenko Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/1527765086-19873-15-git-send-email-xieyisheng1@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e593b4118578..c96e89cc4bc7 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp) cmp += 3; } - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } + i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp); + if (i < 0) + return i; + + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); } - return i; + return 0; } static ssize_t @@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, { char buf[64]; char *cmp; - int i; + int ret; struct inode *inode; if (cnt > 63) @@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); inode_lock(inode); - i = sched_feat_set(cmp); + ret = sched_feat_set(cmp); inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; + if (ret < 0) + return ret; *ppos += cnt; -- cgit From 9e3ed2d7597ccbce2a30e036342fb0613d0759e5 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 21 May 2018 23:55:20 +0530 Subject: perf/core: Change perf_mmap_fault() return type to 'vm_fault_t' Use new return type 'vm_fault_t' for fault handlers. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See the following commit: 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: alexander.shishkin@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung@kernel.org Link: https://lkml.kernel.org/lkml/20180521182520.GA19677@jordon-HP-15-Notebook-PC Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 80cca2b30c4f..a4d5ac6d74af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5273,11 +5273,11 @@ unlock: } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static int perf_mmap_fault(struct vm_fault *vmf) +static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; - int ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) -- cgit From f2a3ab36077222437b4826fc76111caa14562b7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:35:01 +0900 Subject: kprobes: Make list and blacklist root user read only Since the blacklist and list files on debugfs indicates a sensitive address information to reader, it should be restricted to the root user. Suggested-by: Thomas Richter Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491890171.9916.5183693615601334087.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 536ab451e96d..898ee56d4f48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2509,7 +2509,7 @@ static int __init debugfs_kprobe_init(void) if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir, NULL, + file = debugfs_create_file("list", 0400, dir, NULL, &debugfs_kprobes_operations); if (!file) goto error; @@ -2519,7 +2519,7 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("blacklist", 0444, dir, NULL, + file = debugfs_create_file("blacklist", 0400, dir, NULL, &debugfs_kprobe_blacklist_ops); if (!file) goto error; -- cgit From ffb9bd68ebdb3b8d00ef5a79bbe8167a3281cace Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:35:32 +0900 Subject: kprobes: Show blacklist addresses as same as kallsyms does Show kprobes blacklist addresses under same condition of showing kallsyms addresses. Since there are several name conflict for local symbols, kprobe blacklist needs to show each addresses so that user can identify where is on blacklist by comparing with kallsyms. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491893217.9916.14760965896164273464.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 898ee56d4f48..ab1bfa3d1d9c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2326,8 +2326,16 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, - (void *)ent->end_addr, (void *)ent->start_addr); + /* + * If /proc/kallsyms is not showing kernel address, we won't + * show them here either. + */ + if (!kallsyms_show_value()) + seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, + (void *)ent->start_addr); + else + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); return 0; } -- cgit From 81365a947de453bcaba2558f8de5dadcffe05bc1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:36:02 +0900 Subject: kprobes: Show address of kprobes if kallsyms does Show probed address in debugfs kprobe list file as same as kallsyms does. This information is used for checking kprobes are placed in the expected address. So it should be able to compared with address in kallsyms. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491896256.9916.1583733714492565296.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ab1bfa3d1d9c..1d6130d2937a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2226,19 +2226,23 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; + void *addr = p->addr; if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; else kprobe_type = "k"; + if (!kallsyms_show_value()) + addr = NULL; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s ", - p->addr, kprobe_type, sym, offset, + seq_printf(pi, "%px %s %s+0x%x %s ", + addr, kprobe_type, sym, offset, (modname ? modname : " ")); - else - seq_printf(pi, "%p %s %p ", - p->addr, kprobe_type, p->addr); + else /* try to use %pS */ + seq_printf(pi, "%px %s %pS ", + addr, kprobe_type, p->addr); if (!pp) pp = p; -- cgit From 4458515b2c52831ee622411d2fe3e774d1f5c49a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:36:33 +0900 Subject: kprobes: Replace %p with other pointer types Replace %p with %pS or just remove it if unneeded. And use WARN_ONCE() if it is a single bug. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491899284.9916.5350534544808158621.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1d6130d2937a..ab257be4d924 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -710,9 +710,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) * there is still a relative jump) and disabled. */ op = container_of(ap, struct optimized_kprobe, kp); - if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); + WARN_ON_ONCE(list_empty(&op->list)); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ @@ -985,7 +983,8 @@ static int arm_kprobe_ftrace(struct kprobe *p) ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); if (ret) { - pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", + p->addr, ret); return ret; } @@ -1025,7 +1024,8 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); - WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", + p->addr, ret); return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ @@ -2069,11 +2069,12 @@ out: } EXPORT_SYMBOL_GPL(enable_kprobe); +/* Caller must NOT call this in usual path. This is only for critical case */ void dump_kprobe(struct kprobe *kp) { - printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", - kp->symbol_name, kp->addr, kp->offset); + pr_err("Dumping kprobe:\n"); + pr_err("Name: %s\nOffset: %x\nAddress: %pS\n", + kp->symbol_name, kp->offset, kp->addr); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2096,11 +2097,8 @@ static int __init populate_kprobe_blacklist(unsigned long *start, entry = arch_deref_entry_point((void *)*iter); if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find blacklist at %p\n", - (void *)entry); + !kallsyms_lookup_size_offset(entry, &size, &offset)) continue; - } ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) -- cgit From 03585a95cd830e7a92697d2a8fe9a34df87563db Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 11 Apr 2018 13:16:56 +0800 Subject: sched/fair: Remove stale tg_unthrottle_up() comments After commit: 82958366cfea ("sched: Replace update_shares weight distribution with per-entity computation") tg_unthrottle_up() did not update the weight. Signed-off-by: Li RongQing Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/lkml/1523423816-18322-1-git-send-email-lirongqing@baidu.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..4cc1441b0746 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4732,7 +4732,6 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } -/* updated child weight may affect parent so we have to do this bottom up */ static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; -- cgit From 81a0abd9f213704fbeeea1550ff202000e3c3cdf Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 22 Jun 2018 13:59:29 +0200 Subject: module: make it clear when we're handling the module copy in info->hdr In load_module(), it's not always clear whether we're handling the temporary module copy in info->hdr (which is freed at the end of load_module()) or if we're handling the module already allocated and copied to it's final place. Adding an info->mod field and using it whenever we're handling the temporary copy makes that explicitly clear. Signed-off-by: Jessica Yu --- kernel/module.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a6b9c6d5f2c..3ed4aaa646dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -309,6 +309,8 @@ EXPORT_SYMBOL(unregister_module_notifier); struct load_info { const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -2947,14 +2949,13 @@ static int rewrite_section_headers(struct load_info *info, int flags) * search for module section index etc), and do some basic section * verification. * - * Return the temporary module pointer (we'll replace it with the final - * one when we move the module sections around). + * Set info->mod to the temporary copy of the module in info->hdr. The final one + * will be allocated in move_module(). */ -static struct module *setup_load_info(struct load_info *info, int flags) +static int setup_load_info(struct load_info *info, int flags) { unsigned int i; int err; - struct module *mod; /* Set up the convenience variables */ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; @@ -2963,7 +2964,7 @@ static struct module *setup_load_info(struct load_info *info, int flags) err = rewrite_section_headers(info, flags); if (err) - return ERR_PTR(err); + return err; /* Find internal symbols and strings. */ for (i = 1; i < info->hdr->e_shnum; i++) { @@ -2980,30 +2981,30 @@ static struct module *setup_load_info(struct load_info *info, int flags) if (!info->index.mod) { pr_warn("%s: No module found in object\n", info->name ?: "(missing .modinfo name field)"); - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; } /* This is temporary: point mod into copy of data. */ - mod = (void *)info->sechdrs[info->index.mod].sh_addr; + info->mod = (void *)info->sechdrs[info->index.mod].sh_addr; /* * If we didn't load the .modinfo 'name' field, fall back to * on-disk struct mod 'name' field. */ if (!info->name) - info->name = mod->name; + info->name = info->mod->name; if (info->index.sym == 0) { pr_warn("%s: module has no symbols (stripped?)\n", info->name); - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; } info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info, mod)) - return ERR_PTR(-ENOEXEC); + if (!check_modstruct_version(info, info->mod)) + return -ENOEXEC; - return mod; + return 0; } static int check_modinfo(struct module *mod, struct load_info *info, int flags) @@ -3298,25 +3299,24 @@ core_param(module_blacklist, module_blacklist, charp, 0400); static struct module *layout_and_allocate(struct load_info *info, int flags) { - /* Module within temporary copy. */ struct module *mod; unsigned int ndx; int err; - mod = setup_load_info(info, flags); - if (IS_ERR(mod)) - return mod; + err = setup_load_info(info, flags); + if (err) + return ERR_PTR(err); if (blacklisted(info->name)) return ERR_PTR(-EPERM); - err = check_modinfo(mod, info, flags); + err = check_modinfo(info->mod, info, flags); if (err) return ERR_PTR(err); /* Allow arches to frob section contents and sizes. */ err = module_frob_arch_sections(info->hdr, info->sechdrs, - info->secstrings, mod); + info->secstrings, info->mod); if (err < 0) return ERR_PTR(err); @@ -3335,11 +3335,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ - layout_sections(mod, info); - layout_symtab(mod, info); + layout_sections(info->mod, info); + layout_symtab(info->mod, info); /* Allocate and move to the final place */ - err = move_module(mod, info); + err = move_module(info->mod, info); if (err) return ERR_PTR(err); -- cgit From 5fdc7db6448a4f558f298b1c98d6d124fdf2ad95 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 22 Jun 2018 14:00:01 +0200 Subject: module: setup load info before module_sig_check() We want to be able to log the module name in early error messages, such as when module signature verification fails. Previously, the module name is set in layout_and_allocate(), meaning that any error messages that happen before (such as those in module_sig_check()) won't be logged with a module name, which isn't terribly helpful. In order to do this, reshuffle the order in load_module() and set up load info earlier so that we can log the module name along with these error messages. This requires splitting rewrite_section_headers() out of setup_load_info(). While we're at it, clean up and split up the operations done in layout_and_allocate(), setup_load_info(), and rewrite_section_headers() more cleanly so these functions only perform what their names suggest. Signed-off-by: Jessica Yu --- kernel/module.c | 77 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3ed4aaa646dc..0ad0bb58e116 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2488,7 +2488,11 @@ static char *get_modinfo(struct load_info *info, const char *tag) Elf_Shdr *infosec = &info->sechdrs[info->index.info]; unsigned long size = infosec->sh_size; - for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { + /* + * get_modinfo() calls made before rewrite_section_headers() + * must use sh_offset, as sh_addr isn't set! + */ + for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } @@ -2928,17 +2932,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) } /* Track but don't keep modinfo and version sections. */ - if (flags & MODULE_INIT_IGNORE_MODVERSIONS) - info->index.vers = 0; /* Pretend no __versions section! */ - else - info->index.vers = find_sec(info, "__versions"); info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; - - info->index.info = find_sec(info, ".modinfo"); - if (!info->index.info) - info->name = "(missing .modinfo section)"; - else - info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; return 0; @@ -2955,16 +2949,18 @@ static int rewrite_section_headers(struct load_info *info, int flags) static int setup_load_info(struct load_info *info, int flags) { unsigned int i; - int err; /* Set up the convenience variables */ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info, flags); - if (err) - return err; + /* Try to find a name early so we can log errors with a module name */ + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); /* Find internal symbols and strings. */ for (i = 1; i < info->hdr->e_shnum; i++) { @@ -2977,6 +2973,11 @@ static int setup_load_info(struct load_info *info, int flags) } } + if (info->index.sym == 0) { + pr_warn("%s: module has no symbols (stripped?)\n", info->name); + return -ENOEXEC; + } + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { pr_warn("%s: No module found in object\n", @@ -2984,26 +2985,22 @@ static int setup_load_info(struct load_info *info, int flags) return -ENOEXEC; } /* This is temporary: point mod into copy of data. */ - info->mod = (void *)info->sechdrs[info->index.mod].sh_addr; + info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset; /* - * If we didn't load the .modinfo 'name' field, fall back to + * If we didn't load the .modinfo 'name' field earlier, fall back to * on-disk struct mod 'name' field. */ if (!info->name) info->name = info->mod->name; - if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", info->name); - return -ENOEXEC; - } + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.pcpu = find_pcpusec(info); - /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info, info->mod)) - return -ENOEXEC; - return 0; } @@ -3303,13 +3300,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) unsigned int ndx; int err; - err = setup_load_info(info, flags); - if (err) - return ERR_PTR(err); - - if (blacklisted(info->name)) - return ERR_PTR(-EPERM); - err = check_modinfo(info->mod, info, flags); if (err) return ERR_PTR(err); @@ -3657,17 +3647,36 @@ static int load_module(struct load_info *info, const char __user *uargs, int flags) { struct module *mod; - long err; + long err = 0; char *after_dashes; + err = elf_header_check(info); + if (err) + goto free_copy; + + err = setup_load_info(info, flags); + if (err) + goto free_copy; + + if (blacklisted(info->name)) { + err = -EPERM; + goto free_copy; + } + err = module_sig_check(info, flags); if (err) goto free_copy; - err = elf_header_check(info); + err = rewrite_section_headers(info, flags); if (err) goto free_copy; + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(info, info->mod)) { + err = -ENOEXEC; + goto free_copy; + } + /* Figure out module layout, and allocate all the memory. */ mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { -- cgit From 74bdf7815dfb3805a37b0bba615814063a227bf5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Jun 2018 08:03:32 -0700 Subject: genirq: Speedup show_interrupts() Since commit 425a5072dcd1 ("genirq: Free irq_desc with rcu"), show_interrupts() can be switched to rcu locking, which removes possible contention on sparse_irq_lock. The per_cpu count scan and print can be done without holding desc spinlock. And there is no need to call kstat_irqs_cpu() and abuse irq_to_desc() while holding rcu read lock, since desc and desc->kstat_irqs wont disappear or change. Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Eric Dumazet Link: https://lkml.kernel.org/r/20180620150332.163320-1-edumazet@google.com --- kernel/irq/proc.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 37eda10f5c36..da9addb8d655 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -475,22 +475,24 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(i); if (!desc) goto outsparse; - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if ((!action || irq_desc_is_chained(desc)) && !any_count) - goto out; + if (desc->kstat_irqs) + for_each_online_cpu(j) + any_count |= *per_cpu_ptr(desc->kstat_irqs, j); + + if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) + goto outsparse; seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, "%10u ", desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, j) : 0); + raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); @@ -511,6 +513,7 @@ int show_interrupts(struct seq_file *p, void *v) if (desc->name) seq_printf(p, "-%-8s", desc->name); + action = desc->action; if (action) { seq_printf(p, " %s", action->name); while ((action = action->next) != NULL) @@ -518,10 +521,9 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); -out: raw_spin_unlock_irqrestore(&desc->lock, flags); outsparse: - irq_unlock_sparse(); + rcu_read_unlock(); return 0; } #endif -- cgit From 836557bd58e5e65c05c73af9f6ebed885dbfccfc Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 24 Jun 2018 10:35:18 +0200 Subject: genirq: Update code comments wrt recycled thread_mask Previously a race existed between __free_irq() and __setup_irq() wherein the thread_mask of a just removed action could be handed out to a newly added action and the freed irq thread would then tread on the oneshot mask bit of the newly added irq thread in irq_finalize_oneshot(): time | __free_irq() | raw_spin_lock_irqsave(&desc->lock, flags); | | raw_spin_unlock_irqrestore(&desc->lock, flags); | | __setup_irq() | raw_spin_lock_irqsave(&desc->lock, flags); | | raw_spin_unlock_irqrestore(&desc->lock, flags); | | irq_thread() of freed irq (__free_irq() waits in synchronize_irq()) | irq_thread_fn() | irq_finalize_oneshot() | raw_spin_lock_irq(&desc->lock); | desc->threads_oneshot &= ~action->thread_mask; | raw_spin_unlock_irq(&desc->lock); v The race was known at least since 2012 when it was documented in a code comment by commit e04268b0effc ("genirq: Remove paranoid warnons and bogus fixups"). The race itself is harmless as nothing touches any of the potentially freed data after synchronize_irq(). In 2017 the race was close by commit 9114014cf4e6 ("genirq: Add mutex to irq desc to serialize request/free_irq()"), apparently inadvertantly so because the race is neither mentioned in the commit message nor was the code comment updated. Make up for that. Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Cc: Bjorn Helgaas Cc: Mika Westerberg Cc: linux-pci@vger.kernel.org Link: https://lkml.kernel.org/r/32fc25aa35ecef4b2692f57687bb7fc2a57230e2.1529828292.git.lukas@wunner.de --- kernel/irq/manage.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 591cfe901162..123a227d3357 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1025,10 +1025,7 @@ static int irq_thread(void *data) * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the - * oneshot mask bit can be set. We cannot verify that as we - * cannot touch the oneshot mask at this point anymore as - * __setup_irq() might have given out currents thread_mask - * again. + * oneshot mask bit can be set. */ task_work_cancel(current, irq_thread_dtor); return 0; @@ -1245,7 +1242,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_irq() to complete without holding the optional - * chip bus lock and desc->lock. + * chip bus lock and desc->lock. Also protects against handing out + * a recycled oneshot thread_mask bit while it's still in use by + * its previous owner. */ mutex_lock(&desc->request_mutex); -- cgit From 519cc8652b3a1d3d0a02257afbe9573ad644da26 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 24 Jun 2018 10:35:30 +0200 Subject: genirq: Synchronize only with single thread on free_irq() When pciehp is converted to threaded IRQ handling, removal of unplugged devices below a PCIe hotplug port happens synchronously in the IRQ thread. Removal of devices typically entails a call to free_irq() by their drivers. If those devices share their IRQ with the hotplug port, __free_irq() deadlocks because it calls synchronize_irq() to wait for all hard IRQ handlers as well as all threads sharing the IRQ to finish. Actually it's sufficient to wait only for the IRQ thread of the removed device, so call synchronize_hardirq() to wait for all hard IRQ handlers to finish, but no longer for any threads. Compensate by rearranging the control flow in irq_wait_for_interrupt() such that the device's thread is allowed to run one last time after kthread_stop() has been called. kthread_stop() blocks until the IRQ thread has completed. On completion the IRQ thread clears its oneshot thread_mask bit. This is safe because __free_irq() holds the request_mutex, thereby preventing __setup_irq() from handing out the same oneshot thread_mask bit to a newly requested action. Stack trace for posterity: INFO: task irq/17-pciehp:94 blocked for more than 120 seconds. schedule+0x28/0x80 synchronize_irq+0x6e/0xa0 __free_irq+0x15a/0x2b0 free_irq+0x33/0x70 pciehp_release_ctrl+0x98/0xb0 pcie_port_remove_service+0x2f/0x40 device_release_driver_internal+0x157/0x220 bus_remove_device+0xe2/0x150 device_del+0x124/0x340 device_unregister+0x16/0x60 remove_iter+0x1a/0x20 device_for_each_child+0x4b/0x90 pcie_port_device_remove+0x1e/0x30 pci_device_remove+0x36/0xb0 device_release_driver_internal+0x157/0x220 pci_stop_bus_device+0x7d/0xa0 pci_stop_bus_device+0x3d/0xa0 pci_stop_and_remove_bus_device+0xe/0x20 pciehp_unconfigure_device+0xb8/0x160 pciehp_disable_slot+0x84/0x130 pciehp_ist+0x158/0x190 irq_thread_fn+0x1b/0x50 irq_thread+0x143/0x1a0 kthread+0x111/0x130 Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Cc: Bjorn Helgaas Cc: Mika Westerberg Cc: linux-pci@vger.kernel.org Link: https://lkml.kernel.org/r/d72b41309f077c8d3bee6cc08ad3662d50b5d22a.1529828292.git.lukas@wunner.de --- kernel/irq/manage.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 123a227d3357..1f8be33572a7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -790,9 +790,19 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { - set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + if (kthread_should_stop()) { + /* may need to run one last time */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { + __set_current_state(TASK_RUNNING); + return 0; + } + __set_current_state(TASK_RUNNING); + return -1; + } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -800,10 +810,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); - set_current_state(TASK_INTERRUPTIBLE); } - __set_current_state(TASK_RUNNING); - return -1; } /* @@ -1024,7 +1031,7 @@ static int irq_thread(void *data) /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling - * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the + * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ task_work_cancel(current, irq_thread_dtor); @@ -1241,7 +1248,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * Protects against a concurrent __free_irq() call which might wait - * for synchronize_irq() to complete without holding the optional + * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. @@ -1612,11 +1619,11 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang - * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock - * because synchronize_irq() would wait forever for the thread to + * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a @@ -1628,7 +1635,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) unregister_handler_proc(irq, action); /* Make sure it's not being used on another CPU: */ - synchronize_irq(irq); + synchronize_hardirq(irq); #ifdef CONFIG_DEBUG_SHIRQ /* @@ -1646,6 +1653,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } #endif + /* + * The action has already been removed above, but the thread writes + * its oneshot mask bit when it completes. Though request_mutex is + * held across this which prevents __setup_irq() from handing out + * the same bit to a newly requested action. + */ if (action->thread) { kthread_stop(action->thread); put_task_struct(action->thread); -- cgit From faef87723ace12e5f685437c1e68cbecb69a7a89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Jun 2018 13:08:51 +0200 Subject: dma-noncoherent: add a arch_sync_dma_for_cpu_all hook The MIPS bmips platform needs a global flush when transferring ownership back to the CPU. Add a hook for that to the dma-noncoherent implementation. Signed-off-by: Christoph Hellwig Patchwork: https://patchwork.linux-mips.org/patch/19549/ Signed-off-by: Paul Burton Cc: Florian Fainelli Cc: David Daney Cc: Kevin Cernekee Cc: Jiaxun Yang Cc: Tom Bogendoerfer Cc: Huacai Chen Cc: iommu@lists.linux-foundation.org Cc: linux-mips@linux-mips.org --- include/linux/dma-noncoherent.h | 8 ++++++++ kernel/dma/noncoherent.c | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 10b2654d549b..a0aa00cc909d 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -44,4 +44,12 @@ static inline void arch_sync_dma_for_cpu(struct device *dev, } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL +void arch_sync_dma_for_cpu_all(struct device *dev); +#else +static inline void arch_sync_dma_for_cpu_all(struct device *dev) +{ +} +#endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */ + #endif /* _LINUX_DMA_NONCOHERENT_H */ diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c index 79e9a757387f..031fe235d958 100644 --- a/kernel/dma/noncoherent.c +++ b/kernel/dma/noncoherent.c @@ -49,11 +49,13 @@ static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, return nents; } -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) static void dma_noncoherent_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); + arch_sync_dma_for_cpu_all(dev); } static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, @@ -64,6 +66,7 @@ static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, for_each_sg(sgl, sg, nents, i) arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu_all(dev); } static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, @@ -89,7 +92,8 @@ const struct dma_map_ops dma_noncoherent_ops = { .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, .map_page = dma_noncoherent_map_page, .map_sg = dma_noncoherent_map_sg, -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, .unmap_page = dma_noncoherent_unmap_page, -- cgit From 62267e0ecc9c00a1b8ff7859cfa03e34b419f7ee Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 22 Jun 2018 17:38:50 +0200 Subject: module: print sensible error code Printing "err 0" to the user in the warning message is not particularly useful, especially when this gets transformed into a -ENOENT for the remainder of the call chain. Signed-off-by: Jason A. Donenfeld Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0ad0bb58e116..b6b3a3c58af1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2284,9 +2284,9 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - pr_warn("%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; + pr_warn("%s: Unknown symbol %s (err %d)\n", + mod->name, name, ret); break; default: -- cgit From 996302c5e85650722f1e5aeaeaaac12f9f362bf8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 24 Jun 2018 00:37:44 +0900 Subject: module: replace VMLINUX_SYMBOL_STR() with __stringify() or string literal With the special case handling for Blackfin and Metag was removed by commit 94e58e0ac312 ("export.h: remove code for prefixing symbols with underscore"), VMLINUX_SYMBOL_STR() is now equivalent to __stringify(). Replace the remaining usages to prepare for the entire removal of VMLINUX_SYMBOL_STR(). Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/module.h | 4 ++-- kernel/module.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index d44df9b2c131..f807f15bebbe 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -266,7 +266,7 @@ extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ((typeof(&x))(__symbol_get(VMLINUX_SYMBOL_STR(x)))) +#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) /* modules using other modules: kdb wants to see this. */ struct module_use { @@ -575,7 +575,7 @@ extern void __noreturn __module_put_and_exit(struct module *mod, #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); void __symbol_put(const char *symbol); -#define symbol_put(x) __symbol_put(VMLINUX_SYMBOL_STR(x)) +#define symbol_put(x) __symbol_put(__stringify(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not diff --git a/kernel/module.c b/kernel/module.c index b6b3a3c58af1..ba45a84e4287 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1341,14 +1341,12 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, - &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, true, false)) { preempt_enable(); BUG(); } preempt_enable(); - return check_version(info, VMLINUX_SYMBOL_STR(module_layout), - mod, crc); + return check_version(info, "module_layout", mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ -- cgit From 5257514d8885b2ebc11bf359ea1527282b47a5fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Apr 2018 11:03:39 -0700 Subject: rcu: Make expedited grace period use direct call on last leaf During expedited grace-period initialization, a work item is scheduled for each leaf rcu_node structure. However, that initialization code is itself (normally) executing from a workqueue, so one of the leaf rcu_node structures could just as well be handled by that pre-existing workqueue, and with less overhead. This commit therefore uses a shiny new rcu_is_leaf_node() macro to execute the last leaf rcu_node structure's initialization directly from the pre-existing workqueue. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 3 +++ kernel/rcu/tree_exp.h | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 40cea6735c2d..db0870acfdff 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -276,6 +276,9 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) /* Is this rcu_node a leaf? */ #define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) +/* Is this rcu_node the last leaf? */ +#define rcu_is_last_leaf_node(rsp, rnp) ((rnp) == &(rsp)->node[rcu_num_nodes - 1]) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d40708e8c5d6..c6385ee1af65 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -486,8 +486,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rnp->rew.rew_func = func; rnp->rew.rew_rsp = rsp; if (!READ_ONCE(rcu_par_gp_wq) || - rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - /* No workqueues yet. */ + rcu_scheduler_active != RCU_SCHEDULER_RUNNING || + rcu_is_last_leaf_node(rsp, rnp)) { + /* No workqueues yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } -- cgit From 5ef98a6328a1506f544a64b28d6a8a7b99af475b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Apr 2018 21:30:13 -0700 Subject: srcu: Fix typos in __call_srcu() header comment This commit simply changes some copy-pasta call_rcu() instances to the correct call_srcu(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b4123d7a2cec..615e414b0c1e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -823,17 +823,17 @@ static void srcu_leak_callback(struct rcu_head *rhp) * more than one CPU, this means that when "func()" is invoked, each CPU * is guaranteed to have executed a full memory barrier since the end of * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing + * preceded the call to call_srcu(). It also means that each CPU executing * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() + * "func()" must have executed a memory barrier after the call_srcu() * but before the beginning of that SRCU read-side critical section. * Note that these guarantees include CPUs that are offline, idle, or * executing in user mode, as well as CPUs that are executing in the kernel. * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the * resulting SRCU callback function "func()", then both CPU A and CPU * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". + * interval between the call to call_srcu() and the invocation of "func()". * This guarantee applies even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). * -- cgit From 17294ce6a41d3fee6a9bfc52387c107a4607c1c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Apr 2018 12:03:36 -0700 Subject: srcu: Document that srcu_funnel_gp_start() implies srcu_funnel_exp_start() This commit updates the header comment of srcu_funnel_gp_start() to document the fact that srcu_funnel_gp_start() does the work of srcu_funnel_exp_start(), in some cases by invoking it directly. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 615e414b0c1e..37bef2fe80e6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -641,6 +641,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * period s. Losers must either ensure that their desired grace-period * number is recorded on at least their leaf srcu_node structure, or they * must take steps to invoke their own callbacks. + * + * Note that this function also does the work of srcu_funnel_exp_start(), + * in some cases by directly invoking it. */ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, unsigned long s, bool do_norm) -- cgit From 5ab07a8df4d6c958ca63640d3f2ef896f0679c05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 12:28:04 -0700 Subject: srcu: Add address of first callback to rcutorture output This commit adds the address of the first callback to the per-CPU rcutorture output in order to allow lost wakeups to be more efficiently tracked down. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 37bef2fe80e6..5a1a9a07b407 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1271,11 +1271,11 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_data *counts; + struct srcu_data *sdp; - counts = per_cpu_ptr(sp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; + sdp = per_cpu_ptr(sp->sda, cpu); + u0 = sdp->srcu_unlock_count[!idx]; + u1 = sdp->srcu_unlock_count[idx]; /* * Make sure that a lock is always counted if the corresponding @@ -1283,12 +1283,13 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) */ smp_rmb(); - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; + l0 = sdp->srcu_lock_count[!idx]; + l1 = sdp->srcu_lock_count[idx]; c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld %1p)", + cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); s0 += c0; s1 += c1; } -- cgit From 90127d605f403d814f4986436871210bf8ceb335 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2018 10:29:18 -0700 Subject: torture: Make online/offline messages appear only for verbose=2 Some bugs reproduce quickly only at high CPU-hotplug rates, so the rcutorture TREE03 scenario now has only 200 milliseconds spacing between CPU-hotplug operations. At this rate, the torture-test pair of console messages per operation becomes a bit voluminous. This commit therefore converts the torture-test set of "verbose" kernel-boot arguments from bool to int, and prints the extra console messages only when verbose=2. The default is still verbose=1. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 2 +- kernel/rcu/rcuperf.c | 2 +- kernel/rcu/rcutorture.c | 2 +- kernel/torture.c | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 66272862070b..a55e80817dae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -79,7 +79,7 @@ void stutter_wait(const char *title); int torture_stutter_init(int s); /* Initialization and cleanup. */ -bool torture_init_begin(char *ttype, bool v); +bool torture_init_begin(char *ttype, int v); void torture_init_end(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8402b3349dca..4a2e13870a9b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -57,7 +57,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "spin_lock"; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e232846516b3..fb8094848906 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,7 +88,7 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); -torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 42fcb7f05fac..a5540bd831c4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -101,7 +101,7 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; diff --git a/kernel/torture.c b/kernel/torture.c index 3de1efbecd6a..840fd33c1cda 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -53,7 +53,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); static char *torture_type; -static bool verbose; +static int verbose; /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ @@ -98,7 +98,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlining %d\n", torture_type, cpu); @@ -111,7 +111,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, "torture_onoff task: offline %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); @@ -147,7 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlining %d\n", torture_type, cpu); @@ -160,7 +160,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, "torture_onoff task: online %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlined %d\n", torture_type, cpu); @@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v) +bool torture_init_begin(char *ttype, int v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { -- cgit From 60500037637397dcc8ea3d3c2f16e05ea6695a86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 12:25:05 -0700 Subject: torture: Keep old-school dmesg format This commit adds "#define pr_fmt(fmt) fmt" to the torture-test files in order to keep the current dmesg format. Once Joe's commits have hit mainline, these definitions will be changed in order to automatically generate the dmesg line prefix that the scripts expect. This will have the beneficial side-effect of allowing printk() formats to be used more widely and of shortening some pr_*() lines. Signed-off-by: Paul E. McKenney Cc: Joe Perches --- kernel/locking/locktorture.c | 3 +++ kernel/rcu/rcuperf.c | 3 +++ kernel/rcu/rcutorture.c | 3 +++ kernel/torture.c | 3 +++ 4 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 4a2e13870a9b..57bef4fbfb31 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -21,6 +21,9 @@ * Davidlohr Bueso * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index fb8094848906..df29119b2013 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -19,6 +19,9 @@ * * Authors: Paul E. McKenney */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a5540bd831c4..5604bfac8df4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -22,6 +22,9 @@ * * See also: Documentation/RCU/torture.txt */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/torture.c b/kernel/torture.c index 840fd33c1cda..1ac24a826589 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -20,6 +20,9 @@ * Author: Paul E. McKenney * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include #include #include -- cgit From 9a4903dde2c8633c5fcf887b98c4e047a6154a54 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:48 +0200 Subject: perf/hw_breakpoint: Split attribute parse and commit arch_validate_hwbkpt_settings() mixes up attribute check and commit into a single code entity. Therefore the validation may return an error due to incorrect atributes while still leaving halfway modified architecture breakpoint data. This is harmless when we deal with a new breakpoint but it becomes a problem when we modify an existing breakpoint. Split attribute parse and commit to fix that. The architecture is passed a "struct arch_hw_breakpoint" to fill on top of the new attr and the core takes care about copying the backend data once it's fully validated. The architectures then need to implement the new API. Original-patch-by: Andy Lutomirski Reported-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 57 +++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6e28d2866be5..314e2a9040c7 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -400,16 +400,35 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } -static int validate_hw_breakpoint(struct perf_event *bp) +#ifndef hw_breakpoint_arch_parse +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - int ret; + int err; - ret = arch_validate_hwbkpt_settings(bp); - if (ret) - return ret; + err = arch_validate_hwbkpt_settings(bp); + if (err) + return err; + + *hw = bp->hw.info; + + return 0; +} +#endif + +static int hw_breakpoint_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + int err; + + err = hw_breakpoint_arch_parse(bp, attr, hw); + if (err) + return err; if (arch_check_bp_in_kernelspace(bp)) { - if (bp->attr.exclude_kernel) + if (attr->exclude_kernel) return -EINVAL; /* * Don't let unprivileged users set a breakpoint in the trap @@ -424,19 +443,22 @@ static int validate_hw_breakpoint(struct perf_event *bp) int register_perf_hw_breakpoint(struct perf_event *bp) { - int ret; - - ret = reserve_bp_slot(bp); - if (ret) - return ret; + struct arch_hw_breakpoint hw; + int err; - ret = validate_hw_breakpoint(bp); + err = reserve_bp_slot(bp); + if (err) + return err; - /* if arch_validate_hwbkpt_settings() fails then release bp slot */ - if (ret) + err = hw_breakpoint_parse(bp, &bp->attr, &hw); + if (err) { release_bp_slot(bp); + return err; + } - return ret; + bp->hw.info = hw; + + return 0; } /** @@ -464,6 +486,7 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; bool modify = attr->bp_type != old_type; + struct arch_hw_breakpoint hw; int err = 0; bp->attr.bp_addr = attr->bp_addr; @@ -473,7 +496,7 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a if (check && memcmp(&bp->attr, attr, sizeof(*attr))) return -EINVAL; - err = validate_hw_breakpoint(bp); + err = hw_breakpoint_parse(bp, attr, &hw); if (!err && modify) err = modify_bp_slot(bp, old_type); @@ -484,7 +507,9 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a return err; } + bp->hw.info = hw; bp->attr.disabled = attr->disabled; + return 0; } -- cgit From 8e983ff9ac02a8fb454ed09c2462bdb3617006a8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:49 +0200 Subject: perf/hw_breakpoint: Pass arch breakpoint struct to arch_check_bp_in_kernelspace() We can't pass the breakpoint directly on arch_check_bp_in_kernelspace() anymore because its architecture internal datas (struct arch_hw_breakpoint) are not yet filled by the time we call the function, and most implementation need this backend to be up to date. So arrange the function to take the probing struct instead. Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/hw_breakpoint.h | 2 +- arch/arm/kernel/hw_breakpoint.c | 11 +++-- arch/arm64/include/asm/hw_breakpoint.h | 2 +- arch/arm64/kernel/hw_breakpoint.c | 9 ++-- arch/powerpc/include/asm/hw_breakpoint.h | 2 +- arch/powerpc/kernel/hw_breakpoint.c | 6 +-- arch/sh/include/asm/hw_breakpoint.h | 2 +- arch/sh/kernel/hw_breakpoint.c | 9 ++-- arch/x86/include/asm/hw_breakpoint.h | 2 +- arch/x86/kernel/hw_breakpoint.c | 71 +++++++++++++++++--------------- arch/xtensa/include/asm/hw_breakpoint.h | 2 +- arch/xtensa/kernel/hw_breakpoint.c | 7 ++-- kernel/events/hw_breakpoint.c | 2 +- 13 files changed, 63 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/hw_breakpoint.h b/arch/arm/include/asm/hw_breakpoint.h index e46e4e7bdba3..d5a0f52c6755 100644 --- a/arch/arm/include/asm/hw_breakpoint.h +++ b/arch/arm/include/asm/hw_breakpoint.h @@ -117,7 +117,7 @@ struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, int *gen_len, int *gen_type); -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 629e25152c0d..385dcf45d64b 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -456,14 +456,13 @@ static int get_hbp_len(u8 hbp_len) /* * Check whether bp virtual address is in kernel space. */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned int len; unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - va = info->address; - len = get_hbp_len(info->ctrl.len); + va = hw->address; + len = get_hbp_len(hw->ctrl.len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -576,7 +575,7 @@ static int arch_build_bp_info(struct perf_event *bp) /* Privilege */ info->ctrl.privilege = ARM_BREAKPOINT_USER; - if (arch_check_bp_in_kernelspace(bp)) + if (arch_check_bp_in_kernelspace(info)) info->ctrl.privilege |= ARM_BREAKPOINT_PRIV; /* Enabled? */ @@ -640,7 +639,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) return -EINVAL; /* We don't allow mismatch breakpoints in kernel space. */ - if (arch_check_bp_in_kernelspace(bp)) + if (arch_check_bp_in_kernelspace(info)) return -EPERM; /* diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 41770766d964..9f4a3d48762e 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -124,7 +124,7 @@ struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, int *gen_len, int *gen_type, int *offset); -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 413dbe530da8..6a90d12e4ff2 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -343,14 +343,13 @@ static int get_hbp_len(u8 hbp_len) /* * Check whether bp virtual address is in kernel space. */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned int len; unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - va = info->address; - len = get_hbp_len(info->ctrl.len); + va = hw->address; + len = get_hbp_len(hw->ctrl.len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -502,7 +501,7 @@ static int arch_build_bp_info(struct perf_event *bp) * Note that we disallow combined EL0/EL1 breakpoints because * that would complicate the stepping code. */ - if (arch_check_bp_in_kernelspace(bp)) + if (arch_check_bp_in_kernelspace(info)) info->ctrl.privilege = AARCH64_BREAKPOINT_EL1; else info->ctrl.privilege = AARCH64_BREAKPOINT_EL0; diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 8e7b09703ca4..4d0b1bfcdfd0 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -60,7 +60,7 @@ struct perf_sample_data; extern int hw_breakpoint_slots(int type); extern int arch_bp_generic_fields(int type, int *gen_bp_type); -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 80547dad37da..899bcec3f3c8 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -119,11 +119,9 @@ void arch_unregister_hw_breakpoint(struct perf_event *bp) /* * Check for virtual address in kernel space. */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - return is_kernel_addr(info->address); + return is_kernel_addr(hw->address); } int arch_bp_generic_fields(int type, int *gen_bp_type) diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 7431c172c0cb..8a88ed0b3cd7 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -54,7 +54,7 @@ static inline int hw_breakpoint_slots(int type) } /* arch/sh/kernel/hw_breakpoint.c */ -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c index 8648ed05ccf0..38791fefdd0c 100644 --- a/arch/sh/kernel/hw_breakpoint.c +++ b/arch/sh/kernel/hw_breakpoint.c @@ -124,14 +124,13 @@ static int get_hbp_len(u16 hbp_len) /* * Check for virtual address in kernel space. */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned int len; unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - va = info->address; - len = get_hbp_len(info->len); + va = hw->address; + len = get_hbp_len(hw->len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -346,7 +345,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); /* Deliver the signal to userspace */ - if (!arch_check_bp_in_kernelspace(bp)) { + if (!arch_check_bp_in_kernelspace(&bp->hw.info)) { force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)NULL, current); } diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index f59c39835a5a..78924596f51f 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -52,7 +52,7 @@ static inline int hw_breakpoint_slots(int type) struct perf_event; struct pmu; -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8771766d46b6..c43379188f4f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) set_dr_addr_mask(0, i); } -/* - * Check for virtual address in kernel space. - */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +static int arch_bp_generic_len(int x86_len) { - unsigned int len; - unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - va = info->address; - len = bp->attr.bp_len; - - /* - * We don't need to worry about va + len - 1 overflowing: - * we already require that va is aligned to a multiple of len. - */ - return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { + int len; + /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: @@ -211,28 +212,32 @@ int arch_bp_generic_fields(int x86_len, int x86_type, } /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_1: - *gen_len = HW_BREAKPOINT_LEN_1; - break; - case X86_BREAKPOINT_LEN_2: - *gen_len = HW_BREAKPOINT_LEN_2; - break; - case X86_BREAKPOINT_LEN_4: - *gen_len = HW_BREAKPOINT_LEN_4; - break; -#ifdef CONFIG_X86_64 - case X86_BREAKPOINT_LEN_8: - *gen_len = HW_BREAKPOINT_LEN_8; - break; -#endif - default: + len = arch_bp_generic_len(x86_len); + if (len < 0) return -EINVAL; - } + *gen_len = len; return 0; } +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) +{ + unsigned long va; + int len; + + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} static int arch_build_bp_info(struct perf_event *bp) { diff --git a/arch/xtensa/include/asm/hw_breakpoint.h b/arch/xtensa/include/asm/hw_breakpoint.h index dbe3053b284a..2525bf6816a6 100644 --- a/arch/xtensa/include/asm/hw_breakpoint.h +++ b/arch/xtensa/include/asm/hw_breakpoint.h @@ -35,7 +35,7 @@ struct pt_regs; struct task_struct; int hw_breakpoint_slots(int type); -int arch_check_bp_in_kernelspace(struct perf_event *bp); +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); int arch_validate_hwbkpt_settings(struct perf_event *bp); int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/xtensa/kernel/hw_breakpoint.c b/arch/xtensa/kernel/hw_breakpoint.c index b35656ab7dbd..6e34c3848885 100644 --- a/arch/xtensa/kernel/hw_breakpoint.c +++ b/arch/xtensa/kernel/hw_breakpoint.c @@ -33,14 +33,13 @@ int hw_breakpoint_slots(int type) } } -int arch_check_bp_in_kernelspace(struct perf_event *bp) +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned int len; unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - va = info->address; - len = bp->attr.bp_len; + va = hw->address; + len = hw->len; return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 314e2a9040c7..89fe94c9214c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -427,7 +427,7 @@ static int hw_breakpoint_parse(struct perf_event *bp, if (err) return err; - if (arch_check_bp_in_kernelspace(bp)) { + if (arch_check_bp_in_kernelspace(hw)) { if (attr->exclude_kernel) return -EINVAL; /* -- cgit From cffbb3bd444bc95186b6ab0ed3bf1d5bcf4aa620 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:57 +0200 Subject: perf/hw_breakpoint: Remove default hw_breakpoint_arch_parse() All architectures have implemented it, we can now remove the poor weak version. Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/hw_breakpoint.h | 1 - arch/arm64/include/asm/hw_breakpoint.h | 1 - arch/powerpc/include/asm/hw_breakpoint.h | 1 - arch/sh/include/asm/hw_breakpoint.h | 1 - arch/x86/include/asm/hw_breakpoint.h | 1 - arch/xtensa/include/asm/hw_breakpoint.h | 1 - kernel/events/hw_breakpoint.c | 17 ----------------- 7 files changed, 23 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/hw_breakpoint.h b/arch/arm/include/asm/hw_breakpoint.h index 1e02925c11f4..ac54c06764e6 100644 --- a/arch/arm/include/asm/hw_breakpoint.h +++ b/arch/arm/include/asm/hw_breakpoint.h @@ -122,7 +122,6 @@ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); -#define hw_breakpoint_arch_parse hw_breakpoint_arch_parse extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index bf9c305daa62..6a53e59ced95 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -129,7 +129,6 @@ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); -#define hw_breakpoint_arch_parse hw_breakpoint_arch_parse extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 38ae180610bd..27d6e3c8fde9 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -65,7 +65,6 @@ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); -#define hw_breakpoint_arch_parse hw_breakpoint_arch_parse extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); int arch_install_hw_breakpoint(struct perf_event *bp); diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 867edcc60e8f..199d17b765f2 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -58,7 +58,6 @@ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); -#define hw_breakpoint_arch_parse hw_breakpoint_arch_parse extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 6c88e8e22678..a1f0e90d0818 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -57,7 +57,6 @@ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); extern int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); -#define hw_breakpoint_arch_parse hw_breakpoint_arch_parse extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/xtensa/include/asm/hw_breakpoint.h b/arch/xtensa/include/asm/hw_breakpoint.h index f347c2132e6b..9f119c1ca0b5 100644 --- a/arch/xtensa/include/asm/hw_breakpoint.h +++ b/arch/xtensa/include/asm/hw_breakpoint.h @@ -40,7 +40,6 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw); -#define hw_breakpoint_arch_parse hw_breakpoint_arch_parse int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 89fe94c9214c..e7bc8d0a5972 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -400,23 +400,6 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } -#ifndef hw_breakpoint_arch_parse -int hw_breakpoint_arch_parse(struct perf_event *bp, - const struct perf_event_attr *attr, - struct arch_hw_breakpoint *hw) -{ - int err; - - err = arch_validate_hwbkpt_settings(bp); - if (err) - return err; - - *hw = bp->hw.info; - - return 0; -} -#endif - static int hw_breakpoint_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) -- cgit From cb8b78815b68b048303f55c276d2aef147e8f2e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:58 +0200 Subject: perf/hw_breakpoint: Pass new breakpoint type to modify_breakpoint_slot() We soon won't be able to rely on bp->attr anymore to get the new type of the modifying breakpoint because the new attributes are going to be copied only once we successfully modified the breakpoint slot. This will fix the current misdesigned layout where the new attr are copied to the modifying breakpoint before we actually know if the modification will be validated. In order to prepare for that, allow modify_breakpoint_slot() to take the new breakpoint type. Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-12-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index e7bc8d0a5972..71387703b5f1 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -345,13 +345,13 @@ void release_bp_slot(struct perf_event *bp) mutex_unlock(&nr_bp_mutex); } -static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int err; __release_bp_slot(bp, old_type); - err = __reserve_bp_slot(bp, bp->attr.bp_type); + err = __reserve_bp_slot(bp, new_type); if (err) { /* * Reserve the old_type slot back in case @@ -367,12 +367,12 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type) return err; } -static int modify_bp_slot(struct perf_event *bp, u64 old_type) +static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int ret; mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type); + ret = __modify_bp_slot(bp, old_type, new_type); mutex_unlock(&nr_bp_mutex); return ret; } @@ -481,7 +481,7 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a err = hw_breakpoint_parse(bp, attr, &hw); if (!err && modify) - err = modify_bp_slot(bp, old_type); + err = modify_bp_slot(bp, old_type, bp->attr.bp_type); if (err) { bp->attr.bp_addr = old_addr; -- cgit From 26c6ccdf5c06aa83bb78518c72cb287f043db3df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:59 +0200 Subject: perf/hw_breakpoint: Clean up and consolidate modify_user_hw_breakpoint_check() Remove the dance around old and new attributes. Just don't modify the previous breakpoint at all until we have verified everything. Original-patch-by: Andy Lutomirski Reported-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-13-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 44 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 71387703b5f1..b3814fce5ecb 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -461,37 +461,43 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +static void hw_breakpoint_copy_attr(struct perf_event_attr *to, + struct perf_event_attr *from) +{ + to->bp_addr = from->bp_addr; + to->bp_type = from->bp_type; + to->bp_len = from->bp_len; + to->disabled = from->disabled; +} + int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - bool modify = attr->bp_type != old_type; struct arch_hw_breakpoint hw; - int err = 0; + int err; - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; + err = hw_breakpoint_parse(bp, attr, &hw); + if (err) + return err; - if (check && memcmp(&bp->attr, attr, sizeof(*attr))) - return -EINVAL; + if (check) { + struct perf_event_attr old_attr; - err = hw_breakpoint_parse(bp, attr, &hw); - if (!err && modify) - err = modify_bp_slot(bp, old_type, bp->attr.bp_type); + old_attr = bp->attr; + hw_breakpoint_copy_attr(&old_attr, attr); + if (memcmp(&old_attr, attr, sizeof(*attr))) + return -EINVAL; + } - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - return err; + if (bp->attr.bp_type != attr->bp_type) { + err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type); + if (err) + return err; } + hw_breakpoint_copy_attr(&bp->attr, attr); bp->hw.info = hw; - bp->attr.disabled = attr->disabled; return 0; } -- cgit From fdb5c4531c1e0e50e609df83f736b6f3a02896e2 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Jun 2018 00:04:24 +0100 Subject: bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF If the kernel is compiled with CONFIG_CGROUP_BPF not enabled, it is not possible to attach, detach or query IR BPF programs to /dev/lircN devices, making them impossible to use. For embedded devices, it should be possible to use IR decoding without cgroups or CONFIG_CGROUP_BPF enabled. This change requires some refactoring, since bpf_prog_{attach,detach,query} functions are now always compiled, but their code paths for cgroups need moving out. Rather than a #ifdef CONFIG_CGROUP_BPF in kernel/bpf/syscall.c, moving them to kernel/bpf/cgroup.c and kernel/bpf/sockmap.c does not require #ifdefs since that is already conditionally compiled. Fixes: f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2") Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 14 +------ include/linux/bpf-cgroup.h | 26 ++++++++++++ include/linux/bpf.h | 8 ++++ include/linux/bpf_lirc.h | 5 ++- kernel/bpf/cgroup.c | 54 +++++++++++++++++++++++++ kernel/bpf/sockmap.c | 18 +++++++++ kernel/bpf/syscall.c | 99 ++++++++++----------------------------------- 7 files changed, 132 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 40826bba06b6..fcfab6635f9c 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -207,29 +207,19 @@ void lirc_bpf_free(struct rc_dev *rcdev) bpf_prog_array_free(rcdev->raw->progs); } -int lirc_prog_attach(const union bpf_attr *attr) +int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - struct bpf_prog *prog; struct rc_dev *rcdev; int ret; if (attr->attach_flags) return -EINVAL; - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_LIRC_MODE2); - if (IS_ERR(prog)) - return PTR_ERR(prog); - rcdev = rc_dev_get_from_fd(attr->target_fd); - if (IS_ERR(rcdev)) { - bpf_prog_put(prog); + if (IS_ERR(rcdev)) return PTR_ERR(rcdev); - } ret = lirc_bpf_attach(rcdev, prog); - if (ret) - bpf_prog_put(prog); put_device(&rcdev->dev); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 975fb4cf1bb7..79795c5fa7c3 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, \ __ret; \ }) +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog); +int cgroup_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype); +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); #else +struct bpf_prog; struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EINVAL; +} + +static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7df32a3200f7..8827e797ff97 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -696,6 +696,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -714,6 +716,12 @@ static inline int sock_map_prog(struct bpf_map *map, { return -EOPNOTSUPP; } + +static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + return -EINVAL; +} #endif #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h index 5f8a4283092d..9d9ff755ec29 100644 --- a/include/linux/bpf_lirc.h +++ b/include/linux/bpf_lirc.h @@ -5,11 +5,12 @@ #include #ifdef CONFIG_BPF_LIRC_MODE2 -int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int lirc_prog_detach(const union bpf_attr *attr); int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -static inline int lirc_prog_attach(const union bpf_attr *attr) +static inline int lirc_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) { return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f7c00bd6f8e4..3d83ee7df381 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_query(cgrp, attr, uattr); + + cgroup_put(cgrp); + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 52a91d816c0e..81d0c55a77aa 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1915,6 +1915,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return 0; } +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = sock_map_prog(map, prog, attr->attach_type); + fdput(f); + return err; +} + static void *sock_map_lookup(struct bpf_map *map, void *key) { return NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..d10ecd78105f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1483,8 +1483,6 @@ out_free_tp: return err; } -#ifdef CONFIG_CGROUP_BPF - static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -1499,40 +1497,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, - int type, bool attach) -{ - struct bpf_prog *prog = NULL; - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, type); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); - } - } - - err = sock_map_prog(map, prog, attr->attach_type); - if (err) { - fdput(f); - if (prog) - bpf_prog_put(prog); - return err; - } - - fdput(f); - return 0; -} - #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1540,7 +1504,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - struct cgroup *cgrp; int ret; if (!capable(CAP_NET_ADMIN)) @@ -1577,12 +1540,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); + ptype = BPF_PROG_TYPE_SK_MSG; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + ptype = BPF_PROG_TYPE_SK_SKB; + break; case BPF_LIRC_MODE2: - return lirc_prog_attach(attr); + ptype = BPF_PROG_TYPE_LIRC_MODE2; + break; default: return -EINVAL; } @@ -1596,18 +1562,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); + switch (ptype) { + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + ret = sockmap_get_from_fd(attr, ptype, prog); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + ret = lirc_prog_attach(attr, prog); + break; + default: + ret = cgroup_bpf_prog_attach(attr, ptype, prog); } - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, - attr->attach_flags); if (ret) bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; } @@ -1616,9 +1584,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - struct bpf_prog *prog; - struct cgroup *cgrp; - int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1651,29 +1616,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog)) - prog = NULL; - - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); - if (prog) - bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; + return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -1681,9 +1634,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - struct cgroup *cgrp; - int ret; - if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) @@ -1711,14 +1661,9 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; + + return cgroup_bpf_prog_query(attr, uattr); } -#endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -2365,7 +2310,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; -#ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; @@ -2375,7 +2319,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr); break; -#endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; -- cgit From 4bc8d55574dd316e43975651b9259c5c18d741fc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 15:13:56 -0800 Subject: rcu: Add debugging info to assertion The WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp()) in rcu_gp_cleanup() triggers (inexplicably, of course) every so often. This commit therefore extracts more information. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aa7cade1b9f3..79c7fe978b17 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2084,7 +2084,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); rdp = this_cpu_ptr(rsp->rda); @@ -2294,6 +2295,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + rnp->completedqs = rnp->gpnum; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -3930,6 +3932,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) &rcu_fqs_class[i], fqs[i]); rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; + rnp->completedqs = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 78e051dffc5b..7365ac53fdd9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -87,6 +87,7 @@ struct rcu_node { unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ /* In leaf rcu_node, each bit corresponds to */ @@ -453,6 +454,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fd12039e512..17b67ecf7dff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -260,8 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * ->exp_tasks pointers, respectively, to reference the newly * blocked tasks. */ - if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(rnp->completedqs == rnp->gpnum); + } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != @@ -535,6 +537,8 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); + WARN_ON_ONCE(rnp->completedqs == rnp->gpnum && + (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -697,7 +701,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, @@ -841,6 +846,27 @@ void exit_rcu(void) __rcu_read_unlock(); } +/* + * Dump the blocked-tasks state, but limit the list dump to the + * specified number of elements. + */ +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +{ + int i; + struct list_head *lhp; + + lockdep_assert_held(&rnp->lock); + pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); + pr_cont("\t->blkd_tasks"); + i = 0; + list_for_each(lhp, &rnp->blkd_tasks) { + pr_cont(" %p", lhp); + if (++i >= 10) + break; + } + pr_cont("\n"); +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; @@ -949,6 +975,14 @@ void exit_rcu(void) { } +/* + * Dump the guaranteed-empty blocked-tasks state. Trust but verify. + */ +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +{ + WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST -- cgit From ce11fae8d43fe9a36823fbbfe7c44de775b7e346 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 9 Mar 2018 09:32:18 +0800 Subject: rcu: Use the proper lockdep annotation in dump_blkd_tasks() Sparse reported this: | kernel/rcu/tree_plugin.h:814:9: warning: incorrect type in argument 1 (different modifiers) | kernel/rcu/tree_plugin.h:814:9: expected struct lockdep_map const *lock | kernel/rcu/tree_plugin.h:814:9: got struct lockdep_map [noderef] * This is caused by using vanilla lockdep annotations on rcu_node::lock, and that requires accessing ->lock of rcu_node directly. However we need to keep rcu_node::lock __private to avoid breaking its extra ordering guarantee. And we have a dedicated lockdep annotation for rcu_node::lock, so use it. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 17b67ecf7dff..e387ea712758 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -855,7 +855,7 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int i; struct list_head *lhp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); pr_cont("\t->blkd_tasks"); i = 0; -- cgit From 8c42b1f39fdf9fde7cfc4024397255f31a860db6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Apr 2018 11:04:46 -0700 Subject: rcu: Exclude near-simultaneous RCU CPU stall warnings There is a two-jiffy delay between the time that a CPU will self-report an RCU CPU stall warning and the time that some other CPU will report a warning on behalf of the first CPU. This has worked well in the past, but on busy systems, it is possible for the two warnings to overlap, which makes interpreting them extremely difficult. This commit therefore uses a cmpxchg-based timing decision that allows only one report in a given one-minute period (assuming default stall-warning Kconfig parameters). This approach will of course fail if you are seeing minute-long vCPU preemption, but in that case the overlapping RCU CPU stall warnings are the least of your worries. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79c7fe978b17..b1fffa21b9e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1368,7 +1368,6 @@ static inline void panic_on_rcu_stall(void) static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; - long delta; unsigned long flags; unsigned long gpa; unsigned long j; @@ -1381,18 +1380,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) if (rcu_cpu_stall_suppress) return; - /* Only let one CPU complain about others per time interval. */ - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - delta = jiffies - READ_ONCE(rsp->jiffies_stall); - if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - WRITE_ONCE(rsp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1441,6 +1428,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) sched_show_task(current); } } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); rcu_check_gp_kthread_starvation(rsp); @@ -1485,6 +1476,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1508,6 +1500,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long gpnum; unsigned long gps; unsigned long j; + unsigned long jn; unsigned long js; struct rcu_node *rnp; @@ -1546,14 +1539,17 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; if (rcu_gp_in_progress(rsp) && - (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); } else if (rcu_gp_in_progress(rsp) && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(rsp, gpnum); -- cgit From 36f10f55ff1d2867bfc48ed898a9cc0dc6b49dd2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sat, 23 Jun 2018 17:54:49 +0300 Subject: fsnotify: let connector point to an abstract object Make the code to attach/detach a connector to object more generic by letting the fsnotify connector point to an abstract fsnotify_connp_t. Code that needs to dereference an inode or mount object now uses the helpers fsnotify_conn_{inode,mount}. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fdinfo.c | 8 ++++---- fs/notify/fsnotify.h | 10 ++++++---- fs/notify/mark.c | 32 ++++++++++++++++---------------- include/linux/fsnotify_backend.h | 15 ++++++++++----- kernel/audit_tree.c | 17 +++++++++-------- 5 files changed, 45 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 10aac1942c9f..86fcf5814279 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -15,7 +15,7 @@ #include #include "inotify/inotify.h" -#include "../fs/mount.h" +#include "fsnotify.h" #if defined(CONFIG_PROC_FS) @@ -81,7 +81,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->connector->inode); + inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { /* * IN_ALL_EVENTS represents all of the mask bits @@ -117,7 +117,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) { - inode = igrab(mark->connector->inode); + inode = igrab(fsnotify_conn_inode(mark->connector)); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", @@ -127,7 +127,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_putc(m, '\n'); iput(inode); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { - struct mount *mnt = real_mount(mark->connector->mnt); + struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index caeee042d1cc..7902653dd577 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -9,14 +9,16 @@ #include "../mount.h" -static inline struct inode *fsnotify_obj_inode(fsnotify_connp_t *connp) +static inline struct inode *fsnotify_conn_inode( + struct fsnotify_mark_connector *conn) { - return container_of(connp, struct inode, i_fsnotify_marks); + return container_of(conn->obj, struct inode, i_fsnotify_marks); } -static inline struct mount *fsnotify_obj_mount(fsnotify_connp_t *connp) +static inline struct mount *fsnotify_conn_mount( + struct fsnotify_mark_connector *conn) { - return container_of(connp, struct mount, mnt_fsnotify_marks); + return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } /* destroy all events sitting in this groups notification queue */ diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 7abb73b5beba..959bc73aaae7 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -120,14 +120,14 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) new_mask |= mark->mask; } if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - conn->inode->i_fsnotify_mask = new_mask; + fsnotify_conn_inode(conn)->i_fsnotify_mask = new_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) - real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; + fsnotify_conn_mount(conn)->mnt_fsnotify_mask = new_mask; } /* * Calculate mask of events for a list of marks. The caller must make sure - * connector and connector->inode cannot disappear under us. Callers achieve + * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ @@ -140,7 +140,8 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) __fsnotify_recalc_mask(conn); spin_unlock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - __fsnotify_update_child_dentry_flags(conn->inode); + __fsnotify_update_child_dentry_flags( + fsnotify_conn_inode(conn)); } /* Free all connectors queued for freeing once SRCU period ends */ @@ -166,20 +167,20 @@ static struct inode *fsnotify_detach_connector_from_object( { struct inode *inode = NULL; + if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) + return NULL; + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { - inode = conn->inode; - rcu_assign_pointer(inode->i_fsnotify_marks, NULL); + inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; - conn->inode = NULL; - conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { - rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, - NULL); - real_mount(conn->mnt)->mnt_fsnotify_mask = 0; - conn->mnt = NULL; - conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; + fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } + rcu_assign_pointer(*(conn->obj), NULL); + conn->obj = NULL; + conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; + return inode; } @@ -448,10 +449,9 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->type = type; + conn->obj = connp; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - inode = conn->inode = igrab(fsnotify_obj_inode(connp)); - else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) - conn->mnt = &fsnotify_obj_mount(connp)->mnt; + inode = igrab(fsnotify_conn_inode(conn)); /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 8efb8663453d..381cfb0e67fa 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -255,6 +255,13 @@ FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) #define fsnotify_foreach_obj_type(type) \ for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++) +/* + * fsnotify_connp_t is what we embed in objects which connector can be attached + * to. fsnotify_connp_t * is how we refer from connector back to object. + */ +struct fsnotify_mark_connector; +typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; + /* * Inode / vfsmount point to this structure which tracks all marks attached to * the inode / vfsmount. The reference to inode / vfsmount is held by this @@ -264,17 +271,15 @@ FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) struct fsnotify_mark_connector { spinlock_t lock; unsigned int type; /* Type of object [lock] */ - union { /* Object pointer [lock] */ - struct inode *inode; - struct vfsmount *mnt; + union { + /* Object pointer [lock] */ + fsnotify_connp_t *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; -typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; - /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c99ebaae5abc..02feef939560 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -168,7 +168,8 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); /* Function to return search key in our hash from inode. */ static unsigned long inode_to_key(const struct inode *inode) { - return (unsigned long)inode; + /* Use address pointed to by connector->obj as the key */ + return (unsigned long)&inode->i_fsnotify_marks; } /* @@ -183,7 +184,7 @@ static unsigned long chunk_to_key(struct audit_chunk *chunk) */ if (WARN_ON_ONCE(!chunk->mark.connector)) return 0; - return (unsigned long)chunk->mark.connector->inode; + return (unsigned long)chunk->mark.connector->obj; } static inline struct list_head *chunk_hash(unsigned long key) @@ -258,7 +259,7 @@ static void untag_chunk(struct node *p) spin_lock(&entry->lock); /* * mark_mutex protects mark from getting detached and thus also from - * mark->connector->inode getting NULL. + * mark->connector->obj getting NULL. */ if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&entry->lock); @@ -288,8 +289,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_inode_mark_locked(&new->mark, entry->connector->inode, - 1)) { + if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj, + FSNOTIFY_OBJ_TYPE_INODE, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -423,7 +424,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_lock(&old_entry->lock); /* * mark_mutex protects mark from getting detached and thus also from - * mark->connector->inode getting NULL. + * mark->connector->obj getting NULL. */ if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ @@ -434,8 +435,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return -ENOENT; } - if (fsnotify_add_inode_mark_locked(chunk_entry, - old_entry->connector->inode, 1)) { + if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj, + FSNOTIFY_OBJ_TYPE_INODE, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); -- cgit From 375899cddcbb26881b03cb3fbdcfd600e4e67f4a Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 1 Jun 2018 14:26:42 +0530 Subject: printk: make sure to print log on console. This patch make sure printing of log on console if loglevel at time of storing log is less than current console loglevel. @why In SMP printk can work asynchronously, logs can be missed on console because it checks current log level at time of console_unlock, not at time of storing logs. func() { .... .... console_verbose(); // user wants to have all the logs on console. pr_alert(); dump_backtrace(); //prints with default loglevel. ... console_silent(); // stop all logs from printing on console. } Now if console_lock was owned by another process, the messages might be handled after the consoles were silenced. Reused flag LOG_NOCONS as its usage is gone long back by the commit 5c2992ee7fd8a29d0412 ("printk: remove console flushing special cases for partial buffered lines"). Note that there are still some corner cases where this patch is not enough. For example, when the messages are flushed later from printk_safe buffers or when there are races between console_verbose() and console_silent() callers. Link: http://lkml.kernel.org/r/20180601090029epcas5p3cc93d4bfbebb3199f0a2684058da7e26~z-a_jkmrI2993329933epcas5p3q@epcas5p3.samsung.com Cc: linux-kernel@vger.kernel.org Cc: a.sahrawat@samsung.com Cc: pankaj.m@samsung.com Cc: v.narang@samsung.com Cc: Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..3999c295d6f7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -349,7 +349,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1886,6 +1886,9 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; + if (suppress_message_printing(level)) + lflags |= LOG_NOCONS; + printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_unlock_irqrestore(flags); @@ -2349,11 +2352,10 @@ skip: break; msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (msg->flags & LOG_NOCONS) { /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. + * Skip record if !ignore_loglevel, and + * record has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; -- cgit From 63842c21347ecf4685d6ec008e667e61b0ead6e5 Mon Sep 17 00:00:00 2001 From: Namit Gupta Date: Wed, 20 Jun 2018 19:26:19 +0530 Subject: printk: Remove unnecessary kmalloc() from syslog during clear When the request is only for clearing logs, there is no need for allocation/deallocation. Only the indexes need to be reset and returned. Rest of the patch is mostly made up of changes because of indention. Link: http://lkml.kernel.org/r/20180620135951epcas5p3bd2a8f25ec689ca333bce861b527dba2~54wyKcT0_3155531555epcas5p3y@epcas5p3.samsung.com Cc: linux-kernel@vger.kernel.org Cc: pankaj.m@samsung.com Cc: a.sahrawat@samsung.com Signed-off-by: Namit Gupta Signed-off-by: Himanshu Maithani Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 109 ++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3999c295d6f7..16b02cc51a14 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1352,71 +1352,78 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; + u64 next_seq; + u64 seq; + u32 idx; + + if (!buf) { + if (clear) { + logbuf_lock_irq(); + clear_seq = log_next_seq; + clear_idx = log_next_idx; + logbuf_unlock_irq(); + } + return 0; + } text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; logbuf_lock_irq(); - if (buf) { - u64 next_seq; - u64 seq; - u32 idx; + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ + seq = clear_seq; + idx = clear_idx; + while (seq < log_next_seq) { + struct printk_log *msg = log_from_idx(idx); - /* - * Find first record that fits, including all following records, - * into the user-provided buffer for this dump. - */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, NULL, 0); - idx = log_next(idx); - seq++; - } + len += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } - /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); + /* move first record forward until length fits into the buffer */ + seq = clear_seq; + idx = clear_idx; + while (len > size && seq < log_next_seq) { + struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); - idx = log_next(idx); - seq++; - } + len -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } - /* last message fitting into this dump */ - next_seq = log_next_seq; + /* last message fitting into this dump */ + next_seq = log_next_seq; - len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen; + len = 0; + while (len >= 0 && seq < next_seq) { + struct printk_log *msg = log_from_idx(idx); + int textlen; - textlen = msg_print_text(msg, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (textlen < 0) { - len = textlen; - break; - } - idx = log_next(idx); - seq++; + textlen = msg_print_text(msg, true, text, + LOG_LINE_MAX + PREFIX_MAX); + if (textlen < 0) { + len = textlen; + break; + } + idx = log_next(idx); + seq++; - logbuf_unlock_irq(); - if (copy_to_user(buf + len, text, textlen)) - len = -EFAULT; - else - len += textlen; - logbuf_lock_irq(); + logbuf_unlock_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; + logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (seq < log_first_seq) { + /* messages are gone, move to next one */ + seq = log_first_seq; + idx = log_first_idx; } } -- cgit From d48de54a9dab5370edd2e991f78cc7996cf5483e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 28 Jun 2018 15:20:27 +0200 Subject: printk: Export is_console_locked This is a preparation patch for adding a number of WARN_CONSOLE_UNLOCKED() calls to the fbcon code, which may be built as a module (event though usually it is not). Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Acked-by: Petr Mladek Reviewed-by: Daniel Vetter Signed-off-by: Hans de Goede Signed-off-by: Bartlomiej Zolnierkiewicz --- kernel/printk/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..3f041e7cbfc9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2243,6 +2243,7 @@ int is_console_locked(void) { return console_locked; } +EXPORT_SYMBOL(is_console_locked); /* * Check if we have any console that is capable of printing while cpu is -- cgit From 65a8766f5f50a5cd342f3d8f77a30917648ddb67 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 14 Jun 2018 16:20:05 -0400 Subject: audit: check audit_enabled in audit_tree_log_remove_rule() Respect the audit_enabled flag when printing tree rule config change records. See: https://github.com/linux-audit/audit-kernel/issues/50 Signed-off-by: Richard Guy Briggs [PM: tweak the subject line] Signed-off-by: Paul Moore --- kernel/audit_tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c99ebaae5abc..9f6eaeb6919f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -497,6 +497,8 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) { struct audit_buffer *ab; + if (!audit_enabled) + return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; -- cgit From 4fa7f086993594e79f456a04656da2fcb5691209 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 14 Jun 2018 16:20:06 -0400 Subject: audit: simplify audit_enabled check in audit_watch_log_rule_change() Check the audit_enabled flag and bail immediately. This does not change the functionality, but brings the code format in line with similar checks in audit_tree_log_remove_rule(), audit_mark_log_rule_change(), and elsewhere in the audit code. See: https://github.com/linux-audit/audit-kernel/issues/50 Signed-off-by: Richard Guy Briggs [PM: tweaked subject line] Signed-off-by: Paul Moore --- kernel/audit_watch.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index c17c0c268436..6f249bdf2d84 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -238,20 +238,21 @@ out: static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) { - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); - if (unlikely(!ab)) - return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); - audit_log_format(ab, " path="); - audit_log_untrustedstring(ab, w->path); - audit_log_key(ab, r->filterkey); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } + struct audit_buffer *ab; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (!ab) + return; + audit_log_format(ab, "auid=%u ses=%u op=%s", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current), op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + audit_log_key(ab, r->filterkey); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); } /* Update inode info in audit rules based on filesystem event. */ -- cgit From 85782e037f8aba8922dadb24a1523ca0b82ab8bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 28 Jun 2018 23:34:59 +0200 Subject: bpf: undo prog rejection on read-only lock failure Partially undo commit 9facc336876f ("bpf: reject any prog that failed read-only lock") since it caused a regression, that is, syzkaller was able to manage to cause a panic via fault injection deep in set_memory_ro() path by letting an allocation fail: In x86's __change_page_attr_set_clr() it was able to change the attributes of the primary mapping but not in the alias mapping via cpa_process_alias(), so the second, inner call to the __change_page_attr() via __change_page_attr_set_clr() had to split a larger page and failed in the alloc_pages() with the artifically triggered allocation error which is then propagated down to the call site. Thus, for set_memory_ro() this means that it returned with an error, but from debugging a probe_kernel_write() revealed EFAULT on that memory since the primary mapping succeeded to get changed. Therefore the subsequent hdr->locked = 0 reset triggered the panic as it was performed on read-only memory, so call-site assumptions were infact wrong to assume that it would either succeed /or/ not succeed at all since there's no such rollback in set_memory_*() calls from partial change of mappings, in other words, we're left in a state that is "half done". A later undo via set_memory_rw() is succeeding though due to matching permissions on that part (aka due to the try_preserve_large_page() succeeding). While reproducing locally with explicitly triggering this error, the initial splitting only happens on rare occasions and in real world it would additionally need oom conditions, but that said, it could partially fail. Therefore, it is definitely wrong to bail out on set_memory_ro() error and reject the program with the set_memory_*() semantics we have today. Shouldn't have gone the extra mile since no other user in tree today infact checks for any set_memory_*() errors, e.g. neither module_enable_ro() / module_disable_ro() for module RO/NX handling which is mostly default these days nor kprobes core with alloc_insn_page() / free_insn_page() as examples that could be invoked long after bootup and original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") did neither when it got first introduced to BPF so "improving" with bailing out was clearly not right when set_memory_*() cannot handle it today. Kees suggested that if set_memory_*() can fail, we should annotate it with __must_check, and all callers need to deal with it gracefully given those set_memory_*() markings aren't "advisory", but they're expected to actually do what they say. This might be an option worth to move forward in future but would at the same time require that set_memory_*() calls from supporting archs are guaranteed to be "atomic" in that they provide rollback if part of the range fails, once that happened, the transition from RW -> RO could be made more robust that way, while subsequent RO -> RW transition /must/ continue guaranteeing to always succeed the undo part. Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock") Cc: Laura Abbott Cc: Kees Cook Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 56 ++++++++------------------------------------------ kernel/bpf/core.c | 30 +-------------------------- 2 files changed, 9 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20f2659dd829..300baad62c88 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -470,9 +470,7 @@ struct sock_fprog_kern { }; struct bpf_binary_header { - u16 pages; - u16 locked:1; - + u32 pages; /* Some arches need word alignment for their instructions */ u8 image[] __aligned(4); }; @@ -481,7 +479,7 @@ struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ - locked:1, /* Program image locked? */ + undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ @@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - fp->locked = 1; - if (set_memory_ro((unsigned long)fp, fp->pages)) - fp->locked = 0; -#endif + fp->undo_set_mem = 1; + set_memory_ro((unsigned long)fp, fp->pages); } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - if (fp->locked) { - WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); - /* In case set_memory_rw() fails, we want to be the first - * to crash here instead of some random place later on. - */ - fp->locked = 0; - } -#endif + if (fp->undo_set_mem) + set_memory_rw((unsigned long)fp, fp->pages); } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - hdr->locked = 1; - if (set_memory_ro((unsigned long)hdr, hdr->pages)) - hdr->locked = 0; -#endif + set_memory_ro((unsigned long)hdr, hdr->pages); } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - if (hdr->locked) { - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); - /* In case set_memory_rw() fails, we want to be the first - * to crash here instead of some random place later on. - */ - hdr->locked = 0; - } -#endif + set_memory_rw((unsigned long)hdr, hdr->pages); } static inline struct bpf_binary_header * @@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp) return (void *)addr; } -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) -{ - if (!fp->locked) - return -ENOLCK; - if (fp->jited) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - - if (!hdr->locked) - return -ENOLCK; - } - - return 0; -} -#endif - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a9e6c04d0f4a..1e5625d46414 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; - hdr->locked = 0; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) -{ -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - int i, err; - - for (i = 0; i < fp->aux->func_cnt; i++) { - err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); - if (err) - return err; - } - - return bpf_prog_check_pages_ro_single(fp); -#endif - return 0; -} - static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -1524,17 +1506,7 @@ finalize: * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - if (*err) - return fp; - - /* Checkpoint: at this point onwards any cBPF -> eBPF or - * native eBPF program is read-only. If we failed to change - * the page attributes (e.g. allocation failure from - * splitting large pages), then reject the whole program - * in order to guarantee not ending up with any W+X pages - * from BPF side in kernel. - */ - *err = bpf_prog_check_pages_ro_locked(fp); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -- cgit From 9901c5d77e969d8215a8e8d087ef02e6feddc84c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:36 -0700 Subject: bpf: sockmap, fix crash when ipv6 sock is added This fixes a crash where we assign tcp_prot to IPv6 sockets instead of tcpv6_prot. Previously we overwrote the sk->prot field with tcp_prot even in the AF_INET6 case. This patch ensures the correct tcp_prot and tcpv6_prot are used. Tested with 'netserver -6' and 'netperf -H [IPv6]' as well as 'netperf -H [IPv4]'. The ESTABLISHED check resolves the previously crashing case here. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: syzbot+5c063698bdbfac19f363@syzkaller.appspotmail.com Acked-by: Martin KaFai Lau Signed-off-by: John Fastabend Signed-off-by: Wei Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 58 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 81d0c55a77aa..bfdfbd199c3b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -140,6 +140,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { @@ -161,7 +162,42 @@ out: return !empty; } -static struct proto tcp_bpf_proto; +enum { + SOCKMAP_IPV4, + SOCKMAP_IPV6, + SOCKMAP_NUM_PROTS, +}; + +enum { + SOCKMAP_BASE, + SOCKMAP_TX, + SOCKMAP_NUM_CONFIGS, +}; + +static struct proto *saved_tcpv6_prot __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], + struct proto *base) +{ + prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].close = bpf_tcp_close; + prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; + prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; + + prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; + prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; + prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; +} + +static void update_sk_prot(struct sock *sk, struct smap_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; + + sk->sk_prot = &bpf_tcp_prots[family][conf]; +} + static int bpf_tcp_init(struct sock *sk) { struct smap_psock *psock; @@ -181,14 +217,17 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; - if (psock->bpf_tx_msg) { - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; - tcp_bpf_proto.sendpage = bpf_tcp_sendpage; - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ + if (sk->sk_family == AF_INET6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + spin_unlock_bh(&tcpv6_prot_lock); } - - sk->sk_prot = &tcp_bpf_proto; + update_sk_prot(sk, psock); rcu_read_unlock(); return 0; } @@ -1111,8 +1150,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, static int bpf_tcp_ulp_register(void) { - tcp_bpf_proto = tcp_prot; - tcp_bpf_proto.close = bpf_tcp_close; + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); /* Once BPF TX ULP is registered it is never unregistered. It * will be in the ULP list for the lifetime of the system. Doing * duplicate registers is not a problem. -- cgit From 54fedb42c6537dcb0102e4a58a88456a6286999d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:41 -0700 Subject: bpf: sockmap, fix smap_list_map_remove when psock is in many maps If a hashmap is free'd with open socks it removes the reference to the hash entry from the psock. If that is the last reference to the psock then it will also be free'd by the reference counting logic. However the current logic that removes the hash reference from the list of references is broken. In smap_list_remove() we first check if the sockmap entry matches and then check if the hashmap entry matches. But, the sockmap entry sill always match because its NULL in this case which causes the first entry to be removed from the list. If this is always the "right" entry (because the user adds/removes entries in order) then everything is OK but otherwise a subsequent bpf_tcp_close() may reference a free'd object. To fix this create two list handlers one for sockmap and one for sockhash. Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Acked-by: Martin KaFai Lau Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index bfdfbd199c3b..65a937ed5762 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1602,17 +1602,27 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, - struct sock **entry, - struct htab_elem *hash_link) +static void smap_list_map_remove(struct smap_psock *psock, + struct sock **entry) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry || e->hash_link == hash_link) { + if (e->entry == entry) + list_del(&e->list); + } +} + +static void smap_list_hash_remove(struct smap_psock *psock, + struct htab_elem *hash_link) +{ + struct smap_psock_map_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + struct htab_elem *c = e->hash_link; + + if (c == hash_link) list_del(&e->list); - break; - } } } @@ -1647,7 +1657,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i], NULL); + smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1706,7 +1716,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k], NULL); + smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1908,7 +1918,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i], NULL); + smap_list_map_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); } @@ -2142,7 +2152,7 @@ static void sock_hash_free(struct bpf_map *map) * (psock) to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -2322,7 +2332,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, psock = smap_psock_sk(l_old->sk); hlist_del_rcu(&l_old->hash_node); - smap_list_remove(psock, NULL, l_old); + smap_list_hash_remove(psock, l_old); smap_release_sock(psock, l_old->sk); free_htab_elem(htab, l_old); } @@ -2390,7 +2400,7 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); -- cgit From e9db4ef6bf4ca9894bb324c76e01b8f1a16b2650 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:47 -0700 Subject: bpf: sockhash fix omitted bucket lock in sock_close First the sk_callback_lock() was being used to protect both the sock callback hooks and the psock->maps list. This got overly convoluted after the addition of sockhash (in sockmap it made some sense because masp and callbacks were tightly coupled) so lets split out a specific lock for maps and only use the callback lock for its intended purpose. This fixes a couple cases where we missed using maps lock when it was in fact needed. Also this makes it easier to follow the code because now we can put the locking closer to the actual code its serializing. Next, in sock_hash_delete_elem() the pattern was as follows, sock_hash_delete_elem() [...] spin_lock(bucket_lock) l = lookup_elem_raw() if (l) hlist_del_rcu() write_lock(sk_callback_lock) .... destroy psock ... write_unlock(sk_callback_lock) spin_unlock(bucket_lock) The ordering is necessary because we only know the {p}sock after dereferencing the hash table which we can't do unless we have the bucket lock held. Once we have the bucket lock and the psock element it is deleted from the hashmap to ensure any other path doing a lookup will fail. Finally, the refcnt is decremented and if zero the psock is destroyed. In parallel with the above (or free'ing the map) a tcp close event may trigger tcp_close(). Which at the moment omits the bucket lock altogether (oops!) where the flow looks like this, bpf_tcp_close() [...] write_lock(sk_callback_lock) for each psock->maps // list of maps this sock is part of hlist_del_rcu(ref_hash_node); .... destroy psock ... write_unlock(sk_callback_lock) Obviously, and demonstrated by syzbot, this is broken because we can have multiple threads deleting entries via hlist_del_rcu(). To fix this we might be tempted to wrap the hlist operation in a bucket lock but that would create a lock inversion problem. In summary to follow locking rules the psocks maps list needs the sk_callback_lock (after this patch maps_lock) but we need the bucket lock to do the hlist_del_rcu. To resolve the lock inversion problem pop the head of the maps list repeatedly and remove the reference until no more are left. If a delete happens in parallel from the BPF API that is OK as well because it will do a similar action, lookup the lock in the map/hash, delete it from the map/hash, and dec the refcnt. We check for this case before doing a destroy on the psock to ensure we don't have two threads tearing down a psock. The new logic is as follows, bpf_tcp_close() e = psock_map_pop(psock->maps) // done with map lock bucket_lock() // lock hash list bucket l = lookup_elem_raw(head, hash, key, key_size); if (l) { //only get here if elmnt was not already removed hlist_del_rcu() ... destroy psock... } bucket_unlock() And finally for all the above to work add missing locking around map operations per above. Then add RCU annotations and use rcu_dereference/rcu_assign_pointer to manage values relying on RCU so that the object is not free'd from sock_hash_free() while it is being referenced in bpf_tcp_close(). Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 145 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 65a937ed5762..ac09b35a9567 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -72,6 +72,7 @@ struct bpf_htab { u32 n_buckets; u32 elem_size; struct bpf_sock_progs progs; + struct rcu_head rcu; }; struct htab_elem { @@ -89,8 +90,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; - struct htab_elem *hash_link; - struct bpf_htab *htab; + struct htab_elem __rcu *hash_link; + struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -120,6 +121,7 @@ struct smap_psock { struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; + spinlock_t maps_lock; /* Back reference used when sock callback trigger sockmap operations */ struct sock *sock; @@ -258,16 +260,54 @@ out: rcu_read_unlock(); } +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { atomic_dec(&htab->count); kfree_rcu(l, rcu); } +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, + struct smap_psock *psock) +{ + struct smap_psock_map_entry *e; + + spin_lock_bh(&psock->maps_lock); + e = list_first_entry_or_null(&psock->maps, + struct smap_psock_map_entry, + list); + if (e) + list_del(&e->list); + spin_unlock_bh(&psock->maps_lock); + return e; +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock_map_entry *e, *tmp; + struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -286,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout) */ close_fun = psock->save_close; - write_lock_bh(&sk->sk_callback_lock); if (psock->cork) { free_start_sg(psock->sock, psock->cork); kfree(psock->cork); @@ -299,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(md); } - list_for_each_entry_safe(e, tmp, &psock->maps, list) { + e = psock_map_pop(sk, psock); + while (e) { if (e->entry) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { - list_del(&e->list); smap_release_sock(psock, sk); } } else { - hlist_del_rcu(&e->hash_link->hash_node); - smap_release_sock(psock, e->hash_link->sk); - free_htab_elem(e->htab, e->hash_link); + struct htab_elem *link = rcu_dereference(e->hash_link); + struct bpf_htab *htab = rcu_dereference(e->htab); + struct hlist_head *head; + struct htab_elem *l; + struct bucket *b; + + b = __select_bucket(htab, link->hash); + head = &b->head; + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, + link->hash, link->key, + htab->map.key_size); + /* If another thread deleted this object skip deletion. + * The refcnt on psock may or may not be zero. + */ + if (l) { + hlist_del_rcu(&link->hash_node); + smap_release_sock(psock, link->sk); + free_htab_elem(htab, link); + } + raw_spin_unlock_bh(&b->lock); } + e = psock_map_pop(sk, psock); } - write_unlock_bh(&sk->sk_callback_lock); rcu_read_unlock(); close_fun(sk, timeout); } @@ -1395,7 +1452,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { tcp_cleanup_ulp(sock); + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); call_rcu_sched(&psock->rcu, smap_destroy_psock); @@ -1546,6 +1605,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node) INIT_LIST_HEAD(&psock->maps); INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); + spin_lock_init(&psock->maps_lock); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -1607,10 +1667,12 @@ static void smap_list_map_remove(struct smap_psock *psock, { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { if (e->entry == entry) list_del(&e->list); } + spin_unlock_bh(&psock->maps_lock); } static void smap_list_hash_remove(struct smap_psock *psock, @@ -1618,12 +1680,14 @@ static void smap_list_hash_remove(struct smap_psock *psock, { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - struct htab_elem *c = e->hash_link; + struct htab_elem *c = rcu_dereference(e->hash_link); if (c == hash_link) list_del(&e->list); } + spin_unlock_bh(&psock->maps_lock); } static void sock_map_free(struct bpf_map *map) @@ -1649,7 +1713,6 @@ static void sock_map_free(struct bpf_map *map) if (!sock) continue; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1660,7 +1723,6 @@ static void sock_map_free(struct bpf_map *map) smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -1709,7 +1771,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); if (!psock) goto out; @@ -1719,7 +1780,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: - write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -1800,7 +1860,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, } } - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* 2. Do not allow inheriting programs if psock exists and has @@ -1857,7 +1916,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (err) goto out_free; smap_init_progs(psock, verdict, parse); + write_lock_bh(&sock->sk_callback_lock); smap_start_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } /* 4. Place psock in sockmap for use and stop any programs on @@ -1867,9 +1928,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, */ if (map_link) { e->entry = map_link; + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); } - write_unlock_bh(&sock->sk_callback_lock); return err; out_free: smap_release_sock(psock, sock); @@ -1880,7 +1942,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; } @@ -1917,10 +1978,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (osock) { struct smap_psock *opsock = smap_psock_sk(osock); - write_lock_bh(&osock->sk_callback_lock); smap_list_map_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); } out: return err; @@ -2109,14 +2168,13 @@ free_htab: return ERR_PTR(err); } -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +static void __bpf_htab_free(struct rcu_head *rcu) { - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} + struct bpf_htab *htab; -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; + htab = container_of(rcu, struct bpf_htab, rcu); + bpf_map_area_free(htab->buckets); + kfree(htab); } static void sock_hash_free(struct bpf_map *map) @@ -2135,16 +2193,18 @@ static void sock_hash_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); + struct bucket *b = __select_bucket(htab, i); + struct hlist_head *head; struct hlist_node *n; struct htab_elem *l; + raw_spin_lock_bh(&b->lock); + head = &b->head; hlist_for_each_entry_safe(l, n, head, hash_node) { struct sock *sock = l->sk; struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get * the sk_callback_lock before this case but after xchg @@ -2155,13 +2215,12 @@ static void sock_hash_free(struct bpf_map *map) smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); - kfree(l); + free_htab_elem(htab, l); } + raw_spin_unlock_bh(&b->lock); } rcu_read_unlock(); - bpf_map_area_free(htab->buckets); - kfree(htab); + call_rcu(&htab->rcu, __bpf_htab_free); } static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, @@ -2188,19 +2247,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, return l_new; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - static inline u32 htab_map_hash(const void *key, u32 key_len) { return jhash(key, key_len, 0); @@ -2320,9 +2366,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - e->hash_link = l_new; - e->htab = container_of(map, struct bpf_htab, map); + rcu_assign_pointer(e->hash_link, l_new); + rcu_assign_pointer(e->htab, + container_of(map, struct bpf_htab, map)); + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); /* add new element to the head of the list, so that * concurrent search will find it before old elem @@ -2392,7 +2441,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -2403,7 +2451,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); free_htab_elem(htab, l); ret = 0; } -- cgit From caac76a5170eb508529bbff9d9300e9c57126444 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:52 -0700 Subject: bpf: sockhash, add release routine Add map_release_uref pointer to hashmap ops. This was dropped when original sockhash code was ported into bpf-next before initial commit. Fixes: 81110384441a ("bpf: sockmap, add hash map support") Acked-by: Martin KaFai Lau Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index ac09b35a9567..cf7b6a6dbd1f 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2496,6 +2496,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, -- cgit From 0cc3cd21657be04cb0559fe8063f2130493f92cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Jun 2018 16:05:48 +0200 Subject: cpu/hotplug: Boot HT siblings at least once Due to the way Machine Check Exceptions work on X86 hyperthreads it's required to boot up _all_ logical cores at least once in order to set the CR4.MCE bit. So instead of ignoring the sibling threads right away, let them boot up once so they can configure themselves. After they came out of the initial boot stage check whether its a "secondary" sibling and cancel the operation which puts the CPU back into offline state. Reported-by: Dave Hansen Signed-off-by: Thomas Gleixner Tested-by: Tony Luck --- kernel/cpu.c | 72 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d29fdd7e57bb..6f3a3cde8b83 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -60,6 +60,7 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; + bool booted_once; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -342,6 +343,40 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_HOTPLUG_SMT +enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_control = CPU_SMT_DISABLED; + if (str && !strcmp(str, "force")) { + pr_info("SMT: Force disabled\n"); + cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } + return 0; +} +early_param("nosmt", smt_cmdline_disable); + +static inline bool cpu_smt_allowed(unsigned int cpu) +{ + if (cpu_smt_control == CPU_SMT_ENABLED) + return true; + + if (topology_is_primary_thread(cpu)) + return true; + + /* + * On x86 it's required to boot all logical CPUs at least once so + * that the init code can get a chance to set CR4.MCE on each + * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * core will shutdown the machine. + */ + return !per_cpu(cpuhp_state, cpu).booted_once; +} +#else +static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +#endif + static inline enum cpuhp_state cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -422,6 +457,16 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); + /* + * SMT soft disabling on X86 requires to bring the CPU out of the + * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The + * CPU marked itself as booted_once in cpu_notify_starting() so the + * cpu_smt_allowed() check will now return false if this is not the + * primary sibling. + */ + if (!cpu_smt_allowed(cpu)) + return -ECANCELED; + if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; @@ -933,29 +978,6 @@ EXPORT_SYMBOL(cpu_down); #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ -#ifdef CONFIG_HOTPLUG_SMT -enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; - -static int __init smt_cmdline_disable(char *str) -{ - cpu_smt_control = CPU_SMT_DISABLED; - if (str && !strcmp(str, "force")) { - pr_info("SMT: Force disabled\n"); - cpu_smt_control = CPU_SMT_FORCE_DISABLED; - } - return 0; -} -early_param("nosmt", smt_cmdline_disable); - -static inline bool cpu_smt_allowed(unsigned int cpu) -{ - return cpu_smt_control == CPU_SMT_ENABLED || - topology_is_primary_thread(cpu); -} -#else -static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } -#endif - /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started @@ -970,6 +992,7 @@ void notify_cpu_starting(unsigned int cpu) int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ + st->booted_once = true; while (st->state < target) { st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -2180,5 +2203,6 @@ void __init boot_cpu_init(void) */ void __init boot_cpu_state_init(void) { - per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; + this_cpu_write(cpuhp_state.booted_once, true); + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit From f314dfea16a085a58d2ff227ea9fa9e490ee5d18 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 29 Jun 2018 16:37:08 +0200 Subject: modsign: log module name in the event of an error Now that we have the load_info struct all initialized (including info->name, which contains the name of the module) before module_sig_check(), make the load_info struct and hence module name available to mod_verify_sig() so that we can log the module name in the event of an error. Signed-off-by: Jessica Yu --- kernel/module-internal.h | 25 ++++++++++++++++++++++++- kernel/module.c | 22 +--------------------- kernel/module_signing.c | 12 +++++++----- 3 files changed, 32 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 915e123a430f..79c9be2dbbe9 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,4 +9,27 @@ * 2 of the Licence, or (at your option) any later version. */ -extern int mod_verify_sig(const void *mod, unsigned long *_modlen); +#include +#include + +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +extern int mod_verify_sig(const void *mod, struct load_info *info); diff --git a/kernel/module.c b/kernel/module.c index ba45a84e4287..8a45986fd728 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -307,26 +307,6 @@ int unregister_module_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_module_notifier); -struct load_info { - const char *name; - /* pointer to module in temporary copy, freed at end of load_module() */ - struct module *mod; - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs; - struct _ddebug *debug; - unsigned int num_debug; - bool sig_ok; -#ifdef CONFIG_KALLSYMS - unsigned long mod_kallsyms_init_off; -#endif - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - /* * We require a truly strong try_module_get(): 0 means success. * Otherwise an error is returned due to ongoing or failed @@ -2778,7 +2758,7 @@ static int module_sig_check(struct load_info *info, int flags) memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; - err = mod_verify_sig(mod, &info->len); + err = mod_verify_sig(mod, info); } if (!err) { diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 937c844bee4a..f2075ce8e4b3 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -45,10 +45,10 @@ struct module_signature { /* * Verify the signature on a module. */ -int mod_verify_sig(const void *mod, unsigned long *_modlen) +int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; - size_t modlen = *_modlen, sig_len; + size_t sig_len, modlen = info->len; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -62,10 +62,11 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - *_modlen = modlen; + info->len = modlen; if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("Module is not signed with expected PKCS#7 message\n"); + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + info->name); return -ENOPKG; } @@ -76,7 +77,8 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) ms.__pad[0] != 0 || ms.__pad[1] != 0 || ms.__pad[2] != 0) { - pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + info->name); return -EBADMSG; } -- cgit From d5641c64c48f4408500a348301bff01fbf2c1ec5 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 25 Jun 2018 13:30:58 +0800 Subject: PM / hibernate: cast PAGE_SIZE to int when comparing with error code If PAGE_SIZE is unsigned type then negative error code will be larger than PAGE_SIZE. Signed-off-by: Chengguang Xu Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index c2bcf97d24c8..d7f6c1a288d3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -923,7 +923,7 @@ int swsusp_write(unsigned int flags) } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); - if (error < PAGE_SIZE) { + if (error < (int)PAGE_SIZE) { if (error >= 0) error = -EFAULT; @@ -1483,7 +1483,7 @@ int swsusp_read(unsigned int *flags_p) memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot); - if (error < PAGE_SIZE) + if (error < (int)PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); error = get_swap_reader(&handle, flags_p); -- cgit From ea272257ccba1e57b9448444e1e3de39794ad6b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 26 Jun 2018 06:49:11 -0300 Subject: docs: histogram.txt: convert it to ReST file format Despite being mentioned at Documentation/trace/ftrace.rst as a rst file, this file was still a text one, with several issues. Convert it to ReST and add it to the trace index: - Mark the document title as such; - Identify and indent the literal blocks; - Use the proper markups for table. Signed-off-by: Mauro Carvalho Chehab Acked-by: Steven Rostedt (VMware) Signed-off-by: Jonathan Corbet --- Documentation/trace/events.rst | 2 +- Documentation/trace/histogram.rst | 2545 +++++++++++++++++++++++++++++++++++++ Documentation/trace/histogram.txt | 2539 ------------------------------------ Documentation/trace/index.rst | 1 + kernel/trace/Kconfig | 2 +- 5 files changed, 2548 insertions(+), 2541 deletions(-) create mode 100644 Documentation/trace/histogram.rst delete mode 100644 Documentation/trace/histogram.txt (limited to 'kernel') diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 696dc69b8158..f7e1fcc0953c 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -524,4 +524,4 @@ The following commands are supported: totals derived from one or more trace event format fields and/or event counts (hitcount). - See Documentation/trace/histogram.txt for details and examples. + See Documentation/trace/histogram.rst for details and examples. diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst new file mode 100644 index 000000000000..5ac724baea7d --- /dev/null +++ b/Documentation/trace/histogram.rst @@ -0,0 +1,2545 @@ +================ +Event Histograms +================ + +Documentation written by Tom Zanussi + +1. Introduction +=============== + + Histogram triggers are special event triggers that can be used to + aggregate trace event data into histograms. For information on + trace events and event triggers, see Documentation/trace/events.rst. + + +2. Histogram Trigger Command +============================ + + A histogram trigger command is an event trigger command that + aggregates event hits into a hash table keyed on one or more trace + event format fields (or stacktrace) and a set of running totals + derived from one or more trace event format fields and/or event + counts (hitcount). + + The format of a hist trigger is as follows:: + + hist:keys=[:values=] + [:sort=][:size=#entries][:pause][:continue] + [:clear][:name=histname1] [if ] + + When a matching event is hit, an entry is added to a hash table + using the key(s) and value(s) named. Keys and values correspond to + fields in the event's format description. Values must correspond to + numeric fields - on an event hit, the value(s) will be added to a + sum kept for that field. The special string 'hitcount' can be used + in place of an explicit value field - this is simply a count of + event hits. If 'values' isn't specified, an implicit 'hitcount' + value will be automatically created and used as the only value. + Keys can be any field, or the special string 'stacktrace', which + will use the event's kernel stacktrace as the key. The keywords + 'keys' or 'key' can be used to specify keys, and the keywords + 'values', 'vals', or 'val' can be used to specify values. Compound + keys consisting of up to two fields can be specified by the 'keys' + keyword. Hashing a compound key produces a unique entry in the + table for each unique combination of component keys, and can be + useful for providing more fine-grained summaries of event data. + Additionally, sort keys consisting of up to two fields can be + specified by the 'sort' keyword. If more than one field is + specified, the result will be a 'sort within a sort': the first key + is taken to be the primary sort key and the second the secondary + key. If a hist trigger is given a name using the 'name' parameter, + its histogram data will be shared with other triggers of the same + name, and trigger hits will update this common data. Only triggers + with 'compatible' fields can be combined in this way; triggers are + 'compatible' if the fields named in the trigger share the same + number and type of fields and those fields also have the same names. + Note that any two events always share the compatible 'hitcount' and + 'stacktrace' fields and can therefore be combined using those + fields, however pointless that may be. + + 'hist' triggers add a 'hist' file to each event's subdirectory. + Reading the 'hist' file for the event will dump the hash table in + its entirety to stdout. If there are multiple hist triggers + attached to an event, there will be a table for each trigger in the + output. The table displayed for a named trigger will be the same as + any other instance having the same name. Each printed hash table + entry is a simple list of the keys and values comprising the entry; + keys are printed first and are delineated by curly braces, and are + followed by the set of value fields for the entry. By default, + numeric fields are displayed as base-10 integers. This can be + modified by appending any of the following modifiers to the field + name: + + =========== ========================================== + .hex display a number as a hex value + .sym display an address as a symbol + .sym-offset display an address as a symbol and offset + .syscall display a syscall id as a system call name + .execname display a common_pid as a program name + .log2 display log2 value rather than raw number + .usecs display a common_timestamp in microseconds + =========== ========================================== + + Note that in general the semantics of a given field aren't + interpreted when applying a modifier to it, but there are some + restrictions to be aware of in this regard: + + - only the 'hex' modifier can be used for values (because values + are essentially sums, and the other modifiers don't make sense + in that context). + - the 'execname' modifier can only be used on a 'common_pid'. The + reason for this is that the execname is simply the 'comm' value + saved for the 'current' process when an event was triggered, + which is the same as the common_pid value saved by the event + tracing code. Trying to apply that comm value to other pid + values wouldn't be correct, and typically events that care save + pid-specific comm fields in the event itself. + + A typical usage scenario would be the following to enable a hist + trigger, read its current contents, and then turn it off:: + + # echo 'hist:keys=skbaddr.hex:vals=len' > \ + /sys/kernel/debug/tracing/events/net/netif_rx/trigger + + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist + + # echo '!hist:keys=skbaddr.hex:vals=len' > \ + /sys/kernel/debug/tracing/events/net/netif_rx/trigger + + The trigger file itself can be read to show the details of the + currently attached hist trigger. This information is also displayed + at the top of the 'hist' file when read. + + By default, the size of the hash table is 2048 entries. The 'size' + parameter can be used to specify more or fewer than that. The units + are in terms of hashtable entries - if a run uses more entries than + specified, the results will show the number of 'drops', the number + of hits that were ignored. The size should be a power of 2 between + 128 and 131072 (any non- power-of-2 number specified will be rounded + up). + + The 'sort' parameter can be used to specify a value field to sort + on. The default if unspecified is 'hitcount' and the default sort + order is 'ascending'. To sort in the opposite direction, append + .descending' to the sort key. + + The 'pause' parameter can be used to pause an existing hist trigger + or to start a hist trigger but not log any events until told to do + so. 'continue' or 'cont' can be used to start or restart a paused + hist trigger. + + The 'clear' parameter will clear the contents of a running hist + trigger and leave its current paused/active state. + + Note that the 'pause', 'cont', and 'clear' parameters should be + applied using 'append' shell operator ('>>') if applied to an + existing trigger, rather than via the '>' operator, which will cause + the trigger to be removed through truncation. + +- enable_hist/disable_hist + + The enable_hist and disable_hist triggers can be used to have one + event conditionally start and stop another event's already-attached + hist trigger. Any number of enable_hist and disable_hist triggers + can be attached to a given event, allowing that event to kick off + and stop aggregations on a host of other events. + + The format is very similar to the enable/disable_event triggers:: + + enable_hist::[:count] + disable_hist::[:count] + + Instead of enabling or disabling the tracing of the target event + into the trace buffer as the enable/disable_event triggers do, the + enable/disable_hist triggers enable or disable the aggregation of + the target event into a hash table. + + A typical usage scenario for the enable_hist/disable_hist triggers + would be to first set up a paused hist trigger on some event, + followed by an enable_hist/disable_hist pair that turns the hist + aggregation on and off when conditions of interest are hit:: + + # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger + + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger + + The above sets up an initially paused hist trigger which is unpaused + and starts aggregating events when a given program is executed, and + which stops aggregating when the process exits and the hist trigger + is paused again. + + The examples below provide a more concrete illustration of the + concepts and typical usage patterns discussed above. + +'special' event fields +------------------------ + + There are a number of 'special event fields' available for use as + keys or values in a hist trigger. These look like and behave as if + they were actual event fields, but aren't really part of the event's + field definition or format file. They are however available for any + event, and can be used anywhere an actual event field could be. + They are: + + ====================== ==== ======================================= + common_timestamp u64 timestamp (from ring buffer) associated + with the event, in nanoseconds. May be + modified by .usecs to have timestamps + interpreted as microseconds. + cpu int the cpu on which the event occurred. + ====================== ==== ======================================= + +Extended error information +-------------------------- + + For some error conditions encountered when invoking a hist trigger + command, extended error information is available via the + corresponding event's 'hist' file. Reading the hist file after an + error will display more detailed information about what went wrong, + if information is available. This extended error information will + be available until the next hist trigger command for that event. + + If available for a given error condition, the extended error + information and usage takes the following form:: + + # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger + echo: write error: Invalid argument + + # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist + ERROR: Couldn't yyy: zzz + Last command: xxx + +6.2 'hist' trigger examples +--------------------------- + + The first set of examples creates aggregations using the kmalloc + event. The fields that can be used for the hist trigger are listed + in the kmalloc event's format file:: + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format + name: kmalloc + ID: 374 + format: + field:unsigned short common_type; offset:0; size:2; signed:0; + field:unsigned char common_flags; offset:2; size:1; signed:0; + field:unsigned char common_preempt_count; offset:3; size:1; signed:0; + field:int common_pid; offset:4; size:4; signed:1; + + field:unsigned long call_site; offset:8; size:8; signed:0; + field:const void * ptr; offset:16; size:8; signed:0; + field:size_t bytes_req; offset:24; size:8; signed:0; + field:size_t bytes_alloc; offset:32; size:8; signed:0; + field:gfp_t gfp_flags; offset:40; size:4; signed:0; + + We'll start by creating a hist trigger that generates a simple table + that lists the total number of bytes requested for each function in + the kernel that made one or more calls to kmalloc:: + + # echo 'hist:key=call_site:val=bytes_req' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + This tells the tracing system to create a 'hist' trigger using the + call_site field of the kmalloc event as the key for the table, which + just means that each unique call_site address will have an entry + created for it in the table. The 'val=bytes_req' parameter tells + the hist trigger that for each unique entry (call_site) in the + table, it should keep a running total of the number of bytes + requested by that call_site. + + We'll let it run for awhile and then dump the contents of the 'hist' + file in the kmalloc event's subdirectory (for readability, a number + of entries have been omitted):: + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active] + + { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176 + { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024 + { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384 + { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24 + { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8 + { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152 + { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144 + { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144 + { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560 + { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736 + . + . + . + { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576 + { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336 + { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504 + { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584 + { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448 + { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720 + { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088 + { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920 + { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716 + { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712 + { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160 + { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520 + + Totals: + Hits: 4610 + Entries: 45 + Dropped: 0 + + The output displays a line for each entry, beginning with the key + specified in the trigger, followed by the value(s) also specified in + the trigger. At the beginning of the output is a line that displays + the trigger info, which can also be displayed by reading the + 'trigger' file:: + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active] + + At the end of the output are a few lines that display the overall + totals for the run. The 'Hits' field shows the total number of + times the event trigger was hit, the 'Entries' field shows the total + number of used entries in the hash table, and the 'Dropped' field + shows the number of hits that were dropped because the number of + used entries for the run exceeded the maximum number of entries + allowed for the table (normally 0, but if not a hint that you may + want to increase the size of the table using the 'size' parameter). + + Notice in the above output that there's an extra field, 'hitcount', + which wasn't specified in the trigger. Also notice that in the + trigger info output, there's a parameter, 'sort=hitcount', which + wasn't specified in the trigger either. The reason for that is that + every trigger implicitly keeps a count of the total number of hits + attributed to a given entry, called the 'hitcount'. That hitcount + information is explicitly displayed in the output, and in the + absence of a user-specified sort parameter, is used as the default + sort field. + + The value 'hitcount' can be used in place of an explicit value in + the 'values' parameter if you don't really need to have any + particular field summed and are mainly interested in hit + frequencies. + + To turn the hist trigger off, simply call up the trigger in the + command history and re-execute it with a '!' prepended:: + + # echo '!hist:key=call_site:val=bytes_req' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + Finally, notice that the call_site as displayed in the output above + isn't really very useful. It's an address, but normally addresses + are displayed in hex. To have a numeric field displayed as a hex + value, simply append '.hex' to the field name in the trigger:: + + # echo 'hist:key=call_site.hex:val=bytes_req' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active] + + { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433 + { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176 + { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384 + { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8 + { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511 + { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12 + { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152 + { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24 + { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144 + { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648 + { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144 + { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544 + . + . + . + { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024 + { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680 + { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112 + { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232 + { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360 + { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640 + { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600 + { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584 + { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656 + { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456 + { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600 + + Totals: + Hits: 4775 + Entries: 46 + Dropped: 0 + + Even that's only marginally more useful - while hex values do look + more like addresses, what users are typically more interested in + when looking at text addresses are the corresponding symbols + instead. To have an address displayed as symbolic value instead, + simply append '.sym' or '.sym-offset' to the field name in the + trigger:: + + # echo 'hist:key=call_site.sym:val=bytes_req' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active] + + { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 + { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40 + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 + { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528 + { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624 + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96 + { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464 + { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 + { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424 + . + . + . + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672 + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208 + { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248 + { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584 + { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265 + + Totals: + Hits: 109928 + Entries: 71 + Dropped: 0 + + Because the default sort key above is 'hitcount', the above shows a + the list of call_sites by increasing hitcount, so that at the bottom + we see the functions that made the most kmalloc calls during the + run. If instead we we wanted to see the top kmalloc callers in + terms of the number of bytes requested rather than the number of + calls, and we wanted the top caller to appear at the top, we can use + the 'sort' parameter, along with the 'descending' modifier:: + + # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active] + + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135 + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784 + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992 + { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664 + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632 + . + . + . + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 + { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48 + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48 + { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48 + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40 + { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 + + Totals: + Hits: 32133 + Entries: 81 + Dropped: 0 + + To display the offset and size information in addition to the symbol + name, just use 'sym-offset' instead:: + + # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active] + + { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720 + { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936 + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832 + { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040 + { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696 + { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640 + { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456 + . + . + . + { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128 + { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96 + { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96 + { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84 + { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8 + { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7 + { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7 + + Totals: + Hits: 26098 + Entries: 64 + Dropped: 0 + + We can also add multiple fields to the 'values' parameter. For + example, we might want to see the total number of bytes allocated + alongside bytes requested, and display the result sorted by bytes + allocated in a descending order:: + + # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active] + + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016 + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744 + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304 + { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640 + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432 + . + . + . + { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192 + { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 + { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96 + { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8 + + Totals: + Hits: 66598 + Entries: 65 + Dropped: 0 + + Finally, to finish off our kmalloc example, instead of simply having + the hist trigger display symbolic call_sites, we can have the hist + trigger additionally display the complete set of kernel stack traces + that led to each call_site. To do that, we simply use the special + value 'stacktrace' for the key parameter:: + + # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger + + The above trigger will use the kernel stack trace in effect when an + event is triggered as the key for the hash table. This allows the + enumeration of every kernel callpath that led up to a particular + event, along with a running total of any of the event fields for + that event. Here we tally bytes requested and bytes allocated for + every callpath in the system that led up to a kmalloc (in this case + every callpath to a kmalloc for a kernel compile):: + + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist + # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] + + { stacktrace: + __kmalloc_track_caller+0x10b/0x1a0 + kmemdup+0x20/0x50 + hidraw_report_event+0x8a/0x120 [hid] + hid_report_raw_event+0x3ea/0x440 [hid] + hid_input_report+0x112/0x190 [hid] + hid_irq_in+0xc2/0x260 [usbhid] + __usb_hcd_giveback_urb+0x72/0x120 + usb_giveback_urb_bh+0x9e/0xe0 + tasklet_hi_action+0xf8/0x100 + __do_softirq+0x114/0x2c0 + irq_exit+0xa5/0xb0 + do_IRQ+0x5a/0xf0 + ret_from_intr+0x0/0x30 + cpuidle_enter+0x17/0x20 + cpu_startup_entry+0x315/0x3e0 + rest_init+0x7c/0x80 + } hitcount: 3 bytes_req: 21 bytes_alloc: 24 + { stacktrace: + __kmalloc_track_caller+0x10b/0x1a0 + kmemdup+0x20/0x50 + hidraw_report_event+0x8a/0x120 [hid] + hid_report_raw_event+0x3ea/0x440 [hid] + hid_input_report+0x112/0x190 [hid] + hid_irq_in+0xc2/0x260 [usbhid] + __usb_hcd_giveback_urb+0x72/0x120 + usb_giveback_urb_bh+0x9e/0xe0 + tasklet_hi_action+0xf8/0x100 + __do_softirq+0x114/0x2c0 + irq_exit+0xa5/0xb0 + do_IRQ+0x5a/0xf0 + ret_from_intr+0x0/0x30 + } hitcount: 3 bytes_req: 21 bytes_alloc: 24 + { stacktrace: + kmem_cache_alloc_trace+0xeb/0x150 + aa_alloc_task_context+0x27/0x40 + apparmor_cred_prepare+0x1f/0x50 + security_prepare_creds+0x16/0x20 + prepare_creds+0xdf/0x1a0 + SyS_capset+0xb5/0x200 + system_call_fastpath+0x12/0x6a + } hitcount: 1 bytes_req: 32 bytes_alloc: 32 + . + . + . + { stacktrace: + __kmalloc+0x11b/0x1b0 + i915_gem_execbuffer2+0x6c/0x2c0 [i915] + drm_ioctl+0x349/0x670 [drm] + do_vfs_ioctl+0x2f0/0x4f0 + SyS_ioctl+0x81/0xa0 + system_call_fastpath+0x12/0x6a + } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808 + { stacktrace: + __kmalloc+0x11b/0x1b0 + load_elf_phdrs+0x76/0xa0 + load_elf_binary+0x102/0x1650 + search_binary_handler+0x97/0x1d0 + do_execveat_common.isra.34+0x551/0x6e0 + SyS_execve+0x3a/0x50 + return_from_execve+0x0/0x23 + } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048 + { stacktrace: + kmem_cache_alloc_trace+0xeb/0x150 + apparmor_file_alloc_security+0x27/0x40 + security_file_alloc+0x16/0x20 + get_empty_filp+0x93/0x1c0 + path_openat+0x31/0x5f0 + do_filp_open+0x3a/0x90 + do_sys_open+0x128/0x220 + SyS_open+0x1e/0x20 + system_call_fastpath+0x12/0x6a + } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376 + { stacktrace: + __kmalloc+0x11b/0x1b0 + seq_buf_alloc+0x1b/0x50 + seq_read+0x2cc/0x370 + proc_reg_read+0x3d/0x80 + __vfs_read+0x28/0xe0 + vfs_read+0x86/0x140 + SyS_read+0x46/0xb0 + system_call_fastpath+0x12/0x6a + } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768 + + Totals: + Hits: 6085872 + Entries: 253 + Dropped: 0 + + If you key a hist trigger on common_pid, in order for example to + gather and display sorted totals for each process, you can use the + special .execname modifier to display the executable names for the + processes in the table rather than raw pids. The example below + keeps a per-process sum of total bytes read:: + + # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \ + /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger + + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist + # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active] + + { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512 + { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640 + { common_pid: compiz [ 2889] } hitcount: 59 count: 254400 + { common_pid: bash [ 8710] } hitcount: 3 count: 66369 + { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739 + { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648 + { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216 + { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396 + { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264 + { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424 + { common_pid: gmain [ 1315] } hitcount: 18 count: 6336 + . + . + . + { common_pid: postgres [ 1892] } hitcount: 2 count: 32 + { common_pid: postgres [ 1891] } hitcount: 2 count: 32 + { common_pid: gmain [ 8704] } hitcount: 2 count: 32 + { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21 + { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16 + { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16 + { common_pid: gdbus [ 2998] } hitcount: 1 count: 16 + { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8 + { common_pid: init [ 1] } hitcount: 2 count: 2 + + Totals: + Hits: 2116 + Entries: 51 + Dropped: 0 + + Similarly, if you key a hist trigger on syscall id, for example to + gather and display a list of systemwide syscall hits, you can use + the special .syscall modifier to display the syscall names rather + than raw ids. The example below keeps a running total of syscall + counts for the system during the run:: + + # echo 'hist:key=id.syscall:val=hitcount' > \ + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger + + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist + # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active] + + { id: sys_fsync [ 74] } hitcount: 1 + { id: sys_newuname [ 63] } hitcount: 1 + { id: sys_prctl [157] } hitcount: 1 + { id: sys_statfs [137] } hitcount: 1 + { id: sys_symlink [ 88] } hitcount: 1 + { id: sys_sendmmsg [307] } hitcount: 1 + { id: sys_semctl [ 66] } hitcount: 1 + { id: sys_readlink [ 89] } hitcount: 3 + { id: sys_bind [ 49] } hitcount: 3 + { id: sys_getsockname [ 51] } hitcount: 3 + { id: sys_unlink [ 87] } hitcount: 3 + { id: sys_rename [ 82] } hitcount: 4 + { id: unknown_syscall [ 58] } hitcount: 4 + { id: sys_connect [ 42] } hitcount: 4 + { id: sys_getpid [ 39] } hitcount: 4 + . + . + . + { id: sys_rt_sigprocmask [ 14] } hitcount: 952 + { id: sys_futex [202] } hitcount: 1534 + { id: sys_write [ 1] } hitcount: 2689 + { id: sys_setitimer [ 38] } hitcount: 2797 + { id: sys_read [ 0] } hitcount: 3202 + { id: sys_select [ 23] } hitcount: 3773 + { id: sys_writev [ 20] } hitcount: 4531 + { id: sys_poll [ 7] } hitcount: 8314 + { id: sys_recvmsg [ 47] } hitcount: 13738 + { id: sys_ioctl [ 16] } hitcount: 21843 + + Totals: + Hits: 67612 + Entries: 72 + Dropped: 0 + + The syscall counts above provide a rough overall picture of system + call activity on the system; we can see for example that the most + popular system call on this system was the 'sys_ioctl' system call. + + We can use 'compound' keys to refine that number and provide some + further insight as to which processes exactly contribute to the + overall ioctl count. + + The command below keeps a hitcount for every unique combination of + system call id and pid - the end result is essentially a table + that keeps a per-pid sum of system call hits. The results are + sorted using the system call id as the primary key, and the + hitcount sum as the secondary key:: + + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \ + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger + + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active] + + { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1 + { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1 + { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1 + { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1 + { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2 + { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2 + { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2 + { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2 + { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2 + { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2 + . + . + . + { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12 + { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16 + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808 + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580 + . + . + . + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3 + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6 + { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2 + { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4 + { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6 + + Totals: + Hits: 31536 + Entries: 323 + Dropped: 0 + + The above list does give us a breakdown of the ioctl syscall by + pid, but it also gives us quite a bit more than that, which we + don't really care about at the moment. Since we know the syscall + id for sys_ioctl (16, displayed next to the sys_ioctl name), we + can use that to filter out all the other syscalls:: + + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \ + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger + + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active] + + { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1 + { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1 + . + . + . + { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45 + { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48 + { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48 + { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66 + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674 + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443 + + Totals: + Hits: 101162 + Entries: 103 + Dropped: 0 + + The above output shows that 'compiz' and 'Xorg' are far and away + the heaviest ioctl callers (which might lead to questions about + whether they really need to be making all those calls and to + possible avenues for further investigation.) + + The compound key examples used a key and a sum value (hitcount) to + sort the output, but we can just as easily use two keys instead. + Here's an example where we use a compound key composed of the the + common_pid and size event fields. Sorting with pid as the primary + key and 'size' as the secondary key allows us to display an + ordered summary of the recvfrom sizes, with counts, received by + each process:: + + # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \ + /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger + + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist + # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active] + + { common_pid: smbd [ 784], size: 4 } hitcount: 1 + { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672 + { common_pid: postgres [ 1796], size: 1000 } hitcount: 6 + { common_pid: postgres [ 1867], size: 1000 } hitcount: 10 + { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2 + { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1 + { common_pid: compiz [ 2994], size: 8 } hitcount: 1 + { common_pid: compiz [ 2994], size: 20 } hitcount: 11 + { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2 + { common_pid: firefox [ 8817], size: 4 } hitcount: 1 + { common_pid: firefox [ 8817], size: 8 } hitcount: 5 + { common_pid: firefox [ 8817], size: 588 } hitcount: 2 + { common_pid: firefox [ 8817], size: 628 } hitcount: 1 + { common_pid: firefox [ 8817], size: 6944 } hitcount: 1 + { common_pid: firefox [ 8817], size: 408880 } hitcount: 2 + { common_pid: firefox [ 8822], size: 8 } hitcount: 2 + { common_pid: firefox [ 8822], size: 160 } hitcount: 2 + { common_pid: firefox [ 8822], size: 320 } hitcount: 2 + { common_pid: firefox [ 8822], size: 352 } hitcount: 1 + . + . + . + { common_pid: pool [ 8923], size: 1960 } hitcount: 10 + { common_pid: pool [ 8923], size: 2048 } hitcount: 10 + { common_pid: pool [ 8924], size: 1960 } hitcount: 10 + { common_pid: pool [ 8924], size: 2048 } hitcount: 10 + { common_pid: pool [ 8928], size: 1964 } hitcount: 4 + { common_pid: pool [ 8928], size: 1965 } hitcount: 2 + { common_pid: pool [ 8928], size: 2048 } hitcount: 6 + { common_pid: pool [ 8929], size: 1982 } hitcount: 1 + { common_pid: pool [ 8929], size: 2048 } hitcount: 1 + + Totals: + Hits: 2016 + Entries: 224 + Dropped: 0 + + The above example also illustrates the fact that although a compound + key is treated as a single entity for hashing purposes, the sub-keys + it's composed of can be accessed independently. + + The next example uses a string field as the hash key and + demonstrates how you can manually pause and continue a hist trigger. + In this example, we'll aggregate fork counts and don't expect a + large number of entries in the hash table, so we'll drop it to a + much smaller number, say 256:: + + # echo 'hist:key=child_comm:val=hitcount:size=256' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active] + + { child_comm: dconf worker } hitcount: 1 + { child_comm: ibus-daemon } hitcount: 1 + { child_comm: whoopsie } hitcount: 1 + { child_comm: smbd } hitcount: 1 + { child_comm: gdbus } hitcount: 1 + { child_comm: kthreadd } hitcount: 1 + { child_comm: dconf worker } hitcount: 1 + { child_comm: evolution-alarm } hitcount: 2 + { child_comm: Socket Thread } hitcount: 2 + { child_comm: postgres } hitcount: 2 + { child_comm: bash } hitcount: 3 + { child_comm: compiz } hitcount: 3 + { child_comm: evolution-sourc } hitcount: 4 + { child_comm: dhclient } hitcount: 4 + { child_comm: pool } hitcount: 5 + { child_comm: nm-dispatcher.a } hitcount: 8 + { child_comm: firefox } hitcount: 8 + { child_comm: dbus-daemon } hitcount: 8 + { child_comm: glib-pacrunner } hitcount: 10 + { child_comm: evolution } hitcount: 23 + + Totals: + Hits: 89 + Entries: 20 + Dropped: 0 + + If we want to pause the hist trigger, we can simply append :pause to + the command that started the trigger. Notice that the trigger info + displays as [paused]:: + + # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \ + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused] + + { child_comm: dconf worker } hitcount: 1 + { child_comm: kthreadd } hitcount: 1 + { child_comm: dconf worker } hitcount: 1 + { child_comm: gdbus } hitcount: 1 + { child_comm: ibus-daemon } hitcount: 1 + { child_comm: Socket Thread } hitcount: 2 + { child_comm: evolution-alarm } hitcount: 2 + { child_comm: smbd } hitcount: 2 + { child_comm: bash } hitcount: 3 + { child_comm: whoopsie } hitcount: 3 + { child_comm: compiz } hitcount: 3 + { child_comm: evolution-sourc } hitcount: 4 + { child_comm: pool } hitcount: 5 + { child_comm: postgres } hitcount: 6 + { child_comm: firefox } hitcount: 8 + { child_comm: dhclient } hitcount: 10 + { child_comm: emacs } hitcount: 12 + { child_comm: dbus-daemon } hitcount: 20 + { child_comm: nm-dispatcher.a } hitcount: 20 + { child_comm: evolution } hitcount: 35 + { child_comm: glib-pacrunner } hitcount: 59 + + Totals: + Hits: 199 + Entries: 21 + Dropped: 0 + + To manually continue having the trigger aggregate events, append + :cont instead. Notice that the trigger info displays as [active] + again, and the data has changed:: + + # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \ + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active] + + { child_comm: dconf worker } hitcount: 1 + { child_comm: dconf worker } hitcount: 1 + { child_comm: kthreadd } hitcount: 1 + { child_comm: gdbus } hitcount: 1 + { child_comm: ibus-daemon } hitcount: 1 + { child_comm: Socket Thread } hitcount: 2 + { child_comm: evolution-alarm } hitcount: 2 + { child_comm: smbd } hitcount: 2 + { child_comm: whoopsie } hitcount: 3 + { child_comm: compiz } hitcount: 3 + { child_comm: evolution-sourc } hitcount: 4 + { child_comm: bash } hitcount: 5 + { child_comm: pool } hitcount: 5 + { child_comm: postgres } hitcount: 6 + { child_comm: firefox } hitcount: 8 + { child_comm: dhclient } hitcount: 11 + { child_comm: emacs } hitcount: 12 + { child_comm: dbus-daemon } hitcount: 22 + { child_comm: nm-dispatcher.a } hitcount: 22 + { child_comm: evolution } hitcount: 35 + { child_comm: glib-pacrunner } hitcount: 59 + + Totals: + Hits: 206 + Entries: 21 + Dropped: 0 + + The previous example showed how to start and stop a hist trigger by + appending 'pause' and 'continue' to the hist trigger command. A + hist trigger can also be started in a paused state by initially + starting the trigger with ':pause' appended. This allows you to + start the trigger only when you're ready to start collecting data + and not before. For example, you could start the trigger in a + paused state, then unpause it and do something you want to measure, + then pause the trigger again when done. + + Of course, doing this manually can be difficult and error-prone, but + it is possible to automatically start and stop a hist trigger based + on some condition, via the enable_hist and disable_hist triggers. + + For example, suppose we wanted to take a look at the relative + weights in terms of skb length for each callpath that leads to a + netif_receieve_skb event when downloading a decent-sized file using + wget. + + First we set up an initially paused stacktrace trigger on the + netif_receive_skb event:: + + # echo 'hist:key=stacktrace:vals=len:pause' > \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + + Next, we set up an 'enable_hist' trigger on the sched_process_exec + event, with an 'if filename==/usr/bin/wget' filter. The effect of + this new trigger is that it will 'unpause' the hist trigger we just + set up on netif_receive_skb if and only if it sees a + sched_process_exec event with a filename of '/usr/bin/wget'. When + that happens, all netif_receive_skb events are aggregated into a + hash table keyed on stacktrace:: + + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger + + The aggregation continues until the netif_receive_skb is paused + again, which is what the following disable_hist event does by + creating a similar setup on the sched_process_exit event, using the + filter 'comm==wget':: + + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger + + Whenever a process exits and the comm field of the disable_hist + trigger filter matches 'comm==wget', the netif_receive_skb hist + trigger is disabled. + + The overall effect is that netif_receive_skb events are aggregated + into the hash table for only the duration of the wget. Executing a + wget command and then listing the 'hist' file will display the + output generated by the wget command:: + + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz + + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + + { stacktrace: + __netif_receive_skb_core+0x46d/0x990 + __netif_receive_skb+0x18/0x60 + netif_receive_skb_internal+0x23/0x90 + napi_gro_receive+0xc8/0x100 + ieee80211_deliver_skb+0xd6/0x270 [mac80211] + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] + ieee80211_rx+0x31d/0x900 [mac80211] + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] + iwl_rx_dispatch+0x8e/0xf0 [iwldvm] + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] + irq_thread_fn+0x20/0x50 + irq_thread+0x11f/0x150 + kthread+0xd2/0xf0 + ret_from_fork+0x42/0x70 + } hitcount: 85 len: 28884 + { stacktrace: + __netif_receive_skb_core+0x46d/0x990 + __netif_receive_skb+0x18/0x60 + netif_receive_skb_internal+0x23/0x90 + napi_gro_complete+0xa4/0xe0 + dev_gro_receive+0x23a/0x360 + napi_gro_receive+0x30/0x100 + ieee80211_deliver_skb+0xd6/0x270 [mac80211] + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] + ieee80211_rx+0x31d/0x900 [mac80211] + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] + iwl_rx_dispatch+0x8e/0xf0 [iwldvm] + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] + irq_thread_fn+0x20/0x50 + irq_thread+0x11f/0x150 + kthread+0xd2/0xf0 + } hitcount: 98 len: 664329 + { stacktrace: + __netif_receive_skb_core+0x46d/0x990 + __netif_receive_skb+0x18/0x60 + process_backlog+0xa8/0x150 + net_rx_action+0x15d/0x340 + __do_softirq+0x114/0x2c0 + do_softirq_own_stack+0x1c/0x30 + do_softirq+0x65/0x70 + __local_bh_enable_ip+0xb5/0xc0 + ip_finish_output+0x1f4/0x840 + ip_output+0x6b/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x173/0x2a0 + udp_sendmsg+0x2bf/0x9f0 + inet_sendmsg+0x64/0xa0 + sock_sendmsg+0x3d/0x50 + } hitcount: 115 len: 13030 + { stacktrace: + __netif_receive_skb_core+0x46d/0x990 + __netif_receive_skb+0x18/0x60 + netif_receive_skb_internal+0x23/0x90 + napi_gro_complete+0xa4/0xe0 + napi_gro_flush+0x6d/0x90 + iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi] + irq_thread_fn+0x20/0x50 + irq_thread+0x11f/0x150 + kthread+0xd2/0xf0 + ret_from_fork+0x42/0x70 + } hitcount: 934 len: 5512212 + + Totals: + Hits: 1232 + Entries: 4 + Dropped: 0 + + The above shows all the netif_receive_skb callpaths and their total + lengths for the duration of the wget command. + + The 'clear' hist trigger param can be used to clear the hash table. + Suppose we wanted to try another run of the previous example but + this time also wanted to see the complete list of events that went + into the histogram. In order to avoid having to set everything up + again, we can just clear the histogram first:: + + # echo 'hist:key=stacktrace:vals=len:clear' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + + Just to verify that it is in fact cleared, here's what we now see in + the hist file:: + + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + + Totals: + Hits: 0 + Entries: 0 + Dropped: 0 + + Since we want to see the detailed list of every netif_receive_skb + event occurring during the new run, which are in fact the same + events being aggregated into the hash table, we add some additional + 'enable_event' events to the triggering sched_process_exec and + sched_process_exit events as such:: + + # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger + + # echo 'disable_event:net:netif_receive_skb if comm==wget' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger + + If you read the trigger files for the sched_process_exec and + sched_process_exit triggers, you should see two triggers for each: + one enabling/disabling the hist aggregation and the other + enabling/disabling the logging of events:: + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger + enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget + enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger + enable_event:net:netif_receive_skb:unlimited if comm==wget + disable_hist:net:netif_receive_skb:unlimited if comm==wget + + In other words, whenever either of the sched_process_exec or + sched_process_exit events is hit and matches 'wget', it enables or + disables both the histogram and the event log, and what you end up + with is a hash table and set of events just covering the specified + duration. Run the wget command again:: + + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz + + Displaying the 'hist' file should show something similar to what you + saw in the last run, but this time you should also see the + individual events in the trace file:: + + # cat /sys/kernel/debug/tracing/trace + + # tracer: nop + # + # entries-in-buffer/entries-written: 183/1426 #P:4 + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / delay + # TASK-PID CPU# |||| TIMESTAMP FUNCTION + # | | | |||| | | + wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60 + wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60 + dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130 + dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138 + ##### CPU 2 buffer started #### + irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948 + irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500 + irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948 + irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948 + irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500 + . + . + . + + The following example demonstrates how multiple hist triggers can be + attached to a given event. This capability can be useful for + creating a set of different summaries derived from the same set of + events, or for comparing the effects of different filters, among + other things:: + + # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=skbaddr.hex:vals=len' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=len:vals=common_preempt_count' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + + The above set of commands create four triggers differing only in + their filters, along with a completely different though fairly + nonsensical trigger. Note that in order to append multiple hist + triggers to the same file, you should use the '>>' operator to + append them ('>' will also add the new hist trigger, but will remove + any existing hist triggers beforehand). + + Displaying the contents of the 'hist' file for the event shows the + contents of all five histograms:: + + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist + + # event histogram + # + # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active] + # + + { len: 176 } hitcount: 1 common_preempt_count: 0 + { len: 223 } hitcount: 1 common_preempt_count: 0 + { len: 4854 } hitcount: 1 common_preempt_count: 0 + { len: 395 } hitcount: 1 common_preempt_count: 0 + { len: 177 } hitcount: 1 common_preempt_count: 0 + { len: 446 } hitcount: 1 common_preempt_count: 0 + { len: 1601 } hitcount: 1 common_preempt_count: 0 + . + . + . + { len: 1280 } hitcount: 66 common_preempt_count: 0 + { len: 116 } hitcount: 81 common_preempt_count: 40 + { len: 708 } hitcount: 112 common_preempt_count: 0 + { len: 46 } hitcount: 221 common_preempt_count: 0 + { len: 1264 } hitcount: 458 common_preempt_count: 0 + + Totals: + Hits: 1428 + Entries: 147 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] + # + + { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130 + { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280 + { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280 + { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115 + { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46 + { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118 + { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60 + { skbaddr: ffff880100065900 } hitcount: 1 len: 46 + { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116 + { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280 + { skbaddr: ffff880100064700 } hitcount: 1 len: 365 + { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60 + . + . + . + { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677 + { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052 + { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589 + { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326 + { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678 + { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678 + { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589 + { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307 + { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032 + + Totals: + Hits: 1451 + Entries: 318 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active] + # + + + Totals: + Hits: 0 + Entries: 0 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active] + # + + { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492 + { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212 + { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854 + { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636 + { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924 + { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356 + { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420 + { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996 + + Totals: + Hits: 14 + Entries: 12 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active] + # + + + Totals: + Hits: 0 + Entries: 0 + Dropped: 0 + + Named triggers can be used to have triggers share a common set of + histogram data. This capability is mostly useful for combining the + output of events generated by tracepoints contained inside inline + functions, but names can be used in a hist trigger on any event. + For example, these two triggers when hit will update the same 'len' + field in the shared 'foo' histogram data:: + + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ + /sys/kernel/debug/tracing/events/net/netif_rx/trigger + + You can see that they're updating common histogram data by reading + each event's hist files at the same time:: + + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist; + cat /sys/kernel/debug/tracing/events/net/netif_rx/hist + + # event histogram + # + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] + # + + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 + + Totals: + Hits: 81 + Entries: 42 + Dropped: 0 + # event histogram + # + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] + # + + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 + + Totals: + Hits: 81 + Entries: 42 + Dropped: 0 + + And here's an example that shows how to combine histogram data from + any two events even if they don't share any 'compatible' fields + other than 'hitcount' and 'stacktrace'. These commands create a + couple of triggers named 'bar' using those fields:: + + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + /sys/kernel/debug/tracing/events/net/netif_rx/trigger + + And displaying the output of either shows some interesting if + somewhat confusing output:: + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist + + # event histogram + # + # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] + # + + { stacktrace: + _do_fork+0x18e/0x330 + kernel_thread+0x29/0x30 + kthreadd+0x154/0x1b0 + ret_from_fork+0x3f/0x70 + } hitcount: 1 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx_ni+0x20/0x70 + dev_loopback_xmit+0xaa/0xd0 + ip_mc_output+0x126/0x240 + ip_local_out_sk+0x31/0x40 + igmp_send_report+0x1e9/0x230 + igmp_timer_expire+0xe9/0x120 + call_timer_fn+0x39/0xf0 + run_timer_softirq+0x1e1/0x290 + __do_softirq+0xfd/0x290 + irq_exit+0x98/0xb0 + smp_apic_timer_interrupt+0x4a/0x60 + apic_timer_interrupt+0x6d/0x80 + cpuidle_enter+0x17/0x20 + call_cpuidle+0x3b/0x60 + cpu_startup_entry+0x22d/0x310 + } hitcount: 1 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx_ni+0x20/0x70 + dev_loopback_xmit+0xaa/0xd0 + ip_mc_output+0x17f/0x240 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x13e/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + SYSC_sendto+0xef/0x170 + SyS_sendto+0xe/0x10 + entry_SYSCALL_64_fastpath+0x12/0x6a + } hitcount: 2 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx+0x1c/0x60 + loopback_xmit+0x6c/0xb0 + dev_hard_start_xmit+0x219/0x3a0 + __dev_queue_xmit+0x415/0x4f0 + dev_queue_xmit_sk+0x13/0x20 + ip_finish_output2+0x237/0x340 + ip_finish_output+0x113/0x1d0 + ip_output+0x66/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x16d/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + ___sys_sendmsg+0x14e/0x270 + } hitcount: 76 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx+0x1c/0x60 + loopback_xmit+0x6c/0xb0 + dev_hard_start_xmit+0x219/0x3a0 + __dev_queue_xmit+0x415/0x4f0 + dev_queue_xmit_sk+0x13/0x20 + ip_finish_output2+0x237/0x340 + ip_finish_output+0x113/0x1d0 + ip_output+0x66/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x16d/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + ___sys_sendmsg+0x269/0x270 + } hitcount: 77 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx+0x1c/0x60 + loopback_xmit+0x6c/0xb0 + dev_hard_start_xmit+0x219/0x3a0 + __dev_queue_xmit+0x415/0x4f0 + dev_queue_xmit_sk+0x13/0x20 + ip_finish_output2+0x237/0x340 + ip_finish_output+0x113/0x1d0 + ip_output+0x66/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x16d/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + SYSC_sendto+0xef/0x170 + } hitcount: 88 + { stacktrace: + _do_fork+0x18e/0x330 + SyS_clone+0x19/0x20 + entry_SYSCALL_64_fastpath+0x12/0x6a + } hitcount: 244 + + Totals: + Hits: 489 + Entries: 7 + Dropped: 0 + +2.2 Inter-event hist triggers +----------------------------- + +Inter-event hist triggers are hist triggers that combine values from +one or more other events and create a histogram using that data. Data +from an inter-event histogram can in turn become the source for +further combined histograms, thus providing a chain of related +histograms, which is important for some applications. + +The most important example of an inter-event quantity that can be used +in this manner is latency, which is simply a difference in timestamps +between two events. Although latency is the most important +inter-event quantity, note that because the support is completely +general across the trace event subsystem, any event field can be used +in an inter-event quantity. + +An example of a histogram that combines data from other histograms +into a useful chain would be a 'wakeupswitch latency' histogram that +combines a 'wakeup latency' histogram and a 'switch latency' +histogram. + +Normally, a hist trigger specification consists of a (possibly +compound) key along with one or more numeric values, which are +continually updated sums associated with that key. A histogram +specification in this case consists of individual key and value +specifications that refer to trace event fields associated with a +single event type. + +The inter-event hist trigger extension allows fields from multiple +events to be referenced and combined into a multi-event histogram +specification. In support of this overall goal, a few enabling +features have been added to the hist trigger support: + + - In order to compute an inter-event quantity, a value from one + event needs to saved and then referenced from another event. This + requires the introduction of support for histogram 'variables'. + + - The computation of inter-event quantities and their combination + require some minimal amount of support for applying simple + expressions to variables (+ and -). + + - A histogram consisting of inter-event quantities isn't logically a + histogram on either event (so having the 'hist' file for either + event host the histogram output doesn't really make sense). To + address the idea that the histogram is associated with a + combination of events, support is added allowing the creation of + 'synthetic' events that are events derived from other events. + These synthetic events are full-fledged events just like any other + and can be used as such, as for instance to create the + 'combination' histograms mentioned previously. + + - A set of 'actions' can be associated with histogram entries - + these can be used to generate the previously mentioned synthetic + events, but can also be used for other purposes, such as for + example saving context when a 'max' latency has been hit. + + - Trace events don't have a 'timestamp' associated with them, but + there is an implicit timestamp saved along with an event in the + underlying ftrace ring buffer. This timestamp is now exposed as a + a synthetic field named 'common_timestamp' which can be used in + histograms as if it were any other event field; it isn't an actual + field in the trace format but rather is a synthesized value that + nonetheless can be used as if it were an actual field. By default + it is in units of nanoseconds; appending '.usecs' to a + common_timestamp field changes the units to microseconds. + +A note on inter-event timestamps: If common_timestamp is used in a +histogram, the trace buffer is automatically switched over to using +absolute timestamps and the "global" trace clock, in order to avoid +bogus timestamp differences with other clocks that aren't coherent +across CPUs. This can be overridden by specifying one of the other +trace clocks instead, using the "clock=XXX" hist trigger attribute, +where XXX is any of the clocks listed in the tracing/trace_clock +pseudo-file. + +These features are described in more detail in the following sections. + +2.2.1 Histogram Variables +------------------------- + +Variables are simply named locations used for saving and retrieving +values between matching events. A 'matching' event is defined as an +event that has a matching key - if a variable is saved for a histogram +entry corresponding to that key, any subsequent event with a matching +key can access that variable. + +A variable's value is normally available to any subsequent event until +it is set to something else by a subsequent event. The one exception +to that rule is that any variable used in an expression is essentially +'read-once' - once it's used by an expression in a subsequent event, +it's reset to its 'unset' state, which means it can't be used again +unless it's set again. This ensures not only that an event doesn't +use an uninitialized variable in a calculation, but that that variable +is used only once and not for any unrelated subsequent match. + +The basic syntax for saving a variable is to simply prefix a unique +variable name not corresponding to any keyword along with an '=' sign +to any event field. + +Either keys or values can be saved and retrieved in this way. This +creates a variable named 'ts0' for a histogram entry with the key +'next_pid':: + + # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... >> \ + event/trigger + +The ts0 variable can be accessed by any subsequent event having the +same pid as 'next_pid'. + +Variable references are formed by prepending the variable name with +the '$' sign. Thus for example, the ts0 variable above would be +referenced as '$ts0' in expressions. + +Because 'vals=' is used, the common_timestamp variable value above +will also be summed as a normal histogram value would (though for a +timestamp it makes little sense). + +The below shows that a key value can also be saved in the same way:: + + # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger + +If a variable isn't a key variable or prefixed with 'vals=', the +associated event field will be saved in a variable but won't be summed +as a value:: + + # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger + +Multiple variables can be assigned at the same time. The below would +result in both ts0 and b being created as variables, with both +common_timestamp and field1 additionally being summed as values:: + + # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \ + event/trigger + +Note that variable assignments can appear either preceding or +following their use. The command below behaves identically to the +command above:: + + # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \ + event/trigger + +Any number of variables not bound to a 'vals=' prefix can also be +assigned by simply separating them with colons. Below is the same +thing but without the values being summed in the histogram:: + + # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger + +Variables set as above can be referenced and used in expressions on +another event. + +For example, here's how a latency can be calculated:: + + # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger + +In the first line above, the event's timetamp is saved into the +variable ts0. In the next line, ts0 is subtracted from the second +event's timestamp to produce the latency, which is then assigned into +yet another variable, 'wakeup_lat'. The hist trigger below in turn +makes use of the wakeup_lat variable to compute a combined latency +using the same key and variable from yet another event:: + + # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger + +2.2.2 Synthetic Events +---------------------- + +Synthetic events are user-defined events generated from hist trigger +variables or fields associated with one or more other events. Their +purpose is to provide a mechanism for displaying data spanning +multiple events consistent with the existing and already familiar +usage for normal events. + +To define a synthetic event, the user writes a simple specification +consisting of the name of the new event along with one or more +variables and their types, which can be any valid field type, +separated by semicolons, to the tracing/synthetic_events file. + +For instance, the following creates a new event named 'wakeup_latency' +with 3 fields: lat, pid, and prio. Each of those fields is simply a +variable reference to a variable on another event:: + + # echo 'wakeup_latency \ + u64 lat; \ + pid_t pid; \ + int prio' >> \ + /sys/kernel/debug/tracing/synthetic_events + +Reading the tracing/synthetic_events file lists all the currently +defined synthetic events, in this case the event defined above:: + + # cat /sys/kernel/debug/tracing/synthetic_events + wakeup_latency u64 lat; pid_t pid; int prio + +An existing synthetic event definition can be removed by prepending +the command that defined it with a '!':: + + # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \ + /sys/kernel/debug/tracing/synthetic_events + +At this point, there isn't yet an actual 'wakeup_latency' event +instantiated in the event subsytem - for this to happen, a 'hist +trigger action' needs to be instantiated and bound to actual fields +and variables defined on other events (see Section 2.2.3 below on +how that is done using hist trigger 'onmatch' action). Once that is +done, the 'wakeup_latency' synthetic event instance is created. + +A histogram can now be defined for the new synthetic event:: + + # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + +The new event is created under the tracing/events/synthetic/ directory +and looks and behaves just like any other event:: + + # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency + enable filter format hist id trigger + +Like any other event, once a histogram is enabled for the event, the +output can be displayed by reading the event's 'hist' file. + +2.2.3 Hist trigger 'actions' +---------------------------- + +A hist trigger 'action' is a function that's executed whenever a +histogram entry is added or updated. + +The default 'action' if no special function is explicity specified is +as it always has been, to simply update the set of values associated +with an entry. Some applications, however, may want to perform +additional actions at that point, such as generate another event, or +compare and save a maximum. + +The following additional actions are available. To specify an action +for a given event, simply specify the action between colons in the +hist trigger specification. + + - onmatch(matching.event).(param list) + + The 'onmatch(matching.event).(params)' hist + trigger action is invoked whenever an event matches and the + histogram entry would be added or updated. It causes the named + synthetic event to be generated with the values given in the + 'param list'. The result is the generation of a synthetic event + that consists of the values contained in those variables at the + time the invoking event was hit. + + The 'param list' consists of one or more parameters which may be + either variables or fields defined on either the 'matching.event' + or the target event. The variables or fields specified in the + param list may be either fully-qualified or unqualified. If a + variable is specified as unqualified, it must be unique between + the two events. A field name used as a param can be unqualified + if it refers to the target event, but must be fully qualified if + it refers to the matching event. A fully-qualified name is of the + form 'system.event_name.$var_name' or 'system.event_name.field'. + + The 'matching.event' specification is simply the fully qualified + event name of the event that matches the target event for the + onmatch() functionality, in the form 'system.event_name'. + + Finally, the number and type of variables/fields in the 'param + list' must match the number and types of the fields in the + synthetic event being generated. + + As an example the below defines a simple synthetic event and uses + a variable defined on the sched_wakeup_new event as a parameter + when invoking the synthetic event. Here we define the synthetic + event:: + + # echo 'wakeup_new_test pid_t pid' >> \ + /sys/kernel/debug/tracing/synthetic_events + + # cat /sys/kernel/debug/tracing/synthetic_events + wakeup_new_test pid_t pid + + The following hist trigger both defines the missing testpid + variable and specifies an onmatch() action that generates a + wakeup_new_test synthetic event whenever a sched_wakeup_new event + occurs, which because of the 'if comm == "cyclictest"' filter only + happens when the executable is cyclictest:: + + # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ + wakeup_new_test($testpid) if comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger + + Creating and displaying a histogram based on those events is now + just a matter of using the fields and new synthetic event in the + tracing/events/synthetic directory, as usual:: + + # echo 'hist:keys=pid:sort=pid' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger + + Running 'cyclictest' should cause wakeup_new events to generate + wakeup_new_test synthetic events which should result in histogram + output in the wakeup_new_test event's hist file:: + + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist + + A more typical usage would be to use two events to calculate a + latency. The following example uses a set of hist triggers to + produce a 'wakeup_latency' histogram. + + First, we define a 'wakeup_latency' synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \ + /sys/kernel/debug/tracing/synthetic_events + + Next, we specify that whenever we see a sched_waking event for a + cyclictest thread, save the timestamp in a 'ts0' variable:: + + # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \ + if comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger + + Then, when the corresponding thread is actually scheduled onto the + CPU by a sched_switch event, calculate the latency and use that + along with another variable and an event field to generate a + wakeup_latency synthetic event:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\ + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\ + $saved_pid,next_prio) if next_comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + + We also need to create a histogram on the wakeup_latency synthetic + event in order to aggregate the generated synthetic event data:: + + # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \ + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger + + Finally, once we've run cyclictest to actually generate some + events, we can see the output by looking at the wakeup_latency + synthetic event's hist file:: + + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist + + - onmax(var).save(field,.. .) + + The 'onmax(var).save(field,...)' hist trigger action is invoked + whenever the value of 'var' associated with a histogram entry + exceeds the current maximum contained in that variable. + + The end result is that the trace event fields specified as the + onmax.save() params will be saved if 'var' exceeds the current + maximum for that hist trigger entry. This allows context from the + event that exhibited the new maximum to be saved for later + reference. When the histogram is displayed, additional fields + displaying the saved values will be printed. + + As an example the below defines a couple of hist triggers, one for + sched_waking and another for sched_switch, keyed on pid. Whenever + a sched_waking occurs, the timestamp is saved in the entry + corresponding to the current pid, and when the scheduler switches + back to that pid, the timestamp difference is calculated. If the + resulting latency, stored in wakeup_lat, exceeds the current + maximum latency, the values specified in the save() fields are + recorded:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ + if comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:\ + wakeup_lat=common_timestamp.usecs-$ts0:\ + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \ + if next_comm=="cyclictest"' >> \ + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + + When the histogram is displayed, the max value and the saved + values corresponding to the max are displayed following the rest + of the fields:: + + # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist + { next_pid: 2255 } hitcount: 239 + common_timestamp-ts0: 0 + max: 27 + next_comm: cyclictest + prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 + + { next_pid: 2256 } hitcount: 2355 + common_timestamp-ts0: 0 + max: 49 next_comm: cyclictest + prev_pid: 0 prev_prio: 120 prev_comm: swapper/0 + + Totals: + Hits: 12970 + Entries: 2 + Dropped: 0 + +3. User space creating a trigger +-------------------------------- + +Writing into /sys/kernel/tracing/trace_marker writes into the ftrace +ring buffer. This can also act like an event, by writing into the trigger +file located in /sys/kernel/tracing/events/ftrace/print/ + +Modifying cyclictest to write into the trace_marker file before it sleeps +and after it wakes up, something like this:: + + static void traceputs(char *str) + { + /* tracemark_fd is the trace_marker file descriptor */ + if (tracemark_fd < 0) + return; + /* write the tracemark message */ + write(tracemark_fd, str, strlen(str)); + } + +And later add something like:: + + traceputs("start"); + clock_nanosleep(...); + traceputs("end"); + +We can make a histogram from this:: + + # cd /sys/kernel/tracing + # echo 'latency u64 lat' > synthetic_events + # echo 'hist:keys=common_pid:ts0=common_timestamp.usecs if buf == "start"' > events/ftrace/print/trigger + # echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$ts0:onmatch(ftrace.print).latency($lat) if buf == "end"' >> events/ftrace/print/trigger + # echo 'hist:keys=lat,common_pid:sort=lat' > events/synthetic/latency/trigger + +The above created a synthetic event called "latency" and two histograms +against the trace_marker, one gets triggered when "start" is written into the +trace_marker file and the other when "end" is written. If the pids match, then +it will call the "latency" synthetic event with the calculated latency as its +parameter. Finally, a histogram is added to the latency synthetic event to +record the calculated latency along with the pid. + +Now running cyclictest with:: + + # ./cyclictest -p80 -d0 -i250 -n -a -t --tracemark -b 1000 + + -p80 : run threads at priority 80 + -d0 : have all threads run at the same interval + -i250 : start the interval at 250 microseconds (all threads will do this) + -n : sleep with nanosleep + -a : affine all threads to a separate CPU + -t : one thread per available CPU + --tracemark : enable trace mark writing + -b 1000 : stop if any latency is greater than 1000 microseconds + +Note, the -b 1000 is used just to make --tracemark available. + +Then we can see the histogram created by this with:: + + # cat events/synthetic/latency/hist + # event histogram + # + # trigger info: hist:keys=lat,common_pid:vals=hitcount:sort=lat:size=2048 [active] + # + + { lat: 107, common_pid: 2039 } hitcount: 1 + { lat: 122, common_pid: 2041 } hitcount: 1 + { lat: 166, common_pid: 2039 } hitcount: 1 + { lat: 174, common_pid: 2039 } hitcount: 1 + { lat: 194, common_pid: 2041 } hitcount: 1 + { lat: 196, common_pid: 2036 } hitcount: 1 + { lat: 197, common_pid: 2038 } hitcount: 1 + { lat: 198, common_pid: 2039 } hitcount: 1 + { lat: 199, common_pid: 2039 } hitcount: 1 + { lat: 200, common_pid: 2041 } hitcount: 1 + { lat: 201, common_pid: 2039 } hitcount: 2 + { lat: 202, common_pid: 2038 } hitcount: 1 + { lat: 202, common_pid: 2043 } hitcount: 1 + { lat: 203, common_pid: 2039 } hitcount: 1 + { lat: 203, common_pid: 2036 } hitcount: 1 + { lat: 203, common_pid: 2041 } hitcount: 1 + { lat: 206, common_pid: 2038 } hitcount: 2 + { lat: 207, common_pid: 2039 } hitcount: 1 + { lat: 207, common_pid: 2036 } hitcount: 1 + { lat: 208, common_pid: 2040 } hitcount: 1 + { lat: 209, common_pid: 2043 } hitcount: 1 + { lat: 210, common_pid: 2039 } hitcount: 1 + { lat: 211, common_pid: 2039 } hitcount: 4 + { lat: 212, common_pid: 2043 } hitcount: 1 + { lat: 212, common_pid: 2039 } hitcount: 2 + { lat: 213, common_pid: 2039 } hitcount: 1 + { lat: 214, common_pid: 2038 } hitcount: 1 + { lat: 214, common_pid: 2039 } hitcount: 2 + { lat: 214, common_pid: 2042 } hitcount: 1 + { lat: 215, common_pid: 2039 } hitcount: 1 + { lat: 217, common_pid: 2036 } hitcount: 1 + { lat: 217, common_pid: 2040 } hitcount: 1 + { lat: 217, common_pid: 2039 } hitcount: 1 + { lat: 218, common_pid: 2039 } hitcount: 6 + { lat: 219, common_pid: 2039 } hitcount: 9 + { lat: 220, common_pid: 2039 } hitcount: 11 + { lat: 221, common_pid: 2039 } hitcount: 5 + { lat: 221, common_pid: 2042 } hitcount: 1 + { lat: 222, common_pid: 2039 } hitcount: 7 + { lat: 223, common_pid: 2036 } hitcount: 1 + { lat: 223, common_pid: 2039 } hitcount: 3 + { lat: 224, common_pid: 2039 } hitcount: 4 + { lat: 224, common_pid: 2037 } hitcount: 1 + { lat: 224, common_pid: 2036 } hitcount: 2 + { lat: 225, common_pid: 2039 } hitcount: 5 + { lat: 225, common_pid: 2042 } hitcount: 1 + { lat: 226, common_pid: 2039 } hitcount: 7 + { lat: 226, common_pid: 2036 } hitcount: 4 + { lat: 227, common_pid: 2039 } hitcount: 6 + { lat: 227, common_pid: 2036 } hitcount: 12 + { lat: 227, common_pid: 2043 } hitcount: 1 + { lat: 228, common_pid: 2039 } hitcount: 7 + { lat: 228, common_pid: 2036 } hitcount: 14 + { lat: 229, common_pid: 2039 } hitcount: 9 + { lat: 229, common_pid: 2036 } hitcount: 8 + { lat: 229, common_pid: 2038 } hitcount: 1 + { lat: 230, common_pid: 2039 } hitcount: 11 + { lat: 230, common_pid: 2036 } hitcount: 6 + { lat: 230, common_pid: 2043 } hitcount: 1 + { lat: 230, common_pid: 2042 } hitcount: 2 + { lat: 231, common_pid: 2041 } hitcount: 1 + { lat: 231, common_pid: 2036 } hitcount: 6 + { lat: 231, common_pid: 2043 } hitcount: 1 + { lat: 231, common_pid: 2039 } hitcount: 8 + { lat: 232, common_pid: 2037 } hitcount: 1 + { lat: 232, common_pid: 2039 } hitcount: 6 + { lat: 232, common_pid: 2040 } hitcount: 2 + { lat: 232, common_pid: 2036 } hitcount: 5 + { lat: 232, common_pid: 2043 } hitcount: 1 + { lat: 233, common_pid: 2036 } hitcount: 5 + { lat: 233, common_pid: 2039 } hitcount: 11 + { lat: 234, common_pid: 2039 } hitcount: 4 + { lat: 234, common_pid: 2038 } hitcount: 2 + { lat: 234, common_pid: 2043 } hitcount: 2 + { lat: 234, common_pid: 2036 } hitcount: 11 + { lat: 234, common_pid: 2040 } hitcount: 1 + { lat: 235, common_pid: 2037 } hitcount: 2 + { lat: 235, common_pid: 2036 } hitcount: 8 + { lat: 235, common_pid: 2043 } hitcount: 2 + { lat: 235, common_pid: 2039 } hitcount: 5 + { lat: 235, common_pid: 2042 } hitcount: 2 + { lat: 235, common_pid: 2040 } hitcount: 4 + { lat: 235, common_pid: 2041 } hitcount: 1 + { lat: 236, common_pid: 2036 } hitcount: 7 + { lat: 236, common_pid: 2037 } hitcount: 1 + { lat: 236, common_pid: 2041 } hitcount: 5 + { lat: 236, common_pid: 2039 } hitcount: 3 + { lat: 236, common_pid: 2043 } hitcount: 9 + { lat: 236, common_pid: 2040 } hitcount: 7 + { lat: 237, common_pid: 2037 } hitcount: 1 + { lat: 237, common_pid: 2040 } hitcount: 1 + { lat: 237, common_pid: 2036 } hitcount: 9 + { lat: 237, common_pid: 2039 } hitcount: 3 + { lat: 237, common_pid: 2043 } hitcount: 8 + { lat: 237, common_pid: 2042 } hitcount: 2 + { lat: 237, common_pid: 2041 } hitcount: 2 + { lat: 238, common_pid: 2043 } hitcount: 10 + { lat: 238, common_pid: 2040 } hitcount: 1 + { lat: 238, common_pid: 2037 } hitcount: 9 + { lat: 238, common_pid: 2038 } hitcount: 1 + { lat: 238, common_pid: 2039 } hitcount: 1 + { lat: 238, common_pid: 2042 } hitcount: 3 + { lat: 238, common_pid: 2036 } hitcount: 7 + { lat: 239, common_pid: 2041 } hitcount: 1 + { lat: 239, common_pid: 2043 } hitcount: 11 + { lat: 239, common_pid: 2037 } hitcount: 11 + { lat: 239, common_pid: 2038 } hitcount: 6 + { lat: 239, common_pid: 2036 } hitcount: 7 + { lat: 239, common_pid: 2040 } hitcount: 1 + { lat: 239, common_pid: 2042 } hitcount: 9 + { lat: 240, common_pid: 2037 } hitcount: 29 + { lat: 240, common_pid: 2043 } hitcount: 15 + { lat: 240, common_pid: 2040 } hitcount: 44 + { lat: 240, common_pid: 2039 } hitcount: 1 + { lat: 240, common_pid: 2041 } hitcount: 2 + { lat: 240, common_pid: 2038 } hitcount: 1 + { lat: 240, common_pid: 2036 } hitcount: 10 + { lat: 240, common_pid: 2042 } hitcount: 13 + { lat: 241, common_pid: 2036 } hitcount: 21 + { lat: 241, common_pid: 2041 } hitcount: 36 + { lat: 241, common_pid: 2037 } hitcount: 34 + { lat: 241, common_pid: 2042 } hitcount: 14 + { lat: 241, common_pid: 2040 } hitcount: 94 + { lat: 241, common_pid: 2039 } hitcount: 12 + { lat: 241, common_pid: 2038 } hitcount: 2 + { lat: 241, common_pid: 2043 } hitcount: 28 + { lat: 242, common_pid: 2040 } hitcount: 109 + { lat: 242, common_pid: 2041 } hitcount: 506 + { lat: 242, common_pid: 2039 } hitcount: 155 + { lat: 242, common_pid: 2042 } hitcount: 21 + { lat: 242, common_pid: 2037 } hitcount: 52 + { lat: 242, common_pid: 2043 } hitcount: 21 + { lat: 242, common_pid: 2036 } hitcount: 16 + { lat: 242, common_pid: 2038 } hitcount: 156 + { lat: 243, common_pid: 2037 } hitcount: 46 + { lat: 243, common_pid: 2039 } hitcount: 40 + { lat: 243, common_pid: 2042 } hitcount: 119 + { lat: 243, common_pid: 2041 } hitcount: 611 + { lat: 243, common_pid: 2036 } hitcount: 69 + { lat: 243, common_pid: 2038 } hitcount: 784 + { lat: 243, common_pid: 2040 } hitcount: 323 + { lat: 243, common_pid: 2043 } hitcount: 14 + { lat: 244, common_pid: 2043 } hitcount: 35 + { lat: 244, common_pid: 2042 } hitcount: 305 + { lat: 244, common_pid: 2039 } hitcount: 8 + { lat: 244, common_pid: 2040 } hitcount: 4515 + { lat: 244, common_pid: 2038 } hitcount: 371 + { lat: 244, common_pid: 2037 } hitcount: 31 + { lat: 244, common_pid: 2036 } hitcount: 114 + { lat: 244, common_pid: 2041 } hitcount: 3396 + { lat: 245, common_pid: 2036 } hitcount: 700 + { lat: 245, common_pid: 2041 } hitcount: 2772 + { lat: 245, common_pid: 2037 } hitcount: 268 + { lat: 245, common_pid: 2039 } hitcount: 472 + { lat: 245, common_pid: 2038 } hitcount: 2758 + { lat: 245, common_pid: 2042 } hitcount: 3833 + { lat: 245, common_pid: 2040 } hitcount: 3105 + { lat: 245, common_pid: 2043 } hitcount: 645 + { lat: 246, common_pid: 2038 } hitcount: 3451 + { lat: 246, common_pid: 2041 } hitcount: 142 + { lat: 246, common_pid: 2037 } hitcount: 5101 + { lat: 246, common_pid: 2040 } hitcount: 68 + { lat: 246, common_pid: 2043 } hitcount: 5099 + { lat: 246, common_pid: 2039 } hitcount: 5608 + { lat: 246, common_pid: 2042 } hitcount: 3723 + { lat: 246, common_pid: 2036 } hitcount: 4738 + { lat: 247, common_pid: 2042 } hitcount: 312 + { lat: 247, common_pid: 2043 } hitcount: 2385 + { lat: 247, common_pid: 2041 } hitcount: 452 + { lat: 247, common_pid: 2038 } hitcount: 792 + { lat: 247, common_pid: 2040 } hitcount: 78 + { lat: 247, common_pid: 2036 } hitcount: 2375 + { lat: 247, common_pid: 2039 } hitcount: 1834 + { lat: 247, common_pid: 2037 } hitcount: 2655 + { lat: 248, common_pid: 2037 } hitcount: 36 + { lat: 248, common_pid: 2042 } hitcount: 11 + { lat: 248, common_pid: 2038 } hitcount: 122 + { lat: 248, common_pid: 2036 } hitcount: 135 + { lat: 248, common_pid: 2039 } hitcount: 26 + { lat: 248, common_pid: 2041 } hitcount: 503 + { lat: 248, common_pid: 2043 } hitcount: 66 + { lat: 248, common_pid: 2040 } hitcount: 46 + { lat: 249, common_pid: 2037 } hitcount: 29 + { lat: 249, common_pid: 2038 } hitcount: 1 + { lat: 249, common_pid: 2043 } hitcount: 29 + { lat: 249, common_pid: 2039 } hitcount: 8 + { lat: 249, common_pid: 2042 } hitcount: 56 + { lat: 249, common_pid: 2040 } hitcount: 27 + { lat: 249, common_pid: 2041 } hitcount: 11 + { lat: 249, common_pid: 2036 } hitcount: 27 + { lat: 250, common_pid: 2038 } hitcount: 1 + { lat: 250, common_pid: 2036 } hitcount: 30 + { lat: 250, common_pid: 2040 } hitcount: 19 + { lat: 250, common_pid: 2043 } hitcount: 22 + { lat: 250, common_pid: 2042 } hitcount: 20 + { lat: 250, common_pid: 2041 } hitcount: 1 + { lat: 250, common_pid: 2039 } hitcount: 6 + { lat: 250, common_pid: 2037 } hitcount: 48 + { lat: 251, common_pid: 2037 } hitcount: 43 + { lat: 251, common_pid: 2039 } hitcount: 1 + { lat: 251, common_pid: 2036 } hitcount: 12 + { lat: 251, common_pid: 2042 } hitcount: 2 + { lat: 251, common_pid: 2041 } hitcount: 1 + { lat: 251, common_pid: 2043 } hitcount: 15 + { lat: 251, common_pid: 2040 } hitcount: 3 + { lat: 252, common_pid: 2040 } hitcount: 1 + { lat: 252, common_pid: 2036 } hitcount: 12 + { lat: 252, common_pid: 2037 } hitcount: 21 + { lat: 252, common_pid: 2043 } hitcount: 14 + { lat: 253, common_pid: 2037 } hitcount: 21 + { lat: 253, common_pid: 2039 } hitcount: 2 + { lat: 253, common_pid: 2036 } hitcount: 9 + { lat: 253, common_pid: 2043 } hitcount: 6 + { lat: 253, common_pid: 2040 } hitcount: 1 + { lat: 254, common_pid: 2036 } hitcount: 8 + { lat: 254, common_pid: 2043 } hitcount: 3 + { lat: 254, common_pid: 2041 } hitcount: 1 + { lat: 254, common_pid: 2042 } hitcount: 1 + { lat: 254, common_pid: 2039 } hitcount: 1 + { lat: 254, common_pid: 2037 } hitcount: 12 + { lat: 255, common_pid: 2043 } hitcount: 1 + { lat: 255, common_pid: 2037 } hitcount: 2 + { lat: 255, common_pid: 2036 } hitcount: 2 + { lat: 255, common_pid: 2039 } hitcount: 8 + { lat: 256, common_pid: 2043 } hitcount: 1 + { lat: 256, common_pid: 2036 } hitcount: 4 + { lat: 256, common_pid: 2039 } hitcount: 6 + { lat: 257, common_pid: 2039 } hitcount: 5 + { lat: 257, common_pid: 2036 } hitcount: 4 + { lat: 258, common_pid: 2039 } hitcount: 5 + { lat: 258, common_pid: 2036 } hitcount: 2 + { lat: 259, common_pid: 2036 } hitcount: 7 + { lat: 259, common_pid: 2039 } hitcount: 7 + { lat: 260, common_pid: 2036 } hitcount: 8 + { lat: 260, common_pid: 2039 } hitcount: 6 + { lat: 261, common_pid: 2036 } hitcount: 5 + { lat: 261, common_pid: 2039 } hitcount: 7 + { lat: 262, common_pid: 2039 } hitcount: 5 + { lat: 262, common_pid: 2036 } hitcount: 5 + { lat: 263, common_pid: 2039 } hitcount: 7 + { lat: 263, common_pid: 2036 } hitcount: 7 + { lat: 264, common_pid: 2039 } hitcount: 9 + { lat: 264, common_pid: 2036 } hitcount: 9 + { lat: 265, common_pid: 2036 } hitcount: 5 + { lat: 265, common_pid: 2039 } hitcount: 1 + { lat: 266, common_pid: 2036 } hitcount: 1 + { lat: 266, common_pid: 2039 } hitcount: 3 + { lat: 267, common_pid: 2036 } hitcount: 1 + { lat: 267, common_pid: 2039 } hitcount: 3 + { lat: 268, common_pid: 2036 } hitcount: 1 + { lat: 268, common_pid: 2039 } hitcount: 6 + { lat: 269, common_pid: 2036 } hitcount: 1 + { lat: 269, common_pid: 2043 } hitcount: 1 + { lat: 269, common_pid: 2039 } hitcount: 2 + { lat: 270, common_pid: 2040 } hitcount: 1 + { lat: 270, common_pid: 2039 } hitcount: 6 + { lat: 271, common_pid: 2041 } hitcount: 1 + { lat: 271, common_pid: 2039 } hitcount: 5 + { lat: 272, common_pid: 2039 } hitcount: 10 + { lat: 273, common_pid: 2039 } hitcount: 8 + { lat: 274, common_pid: 2039 } hitcount: 2 + { lat: 275, common_pid: 2039 } hitcount: 1 + { lat: 276, common_pid: 2039 } hitcount: 2 + { lat: 276, common_pid: 2037 } hitcount: 1 + { lat: 276, common_pid: 2038 } hitcount: 1 + { lat: 277, common_pid: 2039 } hitcount: 1 + { lat: 277, common_pid: 2042 } hitcount: 1 + { lat: 278, common_pid: 2039 } hitcount: 1 + { lat: 279, common_pid: 2039 } hitcount: 4 + { lat: 279, common_pid: 2043 } hitcount: 1 + { lat: 280, common_pid: 2039 } hitcount: 3 + { lat: 283, common_pid: 2036 } hitcount: 2 + { lat: 284, common_pid: 2039 } hitcount: 1 + { lat: 284, common_pid: 2043 } hitcount: 1 + { lat: 288, common_pid: 2039 } hitcount: 1 + { lat: 289, common_pid: 2039 } hitcount: 1 + { lat: 300, common_pid: 2039 } hitcount: 1 + { lat: 384, common_pid: 2039 } hitcount: 1 + + Totals: + Hits: 67625 + Entries: 278 + Dropped: 0 + +Note, the writes are around the sleep, so ideally they will all be of 250 +microseconds. If you are wondering how there are several that are under +250 microseconds, that is because the way cyclictest works, is if one +iteration comes in late, the next one will set the timer to wake up less that +250. That is, if an iteration came in 50 microseconds late, the next wake up +will be at 200 microseconds. + +But this could easily be done in userspace. To make this even more +interesting, we can mix the histogram between events that happened in the +kernel with trace_marker:: + + # cd /sys/kernel/tracing + # echo 'latency u64 lat' > synthetic_events + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' > events/sched/sched_waking/trigger + # echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).latency($lat) if buf == "end"' > events/ftrace/print/trigger + # echo 'hist:keys=lat,common_pid:sort=lat' > events/synthetic/latency/trigger + +The difference this time is that instead of using the trace_marker to start +the latency, the sched_waking event is used, matching the common_pid for the +trace_marker write with the pid that is being woken by sched_waking. + +After running cyclictest again with the same parameters, we now have:: + + # cat events/synthetic/latency/hist + # event histogram + # + # trigger info: hist:keys=lat,common_pid:vals=hitcount:sort=lat:size=2048 [active] + # + + { lat: 7, common_pid: 2302 } hitcount: 640 + { lat: 7, common_pid: 2299 } hitcount: 42 + { lat: 7, common_pid: 2303 } hitcount: 18 + { lat: 7, common_pid: 2305 } hitcount: 166 + { lat: 7, common_pid: 2306 } hitcount: 1 + { lat: 7, common_pid: 2301 } hitcount: 91 + { lat: 7, common_pid: 2300 } hitcount: 17 + { lat: 8, common_pid: 2303 } hitcount: 8296 + { lat: 8, common_pid: 2304 } hitcount: 6864 + { lat: 8, common_pid: 2305 } hitcount: 9464 + { lat: 8, common_pid: 2301 } hitcount: 9213 + { lat: 8, common_pid: 2306 } hitcount: 6246 + { lat: 8, common_pid: 2302 } hitcount: 8797 + { lat: 8, common_pid: 2299 } hitcount: 8771 + { lat: 8, common_pid: 2300 } hitcount: 8119 + { lat: 9, common_pid: 2305 } hitcount: 1519 + { lat: 9, common_pid: 2299 } hitcount: 2346 + { lat: 9, common_pid: 2303 } hitcount: 2841 + { lat: 9, common_pid: 2301 } hitcount: 1846 + { lat: 9, common_pid: 2304 } hitcount: 3861 + { lat: 9, common_pid: 2302 } hitcount: 1210 + { lat: 9, common_pid: 2300 } hitcount: 2762 + { lat: 9, common_pid: 2306 } hitcount: 4247 + { lat: 10, common_pid: 2299 } hitcount: 16 + { lat: 10, common_pid: 2306 } hitcount: 333 + { lat: 10, common_pid: 2303 } hitcount: 16 + { lat: 10, common_pid: 2304 } hitcount: 168 + { lat: 10, common_pid: 2302 } hitcount: 240 + { lat: 10, common_pid: 2301 } hitcount: 28 + { lat: 10, common_pid: 2300 } hitcount: 95 + { lat: 10, common_pid: 2305 } hitcount: 18 + { lat: 11, common_pid: 2303 } hitcount: 5 + { lat: 11, common_pid: 2305 } hitcount: 8 + { lat: 11, common_pid: 2306 } hitcount: 221 + { lat: 11, common_pid: 2302 } hitcount: 76 + { lat: 11, common_pid: 2304 } hitcount: 26 + { lat: 11, common_pid: 2300 } hitcount: 125 + { lat: 11, common_pid: 2299 } hitcount: 2 + { lat: 12, common_pid: 2305 } hitcount: 3 + { lat: 12, common_pid: 2300 } hitcount: 6 + { lat: 12, common_pid: 2306 } hitcount: 90 + { lat: 12, common_pid: 2302 } hitcount: 4 + { lat: 12, common_pid: 2303 } hitcount: 1 + { lat: 12, common_pid: 2304 } hitcount: 122 + { lat: 13, common_pid: 2300 } hitcount: 12 + { lat: 13, common_pid: 2301 } hitcount: 1 + { lat: 13, common_pid: 2306 } hitcount: 32 + { lat: 13, common_pid: 2302 } hitcount: 5 + { lat: 13, common_pid: 2305 } hitcount: 1 + { lat: 13, common_pid: 2303 } hitcount: 1 + { lat: 13, common_pid: 2304 } hitcount: 61 + { lat: 14, common_pid: 2303 } hitcount: 4 + { lat: 14, common_pid: 2306 } hitcount: 5 + { lat: 14, common_pid: 2305 } hitcount: 4 + { lat: 14, common_pid: 2304 } hitcount: 62 + { lat: 14, common_pid: 2302 } hitcount: 19 + { lat: 14, common_pid: 2300 } hitcount: 33 + { lat: 14, common_pid: 2299 } hitcount: 1 + { lat: 14, common_pid: 2301 } hitcount: 4 + { lat: 15, common_pid: 2305 } hitcount: 1 + { lat: 15, common_pid: 2302 } hitcount: 25 + { lat: 15, common_pid: 2300 } hitcount: 11 + { lat: 15, common_pid: 2299 } hitcount: 5 + { lat: 15, common_pid: 2301 } hitcount: 1 + { lat: 15, common_pid: 2304 } hitcount: 8 + { lat: 15, common_pid: 2303 } hitcount: 1 + { lat: 15, common_pid: 2306 } hitcount: 6 + { lat: 16, common_pid: 2302 } hitcount: 31 + { lat: 16, common_pid: 2306 } hitcount: 3 + { lat: 16, common_pid: 2300 } hitcount: 5 + { lat: 17, common_pid: 2302 } hitcount: 6 + { lat: 17, common_pid: 2303 } hitcount: 1 + { lat: 18, common_pid: 2304 } hitcount: 1 + { lat: 18, common_pid: 2302 } hitcount: 8 + { lat: 18, common_pid: 2299 } hitcount: 1 + { lat: 18, common_pid: 2301 } hitcount: 1 + { lat: 19, common_pid: 2303 } hitcount: 4 + { lat: 19, common_pid: 2304 } hitcount: 5 + { lat: 19, common_pid: 2302 } hitcount: 4 + { lat: 19, common_pid: 2299 } hitcount: 3 + { lat: 19, common_pid: 2306 } hitcount: 1 + { lat: 19, common_pid: 2300 } hitcount: 4 + { lat: 19, common_pid: 2305 } hitcount: 5 + { lat: 20, common_pid: 2299 } hitcount: 2 + { lat: 20, common_pid: 2302 } hitcount: 3 + { lat: 20, common_pid: 2305 } hitcount: 1 + { lat: 20, common_pid: 2300 } hitcount: 2 + { lat: 20, common_pid: 2301 } hitcount: 2 + { lat: 20, common_pid: 2303 } hitcount: 3 + { lat: 21, common_pid: 2305 } hitcount: 1 + { lat: 21, common_pid: 2299 } hitcount: 5 + { lat: 21, common_pid: 2303 } hitcount: 4 + { lat: 21, common_pid: 2302 } hitcount: 7 + { lat: 21, common_pid: 2300 } hitcount: 1 + { lat: 21, common_pid: 2301 } hitcount: 5 + { lat: 21, common_pid: 2304 } hitcount: 2 + { lat: 22, common_pid: 2302 } hitcount: 5 + { lat: 22, common_pid: 2303 } hitcount: 1 + { lat: 22, common_pid: 2306 } hitcount: 3 + { lat: 22, common_pid: 2301 } hitcount: 2 + { lat: 22, common_pid: 2300 } hitcount: 1 + { lat: 22, common_pid: 2299 } hitcount: 1 + { lat: 22, common_pid: 2305 } hitcount: 1 + { lat: 22, common_pid: 2304 } hitcount: 1 + { lat: 23, common_pid: 2299 } hitcount: 1 + { lat: 23, common_pid: 2306 } hitcount: 2 + { lat: 23, common_pid: 2302 } hitcount: 6 + { lat: 24, common_pid: 2302 } hitcount: 3 + { lat: 24, common_pid: 2300 } hitcount: 1 + { lat: 24, common_pid: 2306 } hitcount: 2 + { lat: 24, common_pid: 2305 } hitcount: 1 + { lat: 24, common_pid: 2299 } hitcount: 1 + { lat: 25, common_pid: 2300 } hitcount: 1 + { lat: 25, common_pid: 2302 } hitcount: 4 + { lat: 26, common_pid: 2302 } hitcount: 2 + { lat: 27, common_pid: 2305 } hitcount: 1 + { lat: 27, common_pid: 2300 } hitcount: 1 + { lat: 27, common_pid: 2302 } hitcount: 3 + { lat: 28, common_pid: 2306 } hitcount: 1 + { lat: 28, common_pid: 2302 } hitcount: 4 + { lat: 29, common_pid: 2302 } hitcount: 1 + { lat: 29, common_pid: 2300 } hitcount: 2 + { lat: 29, common_pid: 2306 } hitcount: 1 + { lat: 29, common_pid: 2304 } hitcount: 1 + { lat: 30, common_pid: 2302 } hitcount: 4 + { lat: 31, common_pid: 2302 } hitcount: 6 + { lat: 32, common_pid: 2302 } hitcount: 1 + { lat: 33, common_pid: 2299 } hitcount: 1 + { lat: 33, common_pid: 2302 } hitcount: 3 + { lat: 34, common_pid: 2302 } hitcount: 2 + { lat: 35, common_pid: 2302 } hitcount: 1 + { lat: 35, common_pid: 2304 } hitcount: 1 + { lat: 36, common_pid: 2302 } hitcount: 4 + { lat: 37, common_pid: 2302 } hitcount: 6 + { lat: 38, common_pid: 2302 } hitcount: 2 + { lat: 39, common_pid: 2302 } hitcount: 2 + { lat: 39, common_pid: 2304 } hitcount: 1 + { lat: 40, common_pid: 2304 } hitcount: 2 + { lat: 40, common_pid: 2302 } hitcount: 5 + { lat: 41, common_pid: 2304 } hitcount: 1 + { lat: 41, common_pid: 2302 } hitcount: 8 + { lat: 42, common_pid: 2302 } hitcount: 6 + { lat: 42, common_pid: 2304 } hitcount: 1 + { lat: 43, common_pid: 2302 } hitcount: 3 + { lat: 43, common_pid: 2304 } hitcount: 4 + { lat: 44, common_pid: 2302 } hitcount: 6 + { lat: 45, common_pid: 2302 } hitcount: 5 + { lat: 46, common_pid: 2302 } hitcount: 5 + { lat: 47, common_pid: 2302 } hitcount: 7 + { lat: 48, common_pid: 2301 } hitcount: 1 + { lat: 48, common_pid: 2302 } hitcount: 9 + { lat: 49, common_pid: 2302 } hitcount: 3 + { lat: 50, common_pid: 2302 } hitcount: 1 + { lat: 50, common_pid: 2301 } hitcount: 1 + { lat: 51, common_pid: 2302 } hitcount: 2 + { lat: 51, common_pid: 2301 } hitcount: 1 + { lat: 61, common_pid: 2302 } hitcount: 1 + { lat: 110, common_pid: 2302 } hitcount: 1 + + Totals: + Hits: 89565 + Entries: 158 + Dropped: 0 + +This doesn't tell us any information about how late cyclictest may have +woken up, but it does show us a nice histogram of how long it took from +the time that cyclictest was woken to the time it made it into user space. diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt deleted file mode 100644 index 7ffea6aa22e3..000000000000 --- a/Documentation/trace/histogram.txt +++ /dev/null @@ -1,2539 +0,0 @@ - Event Histograms - - Documentation written by Tom Zanussi - -1. Introduction -=============== - - Histogram triggers are special event triggers that can be used to - aggregate trace event data into histograms. For information on - trace events and event triggers, see Documentation/trace/events.rst. - - -2. Histogram Trigger Command -============================ - - A histogram trigger command is an event trigger command that - aggregates event hits into a hash table keyed on one or more trace - event format fields (or stacktrace) and a set of running totals - derived from one or more trace event format fields and/or event - counts (hitcount). - - The format of a hist trigger is as follows: - - hist:keys=[:values=] - [:sort=][:size=#entries][:pause][:continue] - [:clear][:name=histname1] [if ] - - When a matching event is hit, an entry is added to a hash table - using the key(s) and value(s) named. Keys and values correspond to - fields in the event's format description. Values must correspond to - numeric fields - on an event hit, the value(s) will be added to a - sum kept for that field. The special string 'hitcount' can be used - in place of an explicit value field - this is simply a count of - event hits. If 'values' isn't specified, an implicit 'hitcount' - value will be automatically created and used as the only value. - Keys can be any field, or the special string 'stacktrace', which - will use the event's kernel stacktrace as the key. The keywords - 'keys' or 'key' can be used to specify keys, and the keywords - 'values', 'vals', or 'val' can be used to specify values. Compound - keys consisting of up to two fields can be specified by the 'keys' - keyword. Hashing a compound key produces a unique entry in the - table for each unique combination of component keys, and can be - useful for providing more fine-grained summaries of event data. - Additionally, sort keys consisting of up to two fields can be - specified by the 'sort' keyword. If more than one field is - specified, the result will be a 'sort within a sort': the first key - is taken to be the primary sort key and the second the secondary - key. If a hist trigger is given a name using the 'name' parameter, - its histogram data will be shared with other triggers of the same - name, and trigger hits will update this common data. Only triggers - with 'compatible' fields can be combined in this way; triggers are - 'compatible' if the fields named in the trigger share the same - number and type of fields and those fields also have the same names. - Note that any two events always share the compatible 'hitcount' and - 'stacktrace' fields and can therefore be combined using those - fields, however pointless that may be. - - 'hist' triggers add a 'hist' file to each event's subdirectory. - Reading the 'hist' file for the event will dump the hash table in - its entirety to stdout. If there are multiple hist triggers - attached to an event, there will be a table for each trigger in the - output. The table displayed for a named trigger will be the same as - any other instance having the same name. Each printed hash table - entry is a simple list of the keys and values comprising the entry; - keys are printed first and are delineated by curly braces, and are - followed by the set of value fields for the entry. By default, - numeric fields are displayed as base-10 integers. This can be - modified by appending any of the following modifiers to the field - name: - - .hex display a number as a hex value - .sym display an address as a symbol - .sym-offset display an address as a symbol and offset - .syscall display a syscall id as a system call name - .execname display a common_pid as a program name - .log2 display log2 value rather than raw number - .usecs display a common_timestamp in microseconds - - Note that in general the semantics of a given field aren't - interpreted when applying a modifier to it, but there are some - restrictions to be aware of in this regard: - - - only the 'hex' modifier can be used for values (because values - are essentially sums, and the other modifiers don't make sense - in that context). - - the 'execname' modifier can only be used on a 'common_pid'. The - reason for this is that the execname is simply the 'comm' value - saved for the 'current' process when an event was triggered, - which is the same as the common_pid value saved by the event - tracing code. Trying to apply that comm value to other pid - values wouldn't be correct, and typically events that care save - pid-specific comm fields in the event itself. - - A typical usage scenario would be the following to enable a hist - trigger, read its current contents, and then turn it off: - - # echo 'hist:keys=skbaddr.hex:vals=len' > \ - /sys/kernel/debug/tracing/events/net/netif_rx/trigger - - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist - - # echo '!hist:keys=skbaddr.hex:vals=len' > \ - /sys/kernel/debug/tracing/events/net/netif_rx/trigger - - The trigger file itself can be read to show the details of the - currently attached hist trigger. This information is also displayed - at the top of the 'hist' file when read. - - By default, the size of the hash table is 2048 entries. The 'size' - parameter can be used to specify more or fewer than that. The units - are in terms of hashtable entries - if a run uses more entries than - specified, the results will show the number of 'drops', the number - of hits that were ignored. The size should be a power of 2 between - 128 and 131072 (any non- power-of-2 number specified will be rounded - up). - - The 'sort' parameter can be used to specify a value field to sort - on. The default if unspecified is 'hitcount' and the default sort - order is 'ascending'. To sort in the opposite direction, append - .descending' to the sort key. - - The 'pause' parameter can be used to pause an existing hist trigger - or to start a hist trigger but not log any events until told to do - so. 'continue' or 'cont' can be used to start or restart a paused - hist trigger. - - The 'clear' parameter will clear the contents of a running hist - trigger and leave its current paused/active state. - - Note that the 'pause', 'cont', and 'clear' parameters should be - applied using 'append' shell operator ('>>') if applied to an - existing trigger, rather than via the '>' operator, which will cause - the trigger to be removed through truncation. - -- enable_hist/disable_hist - - The enable_hist and disable_hist triggers can be used to have one - event conditionally start and stop another event's already-attached - hist trigger. Any number of enable_hist and disable_hist triggers - can be attached to a given event, allowing that event to kick off - and stop aggregations on a host of other events. - - The format is very similar to the enable/disable_event triggers: - - enable_hist::[:count] - disable_hist::[:count] - - Instead of enabling or disabling the tracing of the target event - into the trace buffer as the enable/disable_event triggers do, the - enable/disable_hist triggers enable or disable the aggregation of - the target event into a hash table. - - A typical usage scenario for the enable_hist/disable_hist triggers - would be to first set up a paused hist trigger on some event, - followed by an enable_hist/disable_hist pair that turns the hist - aggregation on and off when conditions of interest are hit: - - # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - - # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger - - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger - - The above sets up an initially paused hist trigger which is unpaused - and starts aggregating events when a given program is executed, and - which stops aggregating when the process exits and the hist trigger - is paused again. - - The examples below provide a more concrete illustration of the - concepts and typical usage patterns discussed above. - - 'special' event fields - ------------------------ - - There are a number of 'special event fields' available for use as - keys or values in a hist trigger. These look like and behave as if - they were actual event fields, but aren't really part of the event's - field definition or format file. They are however available for any - event, and can be used anywhere an actual event field could be. - They are: - - common_timestamp u64 - timestamp (from ring buffer) associated - with the event, in nanoseconds. May be - modified by .usecs to have timestamps - interpreted as microseconds. - cpu int - the cpu on which the event occurred. - - Extended error information - -------------------------- - - For some error conditions encountered when invoking a hist trigger - command, extended error information is available via the - corresponding event's 'hist' file. Reading the hist file after an - error will display more detailed information about what went wrong, - if information is available. This extended error information will - be available until the next hist trigger command for that event. - - If available for a given error condition, the extended error - information and usage takes the following form: - - # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger - echo: write error: Invalid argument - - # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist - ERROR: Couldn't yyy: zzz - Last command: xxx - -6.2 'hist' trigger examples ---------------------------- - - The first set of examples creates aggregations using the kmalloc - event. The fields that can be used for the hist trigger are listed - in the kmalloc event's format file: - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format - name: kmalloc - ID: 374 - format: - field:unsigned short common_type; offset:0; size:2; signed:0; - field:unsigned char common_flags; offset:2; size:1; signed:0; - field:unsigned char common_preempt_count; offset:3; size:1; signed:0; - field:int common_pid; offset:4; size:4; signed:1; - - field:unsigned long call_site; offset:8; size:8; signed:0; - field:const void * ptr; offset:16; size:8; signed:0; - field:size_t bytes_req; offset:24; size:8; signed:0; - field:size_t bytes_alloc; offset:32; size:8; signed:0; - field:gfp_t gfp_flags; offset:40; size:4; signed:0; - - We'll start by creating a hist trigger that generates a simple table - that lists the total number of bytes requested for each function in - the kernel that made one or more calls to kmalloc: - - # echo 'hist:key=call_site:val=bytes_req' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - This tells the tracing system to create a 'hist' trigger using the - call_site field of the kmalloc event as the key for the table, which - just means that each unique call_site address will have an entry - created for it in the table. The 'val=bytes_req' parameter tells - the hist trigger that for each unique entry (call_site) in the - table, it should keep a running total of the number of bytes - requested by that call_site. - - We'll let it run for awhile and then dump the contents of the 'hist' - file in the kmalloc event's subdirectory (for readability, a number - of entries have been omitted): - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active] - - { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176 - { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024 - { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384 - { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24 - { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8 - { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152 - { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144 - { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144 - { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560 - { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736 - . - . - . - { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576 - { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336 - { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504 - { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584 - { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448 - { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720 - { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088 - { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920 - { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716 - { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712 - { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160 - { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520 - - Totals: - Hits: 4610 - Entries: 45 - Dropped: 0 - - The output displays a line for each entry, beginning with the key - specified in the trigger, followed by the value(s) also specified in - the trigger. At the beginning of the output is a line that displays - the trigger info, which can also be displayed by reading the - 'trigger' file: - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active] - - At the end of the output are a few lines that display the overall - totals for the run. The 'Hits' field shows the total number of - times the event trigger was hit, the 'Entries' field shows the total - number of used entries in the hash table, and the 'Dropped' field - shows the number of hits that were dropped because the number of - used entries for the run exceeded the maximum number of entries - allowed for the table (normally 0, but if not a hint that you may - want to increase the size of the table using the 'size' parameter). - - Notice in the above output that there's an extra field, 'hitcount', - which wasn't specified in the trigger. Also notice that in the - trigger info output, there's a parameter, 'sort=hitcount', which - wasn't specified in the trigger either. The reason for that is that - every trigger implicitly keeps a count of the total number of hits - attributed to a given entry, called the 'hitcount'. That hitcount - information is explicitly displayed in the output, and in the - absence of a user-specified sort parameter, is used as the default - sort field. - - The value 'hitcount' can be used in place of an explicit value in - the 'values' parameter if you don't really need to have any - particular field summed and are mainly interested in hit - frequencies. - - To turn the hist trigger off, simply call up the trigger in the - command history and re-execute it with a '!' prepended: - - # echo '!hist:key=call_site:val=bytes_req' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - Finally, notice that the call_site as displayed in the output above - isn't really very useful. It's an address, but normally addresses - are displayed in hex. To have a numeric field displayed as a hex - value, simply append '.hex' to the field name in the trigger: - - # echo 'hist:key=call_site.hex:val=bytes_req' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active] - - { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433 - { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176 - { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384 - { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8 - { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511 - { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12 - { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152 - { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24 - { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144 - { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648 - { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144 - { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544 - . - . - . - { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024 - { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680 - { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112 - { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232 - { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360 - { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640 - { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600 - { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584 - { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656 - { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456 - { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600 - - Totals: - Hits: 4775 - Entries: 46 - Dropped: 0 - - Even that's only marginally more useful - while hex values do look - more like addresses, what users are typically more interested in - when looking at text addresses are the corresponding symbols - instead. To have an address displayed as symbolic value instead, - simply append '.sym' or '.sym-offset' to the field name in the - trigger: - - # echo 'hist:key=call_site.sym:val=bytes_req' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active] - - { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 - { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 - { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40 - { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 - { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528 - { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624 - { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96 - { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464 - { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 - { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424 - . - . - . - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672 - { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208 - { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248 - { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584 - { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265 - - Totals: - Hits: 109928 - Entries: 71 - Dropped: 0 - - Because the default sort key above is 'hitcount', the above shows a - the list of call_sites by increasing hitcount, so that at the bottom - we see the functions that made the most kmalloc calls during the - run. If instead we we wanted to see the top kmalloc callers in - terms of the number of bytes requested rather than the number of - calls, and we wanted the top caller to appear at the top, we can use - the 'sort' parameter, along with the 'descending' modifier: - - # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active] - - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135 - { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784 - { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992 - { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664 - { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632 - . - . - . - { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 - { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48 - { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48 - { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48 - { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40 - { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 - - Totals: - Hits: 32133 - Entries: 81 - Dropped: 0 - - To display the offset and size information in addition to the symbol - name, just use 'sym-offset' instead: - - # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active] - - { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720 - { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936 - { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832 - { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040 - { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696 - { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640 - { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456 - . - . - . - { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128 - { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96 - { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96 - { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84 - { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8 - { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7 - { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7 - - Totals: - Hits: 26098 - Entries: 64 - Dropped: 0 - - We can also add multiple fields to the 'values' parameter. For - example, we might want to see the total number of bytes allocated - alongside bytes requested, and display the result sorted by bytes - allocated in a descending order: - - # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active] - - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016 - { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744 - { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304 - { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640 - { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432 - . - . - . - { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192 - { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 - { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 - { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 - { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96 - { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8 - - Totals: - Hits: 66598 - Entries: 65 - Dropped: 0 - - Finally, to finish off our kmalloc example, instead of simply having - the hist trigger display symbolic call_sites, we can have the hist - trigger additionally display the complete set of kernel stack traces - that led to each call_site. To do that, we simply use the special - value 'stacktrace' for the key parameter: - - # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger - - The above trigger will use the kernel stack trace in effect when an - event is triggered as the key for the hash table. This allows the - enumeration of every kernel callpath that led up to a particular - event, along with a running total of any of the event fields for - that event. Here we tally bytes requested and bytes allocated for - every callpath in the system that led up to a kmalloc (in this case - every callpath to a kmalloc for a kernel compile): - - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] - - { stacktrace: - __kmalloc_track_caller+0x10b/0x1a0 - kmemdup+0x20/0x50 - hidraw_report_event+0x8a/0x120 [hid] - hid_report_raw_event+0x3ea/0x440 [hid] - hid_input_report+0x112/0x190 [hid] - hid_irq_in+0xc2/0x260 [usbhid] - __usb_hcd_giveback_urb+0x72/0x120 - usb_giveback_urb_bh+0x9e/0xe0 - tasklet_hi_action+0xf8/0x100 - __do_softirq+0x114/0x2c0 - irq_exit+0xa5/0xb0 - do_IRQ+0x5a/0xf0 - ret_from_intr+0x0/0x30 - cpuidle_enter+0x17/0x20 - cpu_startup_entry+0x315/0x3e0 - rest_init+0x7c/0x80 - } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: - __kmalloc_track_caller+0x10b/0x1a0 - kmemdup+0x20/0x50 - hidraw_report_event+0x8a/0x120 [hid] - hid_report_raw_event+0x3ea/0x440 [hid] - hid_input_report+0x112/0x190 [hid] - hid_irq_in+0xc2/0x260 [usbhid] - __usb_hcd_giveback_urb+0x72/0x120 - usb_giveback_urb_bh+0x9e/0xe0 - tasklet_hi_action+0xf8/0x100 - __do_softirq+0x114/0x2c0 - irq_exit+0xa5/0xb0 - do_IRQ+0x5a/0xf0 - ret_from_intr+0x0/0x30 - } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: - kmem_cache_alloc_trace+0xeb/0x150 - aa_alloc_task_context+0x27/0x40 - apparmor_cred_prepare+0x1f/0x50 - security_prepare_creds+0x16/0x20 - prepare_creds+0xdf/0x1a0 - SyS_capset+0xb5/0x200 - system_call_fastpath+0x12/0x6a - } hitcount: 1 bytes_req: 32 bytes_alloc: 32 - . - . - . - { stacktrace: - __kmalloc+0x11b/0x1b0 - i915_gem_execbuffer2+0x6c/0x2c0 [i915] - drm_ioctl+0x349/0x670 [drm] - do_vfs_ioctl+0x2f0/0x4f0 - SyS_ioctl+0x81/0xa0 - system_call_fastpath+0x12/0x6a - } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808 - { stacktrace: - __kmalloc+0x11b/0x1b0 - load_elf_phdrs+0x76/0xa0 - load_elf_binary+0x102/0x1650 - search_binary_handler+0x97/0x1d0 - do_execveat_common.isra.34+0x551/0x6e0 - SyS_execve+0x3a/0x50 - return_from_execve+0x0/0x23 - } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048 - { stacktrace: - kmem_cache_alloc_trace+0xeb/0x150 - apparmor_file_alloc_security+0x27/0x40 - security_file_alloc+0x16/0x20 - get_empty_filp+0x93/0x1c0 - path_openat+0x31/0x5f0 - do_filp_open+0x3a/0x90 - do_sys_open+0x128/0x220 - SyS_open+0x1e/0x20 - system_call_fastpath+0x12/0x6a - } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376 - { stacktrace: - __kmalloc+0x11b/0x1b0 - seq_buf_alloc+0x1b/0x50 - seq_read+0x2cc/0x370 - proc_reg_read+0x3d/0x80 - __vfs_read+0x28/0xe0 - vfs_read+0x86/0x140 - SyS_read+0x46/0xb0 - system_call_fastpath+0x12/0x6a - } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768 - - Totals: - Hits: 6085872 - Entries: 253 - Dropped: 0 - - If you key a hist trigger on common_pid, in order for example to - gather and display sorted totals for each process, you can use the - special .execname modifier to display the executable names for the - processes in the table rather than raw pids. The example below - keeps a per-process sum of total bytes read: - - # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \ - /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger - - # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist - # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active] - - { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512 - { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640 - { common_pid: compiz [ 2889] } hitcount: 59 count: 254400 - { common_pid: bash [ 8710] } hitcount: 3 count: 66369 - { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739 - { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648 - { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216 - { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396 - { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264 - { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424 - { common_pid: gmain [ 1315] } hitcount: 18 count: 6336 - . - . - . - { common_pid: postgres [ 1892] } hitcount: 2 count: 32 - { common_pid: postgres [ 1891] } hitcount: 2 count: 32 - { common_pid: gmain [ 8704] } hitcount: 2 count: 32 - { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21 - { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16 - { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16 - { common_pid: gdbus [ 2998] } hitcount: 1 count: 16 - { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8 - { common_pid: init [ 1] } hitcount: 2 count: 2 - - Totals: - Hits: 2116 - Entries: 51 - Dropped: 0 - - Similarly, if you key a hist trigger on syscall id, for example to - gather and display a list of systemwide syscall hits, you can use - the special .syscall modifier to display the syscall names rather - than raw ids. The example below keeps a running total of syscall - counts for the system during the run: - - # echo 'hist:key=id.syscall:val=hitcount' > \ - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger - - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist - # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active] - - { id: sys_fsync [ 74] } hitcount: 1 - { id: sys_newuname [ 63] } hitcount: 1 - { id: sys_prctl [157] } hitcount: 1 - { id: sys_statfs [137] } hitcount: 1 - { id: sys_symlink [ 88] } hitcount: 1 - { id: sys_sendmmsg [307] } hitcount: 1 - { id: sys_semctl [ 66] } hitcount: 1 - { id: sys_readlink [ 89] } hitcount: 3 - { id: sys_bind [ 49] } hitcount: 3 - { id: sys_getsockname [ 51] } hitcount: 3 - { id: sys_unlink [ 87] } hitcount: 3 - { id: sys_rename [ 82] } hitcount: 4 - { id: unknown_syscall [ 58] } hitcount: 4 - { id: sys_connect [ 42] } hitcount: 4 - { id: sys_getpid [ 39] } hitcount: 4 - . - . - . - { id: sys_rt_sigprocmask [ 14] } hitcount: 952 - { id: sys_futex [202] } hitcount: 1534 - { id: sys_write [ 1] } hitcount: 2689 - { id: sys_setitimer [ 38] } hitcount: 2797 - { id: sys_read [ 0] } hitcount: 3202 - { id: sys_select [ 23] } hitcount: 3773 - { id: sys_writev [ 20] } hitcount: 4531 - { id: sys_poll [ 7] } hitcount: 8314 - { id: sys_recvmsg [ 47] } hitcount: 13738 - { id: sys_ioctl [ 16] } hitcount: 21843 - - Totals: - Hits: 67612 - Entries: 72 - Dropped: 0 - - The syscall counts above provide a rough overall picture of system - call activity on the system; we can see for example that the most - popular system call on this system was the 'sys_ioctl' system call. - - We can use 'compound' keys to refine that number and provide some - further insight as to which processes exactly contribute to the - overall ioctl count. - - The command below keeps a hitcount for every unique combination of - system call id and pid - the end result is essentially a table - that keeps a per-pid sum of system call hits. The results are - sorted using the system call id as the primary key, and the - hitcount sum as the secondary key: - - # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \ - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger - - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist - # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active] - - { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1 - { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1 - { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1 - { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1 - { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2 - { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2 - { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2 - { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2 - { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2 - { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2 - . - . - . - { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12 - { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16 - { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808 - { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580 - . - . - . - { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3 - { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6 - { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2 - { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4 - { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6 - - Totals: - Hits: 31536 - Entries: 323 - Dropped: 0 - - The above list does give us a breakdown of the ioctl syscall by - pid, but it also gives us quite a bit more than that, which we - don't really care about at the moment. Since we know the syscall - id for sys_ioctl (16, displayed next to the sys_ioctl name), we - can use that to filter out all the other syscalls: - - # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \ - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger - - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist - # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active] - - { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1 - { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1 - . - . - . - { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45 - { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48 - { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48 - { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66 - { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674 - { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443 - - Totals: - Hits: 101162 - Entries: 103 - Dropped: 0 - - The above output shows that 'compiz' and 'Xorg' are far and away - the heaviest ioctl callers (which might lead to questions about - whether they really need to be making all those calls and to - possible avenues for further investigation.) - - The compound key examples used a key and a sum value (hitcount) to - sort the output, but we can just as easily use two keys instead. - Here's an example where we use a compound key composed of the the - common_pid and size event fields. Sorting with pid as the primary - key and 'size' as the secondary key allows us to display an - ordered summary of the recvfrom sizes, with counts, received by - each process: - - # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \ - /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger - - # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist - # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active] - - { common_pid: smbd [ 784], size: 4 } hitcount: 1 - { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672 - { common_pid: postgres [ 1796], size: 1000 } hitcount: 6 - { common_pid: postgres [ 1867], size: 1000 } hitcount: 10 - { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2 - { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1 - { common_pid: compiz [ 2994], size: 8 } hitcount: 1 - { common_pid: compiz [ 2994], size: 20 } hitcount: 11 - { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2 - { common_pid: firefox [ 8817], size: 4 } hitcount: 1 - { common_pid: firefox [ 8817], size: 8 } hitcount: 5 - { common_pid: firefox [ 8817], size: 588 } hitcount: 2 - { common_pid: firefox [ 8817], size: 628 } hitcount: 1 - { common_pid: firefox [ 8817], size: 6944 } hitcount: 1 - { common_pid: firefox [ 8817], size: 408880 } hitcount: 2 - { common_pid: firefox [ 8822], size: 8 } hitcount: 2 - { common_pid: firefox [ 8822], size: 160 } hitcount: 2 - { common_pid: firefox [ 8822], size: 320 } hitcount: 2 - { common_pid: firefox [ 8822], size: 352 } hitcount: 1 - . - . - . - { common_pid: pool [ 8923], size: 1960 } hitcount: 10 - { common_pid: pool [ 8923], size: 2048 } hitcount: 10 - { common_pid: pool [ 8924], size: 1960 } hitcount: 10 - { common_pid: pool [ 8924], size: 2048 } hitcount: 10 - { common_pid: pool [ 8928], size: 1964 } hitcount: 4 - { common_pid: pool [ 8928], size: 1965 } hitcount: 2 - { common_pid: pool [ 8928], size: 2048 } hitcount: 6 - { common_pid: pool [ 8929], size: 1982 } hitcount: 1 - { common_pid: pool [ 8929], size: 2048 } hitcount: 1 - - Totals: - Hits: 2016 - Entries: 224 - Dropped: 0 - - The above example also illustrates the fact that although a compound - key is treated as a single entity for hashing purposes, the sub-keys - it's composed of can be accessed independently. - - The next example uses a string field as the hash key and - demonstrates how you can manually pause and continue a hist trigger. - In this example, we'll aggregate fork counts and don't expect a - large number of entries in the hash table, so we'll drop it to a - much smaller number, say 256: - - # echo 'hist:key=child_comm:val=hitcount:size=256' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger - - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active] - - { child_comm: dconf worker } hitcount: 1 - { child_comm: ibus-daemon } hitcount: 1 - { child_comm: whoopsie } hitcount: 1 - { child_comm: smbd } hitcount: 1 - { child_comm: gdbus } hitcount: 1 - { child_comm: kthreadd } hitcount: 1 - { child_comm: dconf worker } hitcount: 1 - { child_comm: evolution-alarm } hitcount: 2 - { child_comm: Socket Thread } hitcount: 2 - { child_comm: postgres } hitcount: 2 - { child_comm: bash } hitcount: 3 - { child_comm: compiz } hitcount: 3 - { child_comm: evolution-sourc } hitcount: 4 - { child_comm: dhclient } hitcount: 4 - { child_comm: pool } hitcount: 5 - { child_comm: nm-dispatcher.a } hitcount: 8 - { child_comm: firefox } hitcount: 8 - { child_comm: dbus-daemon } hitcount: 8 - { child_comm: glib-pacrunner } hitcount: 10 - { child_comm: evolution } hitcount: 23 - - Totals: - Hits: 89 - Entries: 20 - Dropped: 0 - - If we want to pause the hist trigger, we can simply append :pause to - the command that started the trigger. Notice that the trigger info - displays as [paused]: - - # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \ - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger - - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused] - - { child_comm: dconf worker } hitcount: 1 - { child_comm: kthreadd } hitcount: 1 - { child_comm: dconf worker } hitcount: 1 - { child_comm: gdbus } hitcount: 1 - { child_comm: ibus-daemon } hitcount: 1 - { child_comm: Socket Thread } hitcount: 2 - { child_comm: evolution-alarm } hitcount: 2 - { child_comm: smbd } hitcount: 2 - { child_comm: bash } hitcount: 3 - { child_comm: whoopsie } hitcount: 3 - { child_comm: compiz } hitcount: 3 - { child_comm: evolution-sourc } hitcount: 4 - { child_comm: pool } hitcount: 5 - { child_comm: postgres } hitcount: 6 - { child_comm: firefox } hitcount: 8 - { child_comm: dhclient } hitcount: 10 - { child_comm: emacs } hitcount: 12 - { child_comm: dbus-daemon } hitcount: 20 - { child_comm: nm-dispatcher.a } hitcount: 20 - { child_comm: evolution } hitcount: 35 - { child_comm: glib-pacrunner } hitcount: 59 - - Totals: - Hits: 199 - Entries: 21 - Dropped: 0 - - To manually continue having the trigger aggregate events, append - :cont instead. Notice that the trigger info displays as [active] - again, and the data has changed: - - # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \ - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger - - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active] - - { child_comm: dconf worker } hitcount: 1 - { child_comm: dconf worker } hitcount: 1 - { child_comm: kthreadd } hitcount: 1 - { child_comm: gdbus } hitcount: 1 - { child_comm: ibus-daemon } hitcount: 1 - { child_comm: Socket Thread } hitcount: 2 - { child_comm: evolution-alarm } hitcount: 2 - { child_comm: smbd } hitcount: 2 - { child_comm: whoopsie } hitcount: 3 - { child_comm: compiz } hitcount: 3 - { child_comm: evolution-sourc } hitcount: 4 - { child_comm: bash } hitcount: 5 - { child_comm: pool } hitcount: 5 - { child_comm: postgres } hitcount: 6 - { child_comm: firefox } hitcount: 8 - { child_comm: dhclient } hitcount: 11 - { child_comm: emacs } hitcount: 12 - { child_comm: dbus-daemon } hitcount: 22 - { child_comm: nm-dispatcher.a } hitcount: 22 - { child_comm: evolution } hitcount: 35 - { child_comm: glib-pacrunner } hitcount: 59 - - Totals: - Hits: 206 - Entries: 21 - Dropped: 0 - - The previous example showed how to start and stop a hist trigger by - appending 'pause' and 'continue' to the hist trigger command. A - hist trigger can also be started in a paused state by initially - starting the trigger with ':pause' appended. This allows you to - start the trigger only when you're ready to start collecting data - and not before. For example, you could start the trigger in a - paused state, then unpause it and do something you want to measure, - then pause the trigger again when done. - - Of course, doing this manually can be difficult and error-prone, but - it is possible to automatically start and stop a hist trigger based - on some condition, via the enable_hist and disable_hist triggers. - - For example, suppose we wanted to take a look at the relative - weights in terms of skb length for each callpath that leads to a - netif_receieve_skb event when downloading a decent-sized file using - wget. - - First we set up an initially paused stacktrace trigger on the - netif_receive_skb event: - - # echo 'hist:key=stacktrace:vals=len:pause' > \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - - Next, we set up an 'enable_hist' trigger on the sched_process_exec - event, with an 'if filename==/usr/bin/wget' filter. The effect of - this new trigger is that it will 'unpause' the hist trigger we just - set up on netif_receive_skb if and only if it sees a - sched_process_exec event with a filename of '/usr/bin/wget'. When - that happens, all netif_receive_skb events are aggregated into a - hash table keyed on stacktrace: - - # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger - - The aggregation continues until the netif_receive_skb is paused - again, which is what the following disable_hist event does by - creating a similar setup on the sched_process_exit event, using the - filter 'comm==wget': - - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger - - Whenever a process exits and the comm field of the disable_hist - trigger filter matches 'comm==wget', the netif_receive_skb hist - trigger is disabled. - - The overall effect is that netif_receive_skb events are aggregated - into the hash table for only the duration of the wget. Executing a - wget command and then listing the 'hist' file will display the - output generated by the wget command: - - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz - - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] - - { stacktrace: - __netif_receive_skb_core+0x46d/0x990 - __netif_receive_skb+0x18/0x60 - netif_receive_skb_internal+0x23/0x90 - napi_gro_receive+0xc8/0x100 - ieee80211_deliver_skb+0xd6/0x270 [mac80211] - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] - ieee80211_rx+0x31d/0x900 [mac80211] - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] - iwl_rx_dispatch+0x8e/0xf0 [iwldvm] - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] - irq_thread_fn+0x20/0x50 - irq_thread+0x11f/0x150 - kthread+0xd2/0xf0 - ret_from_fork+0x42/0x70 - } hitcount: 85 len: 28884 - { stacktrace: - __netif_receive_skb_core+0x46d/0x990 - __netif_receive_skb+0x18/0x60 - netif_receive_skb_internal+0x23/0x90 - napi_gro_complete+0xa4/0xe0 - dev_gro_receive+0x23a/0x360 - napi_gro_receive+0x30/0x100 - ieee80211_deliver_skb+0xd6/0x270 [mac80211] - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] - ieee80211_rx+0x31d/0x900 [mac80211] - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] - iwl_rx_dispatch+0x8e/0xf0 [iwldvm] - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] - irq_thread_fn+0x20/0x50 - irq_thread+0x11f/0x150 - kthread+0xd2/0xf0 - } hitcount: 98 len: 664329 - { stacktrace: - __netif_receive_skb_core+0x46d/0x990 - __netif_receive_skb+0x18/0x60 - process_backlog+0xa8/0x150 - net_rx_action+0x15d/0x340 - __do_softirq+0x114/0x2c0 - do_softirq_own_stack+0x1c/0x30 - do_softirq+0x65/0x70 - __local_bh_enable_ip+0xb5/0xc0 - ip_finish_output+0x1f4/0x840 - ip_output+0x6b/0xc0 - ip_local_out_sk+0x31/0x40 - ip_send_skb+0x1a/0x50 - udp_send_skb+0x173/0x2a0 - udp_sendmsg+0x2bf/0x9f0 - inet_sendmsg+0x64/0xa0 - sock_sendmsg+0x3d/0x50 - } hitcount: 115 len: 13030 - { stacktrace: - __netif_receive_skb_core+0x46d/0x990 - __netif_receive_skb+0x18/0x60 - netif_receive_skb_internal+0x23/0x90 - napi_gro_complete+0xa4/0xe0 - napi_gro_flush+0x6d/0x90 - iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi] - irq_thread_fn+0x20/0x50 - irq_thread+0x11f/0x150 - kthread+0xd2/0xf0 - ret_from_fork+0x42/0x70 - } hitcount: 934 len: 5512212 - - Totals: - Hits: 1232 - Entries: 4 - Dropped: 0 - - The above shows all the netif_receive_skb callpaths and their total - lengths for the duration of the wget command. - - The 'clear' hist trigger param can be used to clear the hash table. - Suppose we wanted to try another run of the previous example but - this time also wanted to see the complete list of events that went - into the histogram. In order to avoid having to set everything up - again, we can just clear the histogram first: - - # echo 'hist:key=stacktrace:vals=len:clear' >> \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - - Just to verify that it is in fact cleared, here's what we now see in - the hist file: - - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] - - Totals: - Hits: 0 - Entries: 0 - Dropped: 0 - - Since we want to see the detailed list of every netif_receive_skb - event occurring during the new run, which are in fact the same - events being aggregated into the hash table, we add some additional - 'enable_event' events to the triggering sched_process_exec and - sched_process_exit events as such: - - # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger - - # echo 'disable_event:net:netif_receive_skb if comm==wget' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger - - If you read the trigger files for the sched_process_exec and - sched_process_exit triggers, you should see two triggers for each: - one enabling/disabling the hist aggregation and the other - enabling/disabling the logging of events: - - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger - enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget - enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget - - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger - enable_event:net:netif_receive_skb:unlimited if comm==wget - disable_hist:net:netif_receive_skb:unlimited if comm==wget - - In other words, whenever either of the sched_process_exec or - sched_process_exit events is hit and matches 'wget', it enables or - disables both the histogram and the event log, and what you end up - with is a hash table and set of events just covering the specified - duration. Run the wget command again: - - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz - - Displaying the 'hist' file should show something similar to what you - saw in the last run, but this time you should also see the - individual events in the trace file: - - # cat /sys/kernel/debug/tracing/trace - - # tracer: nop - # - # entries-in-buffer/entries-written: 183/1426 #P:4 - # - # _-----=> irqs-off - # / _----=> need-resched - # | / _---=> hardirq/softirq - # || / _--=> preempt-depth - # ||| / delay - # TASK-PID CPU# |||| TIMESTAMP FUNCTION - # | | | |||| | | - wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60 - wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60 - dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130 - dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138 - ##### CPU 2 buffer started #### - irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948 - irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500 - irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948 - irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948 - irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500 - . - . - . - - The following example demonstrates how multiple hist triggers can be - attached to a given event. This capability can be useful for - creating a set of different summaries derived from the same set of - events, or for comparing the effects of different filters, among - other things. - - # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - # echo 'hist:keys=skbaddr.hex:vals=len' >> \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - # echo 'hist:keys=len:vals=common_preempt_count' >> \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - - The above set of commands create four triggers differing only in - their filters, along with a completely different though fairly - nonsensical trigger. Note that in order to append multiple hist - triggers to the same file, you should use the '>>' operator to - append them ('>' will also add the new hist trigger, but will remove - any existing hist triggers beforehand). - - Displaying the contents of the 'hist' file for the event shows the - contents of all five histograms: - - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist - - # event histogram - # - # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active] - # - - { len: 176 } hitcount: 1 common_preempt_count: 0 - { len: 223 } hitcount: 1 common_preempt_count: 0 - { len: 4854 } hitcount: 1 common_preempt_count: 0 - { len: 395 } hitcount: 1 common_preempt_count: 0 - { len: 177 } hitcount: 1 common_preempt_count: 0 - { len: 446 } hitcount: 1 common_preempt_count: 0 - { len: 1601 } hitcount: 1 common_preempt_count: 0 - . - . - . - { len: 1280 } hitcount: 66 common_preempt_count: 0 - { len: 116 } hitcount: 81 common_preempt_count: 40 - { len: 708 } hitcount: 112 common_preempt_count: 0 - { len: 46 } hitcount: 221 common_preempt_count: 0 - { len: 1264 } hitcount: 458 common_preempt_count: 0 - - Totals: - Hits: 1428 - Entries: 147 - Dropped: 0 - - - # event histogram - # - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] - # - - { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130 - { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280 - { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280 - { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115 - { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46 - { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118 - { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60 - { skbaddr: ffff880100065900 } hitcount: 1 len: 46 - { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116 - { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280 - { skbaddr: ffff880100064700 } hitcount: 1 len: 365 - { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60 - . - . - . - { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677 - { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052 - { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589 - { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326 - { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678 - { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678 - { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589 - { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307 - { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032 - - Totals: - Hits: 1451 - Entries: 318 - Dropped: 0 - - - # event histogram - # - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active] - # - - - Totals: - Hits: 0 - Entries: 0 - Dropped: 0 - - - # event histogram - # - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active] - # - - { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212 - { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212 - { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212 - { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492 - { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212 - { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212 - { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854 - { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636 - { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924 - { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356 - { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420 - { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996 - - Totals: - Hits: 14 - Entries: 12 - Dropped: 0 - - - # event histogram - # - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active] - # - - - Totals: - Hits: 0 - Entries: 0 - Dropped: 0 - - Named triggers can be used to have triggers share a common set of - histogram data. This capability is mostly useful for combining the - output of events generated by tracepoints contained inside inline - functions, but names can be used in a hist trigger on any event. - For example, these two triggers when hit will update the same 'len' - field in the shared 'foo' histogram data: - - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ - /sys/kernel/debug/tracing/events/net/netif_rx/trigger - - You can see that they're updating common histogram data by reading - each event's hist files at the same time: - - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist; - cat /sys/kernel/debug/tracing/events/net/netif_rx/hist - - # event histogram - # - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] - # - - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 - { skbaddr: ffff880064505000 } hitcount: 1 len: 46 - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 - { skbaddr: ffff880064504400 } hitcount: 4 len: 184 - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 - - Totals: - Hits: 81 - Entries: 42 - Dropped: 0 - # event histogram - # - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] - # - - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 - { skbaddr: ffff880064505000 } hitcount: 1 len: 46 - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 - { skbaddr: ffff880064504400 } hitcount: 4 len: 184 - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 - - Totals: - Hits: 81 - Entries: 42 - Dropped: 0 - - And here's an example that shows how to combine histogram data from - any two events even if they don't share any 'compatible' fields - other than 'hitcount' and 'stacktrace'. These commands create a - couple of triggers named 'bar' using those fields: - - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ - /sys/kernel/debug/tracing/events/net/netif_rx/trigger - - And displaying the output of either shows some interesting if - somewhat confusing output: - - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist - - # event histogram - # - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] - # - - { stacktrace: - _do_fork+0x18e/0x330 - kernel_thread+0x29/0x30 - kthreadd+0x154/0x1b0 - ret_from_fork+0x3f/0x70 - } hitcount: 1 - { stacktrace: - netif_rx_internal+0xb2/0xd0 - netif_rx_ni+0x20/0x70 - dev_loopback_xmit+0xaa/0xd0 - ip_mc_output+0x126/0x240 - ip_local_out_sk+0x31/0x40 - igmp_send_report+0x1e9/0x230 - igmp_timer_expire+0xe9/0x120 - call_timer_fn+0x39/0xf0 - run_timer_softirq+0x1e1/0x290 - __do_softirq+0xfd/0x290 - irq_exit+0x98/0xb0 - smp_apic_timer_interrupt+0x4a/0x60 - apic_timer_interrupt+0x6d/0x80 - cpuidle_enter+0x17/0x20 - call_cpuidle+0x3b/0x60 - cpu_startup_entry+0x22d/0x310 - } hitcount: 1 - { stacktrace: - netif_rx_internal+0xb2/0xd0 - netif_rx_ni+0x20/0x70 - dev_loopback_xmit+0xaa/0xd0 - ip_mc_output+0x17f/0x240 - ip_local_out_sk+0x31/0x40 - ip_send_skb+0x1a/0x50 - udp_send_skb+0x13e/0x270 - udp_sendmsg+0x2bf/0x980 - inet_sendmsg+0x67/0xa0 - sock_sendmsg+0x38/0x50 - SYSC_sendto+0xef/0x170 - SyS_sendto+0xe/0x10 - entry_SYSCALL_64_fastpath+0x12/0x6a - } hitcount: 2 - { stacktrace: - netif_rx_internal+0xb2/0xd0 - netif_rx+0x1c/0x60 - loopback_xmit+0x6c/0xb0 - dev_hard_start_xmit+0x219/0x3a0 - __dev_queue_xmit+0x415/0x4f0 - dev_queue_xmit_sk+0x13/0x20 - ip_finish_output2+0x237/0x340 - ip_finish_output+0x113/0x1d0 - ip_output+0x66/0xc0 - ip_local_out_sk+0x31/0x40 - ip_send_skb+0x1a/0x50 - udp_send_skb+0x16d/0x270 - udp_sendmsg+0x2bf/0x980 - inet_sendmsg+0x67/0xa0 - sock_sendmsg+0x38/0x50 - ___sys_sendmsg+0x14e/0x270 - } hitcount: 76 - { stacktrace: - netif_rx_internal+0xb2/0xd0 - netif_rx+0x1c/0x60 - loopback_xmit+0x6c/0xb0 - dev_hard_start_xmit+0x219/0x3a0 - __dev_queue_xmit+0x415/0x4f0 - dev_queue_xmit_sk+0x13/0x20 - ip_finish_output2+0x237/0x340 - ip_finish_output+0x113/0x1d0 - ip_output+0x66/0xc0 - ip_local_out_sk+0x31/0x40 - ip_send_skb+0x1a/0x50 - udp_send_skb+0x16d/0x270 - udp_sendmsg+0x2bf/0x980 - inet_sendmsg+0x67/0xa0 - sock_sendmsg+0x38/0x50 - ___sys_sendmsg+0x269/0x270 - } hitcount: 77 - { stacktrace: - netif_rx_internal+0xb2/0xd0 - netif_rx+0x1c/0x60 - loopback_xmit+0x6c/0xb0 - dev_hard_start_xmit+0x219/0x3a0 - __dev_queue_xmit+0x415/0x4f0 - dev_queue_xmit_sk+0x13/0x20 - ip_finish_output2+0x237/0x340 - ip_finish_output+0x113/0x1d0 - ip_output+0x66/0xc0 - ip_local_out_sk+0x31/0x40 - ip_send_skb+0x1a/0x50 - udp_send_skb+0x16d/0x270 - udp_sendmsg+0x2bf/0x980 - inet_sendmsg+0x67/0xa0 - sock_sendmsg+0x38/0x50 - SYSC_sendto+0xef/0x170 - } hitcount: 88 - { stacktrace: - _do_fork+0x18e/0x330 - SyS_clone+0x19/0x20 - entry_SYSCALL_64_fastpath+0x12/0x6a - } hitcount: 244 - - Totals: - Hits: 489 - Entries: 7 - Dropped: 0 - -2.2 Inter-event hist triggers ------------------------------ - -Inter-event hist triggers are hist triggers that combine values from -one or more other events and create a histogram using that data. Data -from an inter-event histogram can in turn become the source for -further combined histograms, thus providing a chain of related -histograms, which is important for some applications. - -The most important example of an inter-event quantity that can be used -in this manner is latency, which is simply a difference in timestamps -between two events. Although latency is the most important -inter-event quantity, note that because the support is completely -general across the trace event subsystem, any event field can be used -in an inter-event quantity. - -An example of a histogram that combines data from other histograms -into a useful chain would be a 'wakeupswitch latency' histogram that -combines a 'wakeup latency' histogram and a 'switch latency' -histogram. - -Normally, a hist trigger specification consists of a (possibly -compound) key along with one or more numeric values, which are -continually updated sums associated with that key. A histogram -specification in this case consists of individual key and value -specifications that refer to trace event fields associated with a -single event type. - -The inter-event hist trigger extension allows fields from multiple -events to be referenced and combined into a multi-event histogram -specification. In support of this overall goal, a few enabling -features have been added to the hist trigger support: - - - In order to compute an inter-event quantity, a value from one - event needs to saved and then referenced from another event. This - requires the introduction of support for histogram 'variables'. - - - The computation of inter-event quantities and their combination - require some minimal amount of support for applying simple - expressions to variables (+ and -). - - - A histogram consisting of inter-event quantities isn't logically a - histogram on either event (so having the 'hist' file for either - event host the histogram output doesn't really make sense). To - address the idea that the histogram is associated with a - combination of events, support is added allowing the creation of - 'synthetic' events that are events derived from other events. - These synthetic events are full-fledged events just like any other - and can be used as such, as for instance to create the - 'combination' histograms mentioned previously. - - - A set of 'actions' can be associated with histogram entries - - these can be used to generate the previously mentioned synthetic - events, but can also be used for other purposes, such as for - example saving context when a 'max' latency has been hit. - - - Trace events don't have a 'timestamp' associated with them, but - there is an implicit timestamp saved along with an event in the - underlying ftrace ring buffer. This timestamp is now exposed as a - a synthetic field named 'common_timestamp' which can be used in - histograms as if it were any other event field; it isn't an actual - field in the trace format but rather is a synthesized value that - nonetheless can be used as if it were an actual field. By default - it is in units of nanoseconds; appending '.usecs' to a - common_timestamp field changes the units to microseconds. - -A note on inter-event timestamps: If common_timestamp is used in a -histogram, the trace buffer is automatically switched over to using -absolute timestamps and the "global" trace clock, in order to avoid -bogus timestamp differences with other clocks that aren't coherent -across CPUs. This can be overridden by specifying one of the other -trace clocks instead, using the "clock=XXX" hist trigger attribute, -where XXX is any of the clocks listed in the tracing/trace_clock -pseudo-file. - -These features are described in more detail in the following sections. - -2.2.1 Histogram Variables -------------------------- - -Variables are simply named locations used for saving and retrieving -values between matching events. A 'matching' event is defined as an -event that has a matching key - if a variable is saved for a histogram -entry corresponding to that key, any subsequent event with a matching -key can access that variable. - -A variable's value is normally available to any subsequent event until -it is set to something else by a subsequent event. The one exception -to that rule is that any variable used in an expression is essentially -'read-once' - once it's used by an expression in a subsequent event, -it's reset to its 'unset' state, which means it can't be used again -unless it's set again. This ensures not only that an event doesn't -use an uninitialized variable in a calculation, but that that variable -is used only once and not for any unrelated subsequent match. - -The basic syntax for saving a variable is to simply prefix a unique -variable name not corresponding to any keyword along with an '=' sign -to any event field. - -Either keys or values can be saved and retrieved in this way. This -creates a variable named 'ts0' for a histogram entry with the key -'next_pid': - - # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... >> \ - event/trigger - -The ts0 variable can be accessed by any subsequent event having the -same pid as 'next_pid'. - -Variable references are formed by prepending the variable name with -the '$' sign. Thus for example, the ts0 variable above would be -referenced as '$ts0' in expressions. - -Because 'vals=' is used, the common_timestamp variable value above -will also be summed as a normal histogram value would (though for a -timestamp it makes little sense). - -The below shows that a key value can also be saved in the same way: - - # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger - -If a variable isn't a key variable or prefixed with 'vals=', the -associated event field will be saved in a variable but won't be summed -as a value: - - # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger - -Multiple variables can be assigned at the same time. The below would -result in both ts0 and b being created as variables, with both -common_timestamp and field1 additionally being summed as values: - - # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \ - event/trigger - -Note that variable assignments can appear either preceding or -following their use. The command below behaves identically to the -command above: - - # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \ - event/trigger - -Any number of variables not bound to a 'vals=' prefix can also be -assigned by simply separating them with colons. Below is the same -thing but without the values being summed in the histogram: - - # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger - -Variables set as above can be referenced and used in expressions on -another event. - -For example, here's how a latency can be calculated: - - # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger - -In the first line above, the event's timetamp is saved into the -variable ts0. In the next line, ts0 is subtracted from the second -event's timestamp to produce the latency, which is then assigned into -yet another variable, 'wakeup_lat'. The hist trigger below in turn -makes use of the wakeup_lat variable to compute a combined latency -using the same key and variable from yet another event: - - # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger - -2.2.2 Synthetic Events ----------------------- - -Synthetic events are user-defined events generated from hist trigger -variables or fields associated with one or more other events. Their -purpose is to provide a mechanism for displaying data spanning -multiple events consistent with the existing and already familiar -usage for normal events. - -To define a synthetic event, the user writes a simple specification -consisting of the name of the new event along with one or more -variables and their types, which can be any valid field type, -separated by semicolons, to the tracing/synthetic_events file. - -For instance, the following creates a new event named 'wakeup_latency' -with 3 fields: lat, pid, and prio. Each of those fields is simply a -variable reference to a variable on another event: - - # echo 'wakeup_latency \ - u64 lat; \ - pid_t pid; \ - int prio' >> \ - /sys/kernel/debug/tracing/synthetic_events - -Reading the tracing/synthetic_events file lists all the currently -defined synthetic events, in this case the event defined above: - - # cat /sys/kernel/debug/tracing/synthetic_events - wakeup_latency u64 lat; pid_t pid; int prio - -An existing synthetic event definition can be removed by prepending -the command that defined it with a '!': - - # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \ - /sys/kernel/debug/tracing/synthetic_events - -At this point, there isn't yet an actual 'wakeup_latency' event -instantiated in the event subsytem - for this to happen, a 'hist -trigger action' needs to be instantiated and bound to actual fields -and variables defined on other events (see Section 2.2.3 below on -how that is done using hist trigger 'onmatch' action). Once that is -done, the 'wakeup_latency' synthetic event instance is created. - -A histogram can now be defined for the new synthetic event: - - # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ - /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger - -The new event is created under the tracing/events/synthetic/ directory -and looks and behaves just like any other event: - - # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency - enable filter format hist id trigger - -Like any other event, once a histogram is enabled for the event, the -output can be displayed by reading the event's 'hist' file. - -2.2.3 Hist trigger 'actions' ----------------------------- - -A hist trigger 'action' is a function that's executed whenever a -histogram entry is added or updated. - -The default 'action' if no special function is explicity specified is -as it always has been, to simply update the set of values associated -with an entry. Some applications, however, may want to perform -additional actions at that point, such as generate another event, or -compare and save a maximum. - -The following additional actions are available. To specify an action -for a given event, simply specify the action between colons in the -hist trigger specification. - - - onmatch(matching.event).(param list) - - The 'onmatch(matching.event).(params)' hist - trigger action is invoked whenever an event matches and the - histogram entry would be added or updated. It causes the named - synthetic event to be generated with the values given in the - 'param list'. The result is the generation of a synthetic event - that consists of the values contained in those variables at the - time the invoking event was hit. - - The 'param list' consists of one or more parameters which may be - either variables or fields defined on either the 'matching.event' - or the target event. The variables or fields specified in the - param list may be either fully-qualified or unqualified. If a - variable is specified as unqualified, it must be unique between - the two events. A field name used as a param can be unqualified - if it refers to the target event, but must be fully qualified if - it refers to the matching event. A fully-qualified name is of the - form 'system.event_name.$var_name' or 'system.event_name.field'. - - The 'matching.event' specification is simply the fully qualified - event name of the event that matches the target event for the - onmatch() functionality, in the form 'system.event_name'. - - Finally, the number and type of variables/fields in the 'param - list' must match the number and types of the fields in the - synthetic event being generated. - - As an example the below defines a simple synthetic event and uses - a variable defined on the sched_wakeup_new event as a parameter - when invoking the synthetic event. Here we define the synthetic - event: - - # echo 'wakeup_new_test pid_t pid' >> \ - /sys/kernel/debug/tracing/synthetic_events - - # cat /sys/kernel/debug/tracing/synthetic_events - wakeup_new_test pid_t pid - - The following hist trigger both defines the missing testpid - variable and specifies an onmatch() action that generates a - wakeup_new_test synthetic event whenever a sched_wakeup_new event - occurs, which because of the 'if comm == "cyclictest"' filter only - happens when the executable is cyclictest: - - # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ - wakeup_new_test($testpid) if comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger - - Creating and displaying a histogram based on those events is now - just a matter of using the fields and new synthetic event in the - tracing/events/synthetic directory, as usual: - - # echo 'hist:keys=pid:sort=pid' >> \ - /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger - - Running 'cyclictest' should cause wakeup_new events to generate - wakeup_new_test synthetic events which should result in histogram - output in the wakeup_new_test event's hist file: - - # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist - - A more typical usage would be to use two events to calculate a - latency. The following example uses a set of hist triggers to - produce a 'wakeup_latency' histogram: - - First, we define a 'wakeup_latency' synthetic event: - - # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \ - /sys/kernel/debug/tracing/synthetic_events - - Next, we specify that whenever we see a sched_waking event for a - cyclictest thread, save the timestamp in a 'ts0' variable: - - # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \ - if comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_waking/trigger - - Then, when the corresponding thread is actually scheduled onto the - CPU by a sched_switch event, calculate the latency and use that - along with another variable and an event field to generate a - wakeup_latency synthetic event: - - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\ - onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\ - $saved_pid,next_prio) if next_comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_switch/trigger - - We also need to create a histogram on the wakeup_latency synthetic - event in order to aggregate the generated synthetic event data: - - # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \ - /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger - - Finally, once we've run cyclictest to actually generate some - events, we can see the output by looking at the wakeup_latency - synthetic event's hist file: - - # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist - - - onmax(var).save(field,.. .) - - The 'onmax(var).save(field,...)' hist trigger action is invoked - whenever the value of 'var' associated with a histogram entry - exceeds the current maximum contained in that variable. - - The end result is that the trace event fields specified as the - onmax.save() params will be saved if 'var' exceeds the current - maximum for that hist trigger entry. This allows context from the - event that exhibited the new maximum to be saved for later - reference. When the histogram is displayed, additional fields - displaying the saved values will be printed. - - As an example the below defines a couple of hist triggers, one for - sched_waking and another for sched_switch, keyed on pid. Whenever - a sched_waking occurs, the timestamp is saved in the entry - corresponding to the current pid, and when the scheduler switches - back to that pid, the timestamp difference is calculated. If the - resulting latency, stored in wakeup_lat, exceeds the current - maximum latency, the values specified in the save() fields are - recorded: - - # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ - if comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_waking/trigger - - # echo 'hist:keys=next_pid:\ - wakeup_lat=common_timestamp.usecs-$ts0:\ - onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \ - if next_comm=="cyclictest"' >> \ - /sys/kernel/debug/tracing/events/sched/sched_switch/trigger - - When the histogram is displayed, the max value and the saved - values corresponding to the max are displayed following the rest - of the fields: - - # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist - { next_pid: 2255 } hitcount: 239 - common_timestamp-ts0: 0 - max: 27 - next_comm: cyclictest - prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 - - { next_pid: 2256 } hitcount: 2355 - common_timestamp-ts0: 0 - max: 49 next_comm: cyclictest - prev_pid: 0 prev_prio: 120 prev_comm: swapper/0 - - Totals: - Hits: 12970 - Entries: 2 - Dropped: 0 - -3. User space creating a trigger --------------------------------- - -Writing into /sys/kernel/tracing/trace_marker writes into the ftrace -ring buffer. This can also act like an event, by writing into the trigger -file located in /sys/kernel/tracing/events/ftrace/print/ - -Modifying cyclictest to write into the trace_marker file before it sleeps -and after it wakes up, something like this: - -static void traceputs(char *str) -{ - /* tracemark_fd is the trace_marker file descriptor */ - if (tracemark_fd < 0) - return; - /* write the tracemark message */ - write(tracemark_fd, str, strlen(str)); -} - -And later add something like: - - traceputs("start"); - clock_nanosleep(...); - traceputs("end"); - -We can make a histogram from this: - - # cd /sys/kernel/tracing - # echo 'latency u64 lat' > synthetic_events - # echo 'hist:keys=common_pid:ts0=common_timestamp.usecs if buf == "start"' > events/ftrace/print/trigger - # echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$ts0:onmatch(ftrace.print).latency($lat) if buf == "end"' >> events/ftrace/print/trigger - # echo 'hist:keys=lat,common_pid:sort=lat' > events/synthetic/latency/trigger - -The above created a synthetic event called "latency" and two histograms -against the trace_marker, one gets triggered when "start" is written into the -trace_marker file and the other when "end" is written. If the pids match, then -it will call the "latency" synthetic event with the calculated latency as its -parameter. Finally, a histogram is added to the latency synthetic event to -record the calculated latency along with the pid. - -Now running cyclictest with: - - # ./cyclictest -p80 -d0 -i250 -n -a -t --tracemark -b 1000 - - -p80 : run threads at priority 80 - -d0 : have all threads run at the same interval - -i250 : start the interval at 250 microseconds (all threads will do this) - -n : sleep with nanosleep - -a : affine all threads to a separate CPU - -t : one thread per available CPU - --tracemark : enable trace mark writing - -b 1000 : stop if any latency is greater than 1000 microseconds - -Note, the -b 1000 is used just to make --tracemark available. - -Then we can see the histogram created by this with: - - # cat events/synthetic/latency/hist -# event histogram -# -# trigger info: hist:keys=lat,common_pid:vals=hitcount:sort=lat:size=2048 [active] -# - -{ lat: 107, common_pid: 2039 } hitcount: 1 -{ lat: 122, common_pid: 2041 } hitcount: 1 -{ lat: 166, common_pid: 2039 } hitcount: 1 -{ lat: 174, common_pid: 2039 } hitcount: 1 -{ lat: 194, common_pid: 2041 } hitcount: 1 -{ lat: 196, common_pid: 2036 } hitcount: 1 -{ lat: 197, common_pid: 2038 } hitcount: 1 -{ lat: 198, common_pid: 2039 } hitcount: 1 -{ lat: 199, common_pid: 2039 } hitcount: 1 -{ lat: 200, common_pid: 2041 } hitcount: 1 -{ lat: 201, common_pid: 2039 } hitcount: 2 -{ lat: 202, common_pid: 2038 } hitcount: 1 -{ lat: 202, common_pid: 2043 } hitcount: 1 -{ lat: 203, common_pid: 2039 } hitcount: 1 -{ lat: 203, common_pid: 2036 } hitcount: 1 -{ lat: 203, common_pid: 2041 } hitcount: 1 -{ lat: 206, common_pid: 2038 } hitcount: 2 -{ lat: 207, common_pid: 2039 } hitcount: 1 -{ lat: 207, common_pid: 2036 } hitcount: 1 -{ lat: 208, common_pid: 2040 } hitcount: 1 -{ lat: 209, common_pid: 2043 } hitcount: 1 -{ lat: 210, common_pid: 2039 } hitcount: 1 -{ lat: 211, common_pid: 2039 } hitcount: 4 -{ lat: 212, common_pid: 2043 } hitcount: 1 -{ lat: 212, common_pid: 2039 } hitcount: 2 -{ lat: 213, common_pid: 2039 } hitcount: 1 -{ lat: 214, common_pid: 2038 } hitcount: 1 -{ lat: 214, common_pid: 2039 } hitcount: 2 -{ lat: 214, common_pid: 2042 } hitcount: 1 -{ lat: 215, common_pid: 2039 } hitcount: 1 -{ lat: 217, common_pid: 2036 } hitcount: 1 -{ lat: 217, common_pid: 2040 } hitcount: 1 -{ lat: 217, common_pid: 2039 } hitcount: 1 -{ lat: 218, common_pid: 2039 } hitcount: 6 -{ lat: 219, common_pid: 2039 } hitcount: 9 -{ lat: 220, common_pid: 2039 } hitcount: 11 -{ lat: 221, common_pid: 2039 } hitcount: 5 -{ lat: 221, common_pid: 2042 } hitcount: 1 -{ lat: 222, common_pid: 2039 } hitcount: 7 -{ lat: 223, common_pid: 2036 } hitcount: 1 -{ lat: 223, common_pid: 2039 } hitcount: 3 -{ lat: 224, common_pid: 2039 } hitcount: 4 -{ lat: 224, common_pid: 2037 } hitcount: 1 -{ lat: 224, common_pid: 2036 } hitcount: 2 -{ lat: 225, common_pid: 2039 } hitcount: 5 -{ lat: 225, common_pid: 2042 } hitcount: 1 -{ lat: 226, common_pid: 2039 } hitcount: 7 -{ lat: 226, common_pid: 2036 } hitcount: 4 -{ lat: 227, common_pid: 2039 } hitcount: 6 -{ lat: 227, common_pid: 2036 } hitcount: 12 -{ lat: 227, common_pid: 2043 } hitcount: 1 -{ lat: 228, common_pid: 2039 } hitcount: 7 -{ lat: 228, common_pid: 2036 } hitcount: 14 -{ lat: 229, common_pid: 2039 } hitcount: 9 -{ lat: 229, common_pid: 2036 } hitcount: 8 -{ lat: 229, common_pid: 2038 } hitcount: 1 -{ lat: 230, common_pid: 2039 } hitcount: 11 -{ lat: 230, common_pid: 2036 } hitcount: 6 -{ lat: 230, common_pid: 2043 } hitcount: 1 -{ lat: 230, common_pid: 2042 } hitcount: 2 -{ lat: 231, common_pid: 2041 } hitcount: 1 -{ lat: 231, common_pid: 2036 } hitcount: 6 -{ lat: 231, common_pid: 2043 } hitcount: 1 -{ lat: 231, common_pid: 2039 } hitcount: 8 -{ lat: 232, common_pid: 2037 } hitcount: 1 -{ lat: 232, common_pid: 2039 } hitcount: 6 -{ lat: 232, common_pid: 2040 } hitcount: 2 -{ lat: 232, common_pid: 2036 } hitcount: 5 -{ lat: 232, common_pid: 2043 } hitcount: 1 -{ lat: 233, common_pid: 2036 } hitcount: 5 -{ lat: 233, common_pid: 2039 } hitcount: 11 -{ lat: 234, common_pid: 2039 } hitcount: 4 -{ lat: 234, common_pid: 2038 } hitcount: 2 -{ lat: 234, common_pid: 2043 } hitcount: 2 -{ lat: 234, common_pid: 2036 } hitcount: 11 -{ lat: 234, common_pid: 2040 } hitcount: 1 -{ lat: 235, common_pid: 2037 } hitcount: 2 -{ lat: 235, common_pid: 2036 } hitcount: 8 -{ lat: 235, common_pid: 2043 } hitcount: 2 -{ lat: 235, common_pid: 2039 } hitcount: 5 -{ lat: 235, common_pid: 2042 } hitcount: 2 -{ lat: 235, common_pid: 2040 } hitcount: 4 -{ lat: 235, common_pid: 2041 } hitcount: 1 -{ lat: 236, common_pid: 2036 } hitcount: 7 -{ lat: 236, common_pid: 2037 } hitcount: 1 -{ lat: 236, common_pid: 2041 } hitcount: 5 -{ lat: 236, common_pid: 2039 } hitcount: 3 -{ lat: 236, common_pid: 2043 } hitcount: 9 -{ lat: 236, common_pid: 2040 } hitcount: 7 -{ lat: 237, common_pid: 2037 } hitcount: 1 -{ lat: 237, common_pid: 2040 } hitcount: 1 -{ lat: 237, common_pid: 2036 } hitcount: 9 -{ lat: 237, common_pid: 2039 } hitcount: 3 -{ lat: 237, common_pid: 2043 } hitcount: 8 -{ lat: 237, common_pid: 2042 } hitcount: 2 -{ lat: 237, common_pid: 2041 } hitcount: 2 -{ lat: 238, common_pid: 2043 } hitcount: 10 -{ lat: 238, common_pid: 2040 } hitcount: 1 -{ lat: 238, common_pid: 2037 } hitcount: 9 -{ lat: 238, common_pid: 2038 } hitcount: 1 -{ lat: 238, common_pid: 2039 } hitcount: 1 -{ lat: 238, common_pid: 2042 } hitcount: 3 -{ lat: 238, common_pid: 2036 } hitcount: 7 -{ lat: 239, common_pid: 2041 } hitcount: 1 -{ lat: 239, common_pid: 2043 } hitcount: 11 -{ lat: 239, common_pid: 2037 } hitcount: 11 -{ lat: 239, common_pid: 2038 } hitcount: 6 -{ lat: 239, common_pid: 2036 } hitcount: 7 -{ lat: 239, common_pid: 2040 } hitcount: 1 -{ lat: 239, common_pid: 2042 } hitcount: 9 -{ lat: 240, common_pid: 2037 } hitcount: 29 -{ lat: 240, common_pid: 2043 } hitcount: 15 -{ lat: 240, common_pid: 2040 } hitcount: 44 -{ lat: 240, common_pid: 2039 } hitcount: 1 -{ lat: 240, common_pid: 2041 } hitcount: 2 -{ lat: 240, common_pid: 2038 } hitcount: 1 -{ lat: 240, common_pid: 2036 } hitcount: 10 -{ lat: 240, common_pid: 2042 } hitcount: 13 -{ lat: 241, common_pid: 2036 } hitcount: 21 -{ lat: 241, common_pid: 2041 } hitcount: 36 -{ lat: 241, common_pid: 2037 } hitcount: 34 -{ lat: 241, common_pid: 2042 } hitcount: 14 -{ lat: 241, common_pid: 2040 } hitcount: 94 -{ lat: 241, common_pid: 2039 } hitcount: 12 -{ lat: 241, common_pid: 2038 } hitcount: 2 -{ lat: 241, common_pid: 2043 } hitcount: 28 -{ lat: 242, common_pid: 2040 } hitcount: 109 -{ lat: 242, common_pid: 2041 } hitcount: 506 -{ lat: 242, common_pid: 2039 } hitcount: 155 -{ lat: 242, common_pid: 2042 } hitcount: 21 -{ lat: 242, common_pid: 2037 } hitcount: 52 -{ lat: 242, common_pid: 2043 } hitcount: 21 -{ lat: 242, common_pid: 2036 } hitcount: 16 -{ lat: 242, common_pid: 2038 } hitcount: 156 -{ lat: 243, common_pid: 2037 } hitcount: 46 -{ lat: 243, common_pid: 2039 } hitcount: 40 -{ lat: 243, common_pid: 2042 } hitcount: 119 -{ lat: 243, common_pid: 2041 } hitcount: 611 -{ lat: 243, common_pid: 2036 } hitcount: 69 -{ lat: 243, common_pid: 2038 } hitcount: 784 -{ lat: 243, common_pid: 2040 } hitcount: 323 -{ lat: 243, common_pid: 2043 } hitcount: 14 -{ lat: 244, common_pid: 2043 } hitcount: 35 -{ lat: 244, common_pid: 2042 } hitcount: 305 -{ lat: 244, common_pid: 2039 } hitcount: 8 -{ lat: 244, common_pid: 2040 } hitcount: 4515 -{ lat: 244, common_pid: 2038 } hitcount: 371 -{ lat: 244, common_pid: 2037 } hitcount: 31 -{ lat: 244, common_pid: 2036 } hitcount: 114 -{ lat: 244, common_pid: 2041 } hitcount: 3396 -{ lat: 245, common_pid: 2036 } hitcount: 700 -{ lat: 245, common_pid: 2041 } hitcount: 2772 -{ lat: 245, common_pid: 2037 } hitcount: 268 -{ lat: 245, common_pid: 2039 } hitcount: 472 -{ lat: 245, common_pid: 2038 } hitcount: 2758 -{ lat: 245, common_pid: 2042 } hitcount: 3833 -{ lat: 245, common_pid: 2040 } hitcount: 3105 -{ lat: 245, common_pid: 2043 } hitcount: 645 -{ lat: 246, common_pid: 2038 } hitcount: 3451 -{ lat: 246, common_pid: 2041 } hitcount: 142 -{ lat: 246, common_pid: 2037 } hitcount: 5101 -{ lat: 246, common_pid: 2040 } hitcount: 68 -{ lat: 246, common_pid: 2043 } hitcount: 5099 -{ lat: 246, common_pid: 2039 } hitcount: 5608 -{ lat: 246, common_pid: 2042 } hitcount: 3723 -{ lat: 246, common_pid: 2036 } hitcount: 4738 -{ lat: 247, common_pid: 2042 } hitcount: 312 -{ lat: 247, common_pid: 2043 } hitcount: 2385 -{ lat: 247, common_pid: 2041 } hitcount: 452 -{ lat: 247, common_pid: 2038 } hitcount: 792 -{ lat: 247, common_pid: 2040 } hitcount: 78 -{ lat: 247, common_pid: 2036 } hitcount: 2375 -{ lat: 247, common_pid: 2039 } hitcount: 1834 -{ lat: 247, common_pid: 2037 } hitcount: 2655 -{ lat: 248, common_pid: 2037 } hitcount: 36 -{ lat: 248, common_pid: 2042 } hitcount: 11 -{ lat: 248, common_pid: 2038 } hitcount: 122 -{ lat: 248, common_pid: 2036 } hitcount: 135 -{ lat: 248, common_pid: 2039 } hitcount: 26 -{ lat: 248, common_pid: 2041 } hitcount: 503 -{ lat: 248, common_pid: 2043 } hitcount: 66 -{ lat: 248, common_pid: 2040 } hitcount: 46 -{ lat: 249, common_pid: 2037 } hitcount: 29 -{ lat: 249, common_pid: 2038 } hitcount: 1 -{ lat: 249, common_pid: 2043 } hitcount: 29 -{ lat: 249, common_pid: 2039 } hitcount: 8 -{ lat: 249, common_pid: 2042 } hitcount: 56 -{ lat: 249, common_pid: 2040 } hitcount: 27 -{ lat: 249, common_pid: 2041 } hitcount: 11 -{ lat: 249, common_pid: 2036 } hitcount: 27 -{ lat: 250, common_pid: 2038 } hitcount: 1 -{ lat: 250, common_pid: 2036 } hitcount: 30 -{ lat: 250, common_pid: 2040 } hitcount: 19 -{ lat: 250, common_pid: 2043 } hitcount: 22 -{ lat: 250, common_pid: 2042 } hitcount: 20 -{ lat: 250, common_pid: 2041 } hitcount: 1 -{ lat: 250, common_pid: 2039 } hitcount: 6 -{ lat: 250, common_pid: 2037 } hitcount: 48 -{ lat: 251, common_pid: 2037 } hitcount: 43 -{ lat: 251, common_pid: 2039 } hitcount: 1 -{ lat: 251, common_pid: 2036 } hitcount: 12 -{ lat: 251, common_pid: 2042 } hitcount: 2 -{ lat: 251, common_pid: 2041 } hitcount: 1 -{ lat: 251, common_pid: 2043 } hitcount: 15 -{ lat: 251, common_pid: 2040 } hitcount: 3 -{ lat: 252, common_pid: 2040 } hitcount: 1 -{ lat: 252, common_pid: 2036 } hitcount: 12 -{ lat: 252, common_pid: 2037 } hitcount: 21 -{ lat: 252, common_pid: 2043 } hitcount: 14 -{ lat: 253, common_pid: 2037 } hitcount: 21 -{ lat: 253, common_pid: 2039 } hitcount: 2 -{ lat: 253, common_pid: 2036 } hitcount: 9 -{ lat: 253, common_pid: 2043 } hitcount: 6 -{ lat: 253, common_pid: 2040 } hitcount: 1 -{ lat: 254, common_pid: 2036 } hitcount: 8 -{ lat: 254, common_pid: 2043 } hitcount: 3 -{ lat: 254, common_pid: 2041 } hitcount: 1 -{ lat: 254, common_pid: 2042 } hitcount: 1 -{ lat: 254, common_pid: 2039 } hitcount: 1 -{ lat: 254, common_pid: 2037 } hitcount: 12 -{ lat: 255, common_pid: 2043 } hitcount: 1 -{ lat: 255, common_pid: 2037 } hitcount: 2 -{ lat: 255, common_pid: 2036 } hitcount: 2 -{ lat: 255, common_pid: 2039 } hitcount: 8 -{ lat: 256, common_pid: 2043 } hitcount: 1 -{ lat: 256, common_pid: 2036 } hitcount: 4 -{ lat: 256, common_pid: 2039 } hitcount: 6 -{ lat: 257, common_pid: 2039 } hitcount: 5 -{ lat: 257, common_pid: 2036 } hitcount: 4 -{ lat: 258, common_pid: 2039 } hitcount: 5 -{ lat: 258, common_pid: 2036 } hitcount: 2 -{ lat: 259, common_pid: 2036 } hitcount: 7 -{ lat: 259, common_pid: 2039 } hitcount: 7 -{ lat: 260, common_pid: 2036 } hitcount: 8 -{ lat: 260, common_pid: 2039 } hitcount: 6 -{ lat: 261, common_pid: 2036 } hitcount: 5 -{ lat: 261, common_pid: 2039 } hitcount: 7 -{ lat: 262, common_pid: 2039 } hitcount: 5 -{ lat: 262, common_pid: 2036 } hitcount: 5 -{ lat: 263, common_pid: 2039 } hitcount: 7 -{ lat: 263, common_pid: 2036 } hitcount: 7 -{ lat: 264, common_pid: 2039 } hitcount: 9 -{ lat: 264, common_pid: 2036 } hitcount: 9 -{ lat: 265, common_pid: 2036 } hitcount: 5 -{ lat: 265, common_pid: 2039 } hitcount: 1 -{ lat: 266, common_pid: 2036 } hitcount: 1 -{ lat: 266, common_pid: 2039 } hitcount: 3 -{ lat: 267, common_pid: 2036 } hitcount: 1 -{ lat: 267, common_pid: 2039 } hitcount: 3 -{ lat: 268, common_pid: 2036 } hitcount: 1 -{ lat: 268, common_pid: 2039 } hitcount: 6 -{ lat: 269, common_pid: 2036 } hitcount: 1 -{ lat: 269, common_pid: 2043 } hitcount: 1 -{ lat: 269, common_pid: 2039 } hitcount: 2 -{ lat: 270, common_pid: 2040 } hitcount: 1 -{ lat: 270, common_pid: 2039 } hitcount: 6 -{ lat: 271, common_pid: 2041 } hitcount: 1 -{ lat: 271, common_pid: 2039 } hitcount: 5 -{ lat: 272, common_pid: 2039 } hitcount: 10 -{ lat: 273, common_pid: 2039 } hitcount: 8 -{ lat: 274, common_pid: 2039 } hitcount: 2 -{ lat: 275, common_pid: 2039 } hitcount: 1 -{ lat: 276, common_pid: 2039 } hitcount: 2 -{ lat: 276, common_pid: 2037 } hitcount: 1 -{ lat: 276, common_pid: 2038 } hitcount: 1 -{ lat: 277, common_pid: 2039 } hitcount: 1 -{ lat: 277, common_pid: 2042 } hitcount: 1 -{ lat: 278, common_pid: 2039 } hitcount: 1 -{ lat: 279, common_pid: 2039 } hitcount: 4 -{ lat: 279, common_pid: 2043 } hitcount: 1 -{ lat: 280, common_pid: 2039 } hitcount: 3 -{ lat: 283, common_pid: 2036 } hitcount: 2 -{ lat: 284, common_pid: 2039 } hitcount: 1 -{ lat: 284, common_pid: 2043 } hitcount: 1 -{ lat: 288, common_pid: 2039 } hitcount: 1 -{ lat: 289, common_pid: 2039 } hitcount: 1 -{ lat: 300, common_pid: 2039 } hitcount: 1 -{ lat: 384, common_pid: 2039 } hitcount: 1 - -Totals: - Hits: 67625 - Entries: 278 - Dropped: 0 - -Note, the writes are around the sleep, so ideally they will all be of 250 -microseconds. If you are wondering how there are several that are under -250 microseconds, that is because the way cyclictest works, is if one -iteration comes in late, the next one will set the timer to wake up less that -250. That is, if an iteration came in 50 microseconds late, the next wake up -will be at 200 microseconds. - -But this could easily be done in userspace. To make this even more -interesting, we can mix the histogram between events that happened in the -kernel with trace_marker. - - # cd /sys/kernel/tracing - # echo 'latency u64 lat' > synthetic_events - # echo 'hist:keys=pid:ts0=common_timestamp.usecs' > events/sched/sched_waking/trigger - # echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).latency($lat) if buf == "end"' > events/ftrace/print/trigger - # echo 'hist:keys=lat,common_pid:sort=lat' > events/synthetic/latency/trigger - -The difference this time is that instead of using the trace_marker to start -the latency, the sched_waking event is used, matching the common_pid for the -trace_marker write with the pid that is being woken by sched_waking. - -After running cyclictest again with the same parameters, we now have: - - # cat events/synthetic/latency/hist -# event histogram -# -# trigger info: hist:keys=lat,common_pid:vals=hitcount:sort=lat:size=2048 [active] -# - -{ lat: 7, common_pid: 2302 } hitcount: 640 -{ lat: 7, common_pid: 2299 } hitcount: 42 -{ lat: 7, common_pid: 2303 } hitcount: 18 -{ lat: 7, common_pid: 2305 } hitcount: 166 -{ lat: 7, common_pid: 2306 } hitcount: 1 -{ lat: 7, common_pid: 2301 } hitcount: 91 -{ lat: 7, common_pid: 2300 } hitcount: 17 -{ lat: 8, common_pid: 2303 } hitcount: 8296 -{ lat: 8, common_pid: 2304 } hitcount: 6864 -{ lat: 8, common_pid: 2305 } hitcount: 9464 -{ lat: 8, common_pid: 2301 } hitcount: 9213 -{ lat: 8, common_pid: 2306 } hitcount: 6246 -{ lat: 8, common_pid: 2302 } hitcount: 8797 -{ lat: 8, common_pid: 2299 } hitcount: 8771 -{ lat: 8, common_pid: 2300 } hitcount: 8119 -{ lat: 9, common_pid: 2305 } hitcount: 1519 -{ lat: 9, common_pid: 2299 } hitcount: 2346 -{ lat: 9, common_pid: 2303 } hitcount: 2841 -{ lat: 9, common_pid: 2301 } hitcount: 1846 -{ lat: 9, common_pid: 2304 } hitcount: 3861 -{ lat: 9, common_pid: 2302 } hitcount: 1210 -{ lat: 9, common_pid: 2300 } hitcount: 2762 -{ lat: 9, common_pid: 2306 } hitcount: 4247 -{ lat: 10, common_pid: 2299 } hitcount: 16 -{ lat: 10, common_pid: 2306 } hitcount: 333 -{ lat: 10, common_pid: 2303 } hitcount: 16 -{ lat: 10, common_pid: 2304 } hitcount: 168 -{ lat: 10, common_pid: 2302 } hitcount: 240 -{ lat: 10, common_pid: 2301 } hitcount: 28 -{ lat: 10, common_pid: 2300 } hitcount: 95 -{ lat: 10, common_pid: 2305 } hitcount: 18 -{ lat: 11, common_pid: 2303 } hitcount: 5 -{ lat: 11, common_pid: 2305 } hitcount: 8 -{ lat: 11, common_pid: 2306 } hitcount: 221 -{ lat: 11, common_pid: 2302 } hitcount: 76 -{ lat: 11, common_pid: 2304 } hitcount: 26 -{ lat: 11, common_pid: 2300 } hitcount: 125 -{ lat: 11, common_pid: 2299 } hitcount: 2 -{ lat: 12, common_pid: 2305 } hitcount: 3 -{ lat: 12, common_pid: 2300 } hitcount: 6 -{ lat: 12, common_pid: 2306 } hitcount: 90 -{ lat: 12, common_pid: 2302 } hitcount: 4 -{ lat: 12, common_pid: 2303 } hitcount: 1 -{ lat: 12, common_pid: 2304 } hitcount: 122 -{ lat: 13, common_pid: 2300 } hitcount: 12 -{ lat: 13, common_pid: 2301 } hitcount: 1 -{ lat: 13, common_pid: 2306 } hitcount: 32 -{ lat: 13, common_pid: 2302 } hitcount: 5 -{ lat: 13, common_pid: 2305 } hitcount: 1 -{ lat: 13, common_pid: 2303 } hitcount: 1 -{ lat: 13, common_pid: 2304 } hitcount: 61 -{ lat: 14, common_pid: 2303 } hitcount: 4 -{ lat: 14, common_pid: 2306 } hitcount: 5 -{ lat: 14, common_pid: 2305 } hitcount: 4 -{ lat: 14, common_pid: 2304 } hitcount: 62 -{ lat: 14, common_pid: 2302 } hitcount: 19 -{ lat: 14, common_pid: 2300 } hitcount: 33 -{ lat: 14, common_pid: 2299 } hitcount: 1 -{ lat: 14, common_pid: 2301 } hitcount: 4 -{ lat: 15, common_pid: 2305 } hitcount: 1 -{ lat: 15, common_pid: 2302 } hitcount: 25 -{ lat: 15, common_pid: 2300 } hitcount: 11 -{ lat: 15, common_pid: 2299 } hitcount: 5 -{ lat: 15, common_pid: 2301 } hitcount: 1 -{ lat: 15, common_pid: 2304 } hitcount: 8 -{ lat: 15, common_pid: 2303 } hitcount: 1 -{ lat: 15, common_pid: 2306 } hitcount: 6 -{ lat: 16, common_pid: 2302 } hitcount: 31 -{ lat: 16, common_pid: 2306 } hitcount: 3 -{ lat: 16, common_pid: 2300 } hitcount: 5 -{ lat: 17, common_pid: 2302 } hitcount: 6 -{ lat: 17, common_pid: 2303 } hitcount: 1 -{ lat: 18, common_pid: 2304 } hitcount: 1 -{ lat: 18, common_pid: 2302 } hitcount: 8 -{ lat: 18, common_pid: 2299 } hitcount: 1 -{ lat: 18, common_pid: 2301 } hitcount: 1 -{ lat: 19, common_pid: 2303 } hitcount: 4 -{ lat: 19, common_pid: 2304 } hitcount: 5 -{ lat: 19, common_pid: 2302 } hitcount: 4 -{ lat: 19, common_pid: 2299 } hitcount: 3 -{ lat: 19, common_pid: 2306 } hitcount: 1 -{ lat: 19, common_pid: 2300 } hitcount: 4 -{ lat: 19, common_pid: 2305 } hitcount: 5 -{ lat: 20, common_pid: 2299 } hitcount: 2 -{ lat: 20, common_pid: 2302 } hitcount: 3 -{ lat: 20, common_pid: 2305 } hitcount: 1 -{ lat: 20, common_pid: 2300 } hitcount: 2 -{ lat: 20, common_pid: 2301 } hitcount: 2 -{ lat: 20, common_pid: 2303 } hitcount: 3 -{ lat: 21, common_pid: 2305 } hitcount: 1 -{ lat: 21, common_pid: 2299 } hitcount: 5 -{ lat: 21, common_pid: 2303 } hitcount: 4 -{ lat: 21, common_pid: 2302 } hitcount: 7 -{ lat: 21, common_pid: 2300 } hitcount: 1 -{ lat: 21, common_pid: 2301 } hitcount: 5 -{ lat: 21, common_pid: 2304 } hitcount: 2 -{ lat: 22, common_pid: 2302 } hitcount: 5 -{ lat: 22, common_pid: 2303 } hitcount: 1 -{ lat: 22, common_pid: 2306 } hitcount: 3 -{ lat: 22, common_pid: 2301 } hitcount: 2 -{ lat: 22, common_pid: 2300 } hitcount: 1 -{ lat: 22, common_pid: 2299 } hitcount: 1 -{ lat: 22, common_pid: 2305 } hitcount: 1 -{ lat: 22, common_pid: 2304 } hitcount: 1 -{ lat: 23, common_pid: 2299 } hitcount: 1 -{ lat: 23, common_pid: 2306 } hitcount: 2 -{ lat: 23, common_pid: 2302 } hitcount: 6 -{ lat: 24, common_pid: 2302 } hitcount: 3 -{ lat: 24, common_pid: 2300 } hitcount: 1 -{ lat: 24, common_pid: 2306 } hitcount: 2 -{ lat: 24, common_pid: 2305 } hitcount: 1 -{ lat: 24, common_pid: 2299 } hitcount: 1 -{ lat: 25, common_pid: 2300 } hitcount: 1 -{ lat: 25, common_pid: 2302 } hitcount: 4 -{ lat: 26, common_pid: 2302 } hitcount: 2 -{ lat: 27, common_pid: 2305 } hitcount: 1 -{ lat: 27, common_pid: 2300 } hitcount: 1 -{ lat: 27, common_pid: 2302 } hitcount: 3 -{ lat: 28, common_pid: 2306 } hitcount: 1 -{ lat: 28, common_pid: 2302 } hitcount: 4 -{ lat: 29, common_pid: 2302 } hitcount: 1 -{ lat: 29, common_pid: 2300 } hitcount: 2 -{ lat: 29, common_pid: 2306 } hitcount: 1 -{ lat: 29, common_pid: 2304 } hitcount: 1 -{ lat: 30, common_pid: 2302 } hitcount: 4 -{ lat: 31, common_pid: 2302 } hitcount: 6 -{ lat: 32, common_pid: 2302 } hitcount: 1 -{ lat: 33, common_pid: 2299 } hitcount: 1 -{ lat: 33, common_pid: 2302 } hitcount: 3 -{ lat: 34, common_pid: 2302 } hitcount: 2 -{ lat: 35, common_pid: 2302 } hitcount: 1 -{ lat: 35, common_pid: 2304 } hitcount: 1 -{ lat: 36, common_pid: 2302 } hitcount: 4 -{ lat: 37, common_pid: 2302 } hitcount: 6 -{ lat: 38, common_pid: 2302 } hitcount: 2 -{ lat: 39, common_pid: 2302 } hitcount: 2 -{ lat: 39, common_pid: 2304 } hitcount: 1 -{ lat: 40, common_pid: 2304 } hitcount: 2 -{ lat: 40, common_pid: 2302 } hitcount: 5 -{ lat: 41, common_pid: 2304 } hitcount: 1 -{ lat: 41, common_pid: 2302 } hitcount: 8 -{ lat: 42, common_pid: 2302 } hitcount: 6 -{ lat: 42, common_pid: 2304 } hitcount: 1 -{ lat: 43, common_pid: 2302 } hitcount: 3 -{ lat: 43, common_pid: 2304 } hitcount: 4 -{ lat: 44, common_pid: 2302 } hitcount: 6 -{ lat: 45, common_pid: 2302 } hitcount: 5 -{ lat: 46, common_pid: 2302 } hitcount: 5 -{ lat: 47, common_pid: 2302 } hitcount: 7 -{ lat: 48, common_pid: 2301 } hitcount: 1 -{ lat: 48, common_pid: 2302 } hitcount: 9 -{ lat: 49, common_pid: 2302 } hitcount: 3 -{ lat: 50, common_pid: 2302 } hitcount: 1 -{ lat: 50, common_pid: 2301 } hitcount: 1 -{ lat: 51, common_pid: 2302 } hitcount: 2 -{ lat: 51, common_pid: 2301 } hitcount: 1 -{ lat: 61, common_pid: 2302 } hitcount: 1 -{ lat: 110, common_pid: 2302 } hitcount: 1 - -Totals: - Hits: 89565 - Entries: 158 - Dropped: 0 - -This doesn't tell us any information about how late cyclictest may have -woken up, but it does show us a nice histogram of how long it took from -the time that cyclictest was woken to the time it made it into user space. diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index b58c10b04e27..306997941ba1 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -18,6 +18,7 @@ Linux Tracing Technologies events-nmi events-msr mmiotrace + histogram hwlat_detector intel_th stm diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dcc0166d1997..2bd4a9181a0f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -605,7 +605,7 @@ config HIST_TRIGGERS Inter-event tracing of quantities such as latencies is also supported using hist triggers under this option. - See Documentation/trace/histogram.txt. + See Documentation/trace/histogram.rst. If in doubt, say N. config MMIOTRACE_TEST -- cgit From d9c0ffcabd6aae7ff1e34e8078354c13bb9f1183 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jun 2018 18:29:41 +0200 Subject: sched/nohz: Skip remote tick on idle task entirely Some people have reported that the warning in sched_tick_remote() occasionally triggers, especially in favour of some RCU-Torture pressure: WARNING: CPU: 11 PID: 906 at kernel/sched/core.c:3138 sched_tick_remote+0xb6/0xc0 Modules linked in: CPU: 11 PID: 906 Comm: kworker/u32:3 Not tainted 4.18.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Workqueue: events_unbound sched_tick_remote RIP: 0010:sched_tick_remote+0xb6/0xc0 Code: e8 0f 06 b8 00 c6 03 00 fb eb 9d 8b 43 04 85 c0 75 8d 48 8b 83 e0 0a 00 00 48 85 c0 75 81 eb 88 48 89 df e8 bc fe ff ff eb aa <0f> 0b eb +c5 66 0f 1f 44 00 00 bf 17 00 00 00 e8 b6 2e fe ff 0f b6 Call Trace: process_one_work+0x1df/0x3b0 worker_thread+0x44/0x3d0 kthread+0xf3/0x130 ? set_worker_desc+0xb0/0xb0 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x35/0x40 This happens when the remote tick applies on an idle task. Usually the idle_cpu() check avoids that, but it is performed before we lock the runqueue and it is therefore racy. It was intended to be that way in order to prevent from useless runqueue locks since idle task tick callback is a no-op. Now if the racy check slips out of our hands and we end up remotely ticking an idle task, the empty task_tick_idle() is harmless. Still it won't pass the WARN_ON_ONCE() test that ensures rq_clock_task() is not too far from curr->se.exec_start because update_curr_idle() doesn't update the exec_start value like other scheduler policies. Hence the reported false positive. So let's have another check, while the rq is locked, to make sure we don't remote tick on an idle task. The lockless idle_cpu() still applies to avoid unecessary rq lock contention. Reported-by: Jacek Tomaka Reported-by: Paul E. McKenney Reported-by: Anna-Maria Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1530203381-31234-1-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..22fce36426c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3113,7 +3113,9 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); + struct task_struct *curr; struct rq_flags rf; + u64 delta; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3122,24 +3124,28 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { - struct task_struct *curr; - u64 delta; + if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + goto out_requeue; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - curr = rq->curr; - delta = rq_clock_task(rq) - curr->se.exec_start; + rq_lock_irq(rq, &rf); + curr = rq->curr; + if (is_idle_task(curr)) + goto out_unlock; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); - curr->sched_class->task_tick(rq, curr, 0); - rq_unlock_irq(rq, &rf); - } + update_rq_clock(rq); + delta = rq_clock_task(rq) - curr->se.exec_start; + + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + +out_unlock: + rq_unlock_irq(rq, &rf); +out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough -- cgit From 296b2ffe7fa9ed756c41415c6b1512bc4ad687b1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Jun 2018 15:53:22 +0200 Subject: sched/rt: Fix call to cpufreq_update_util() With commit: 8f111bc357aa ("cpufreq/schedutil: Rewrite CPUFREQ_RT support") the schedutil governor uses rq->rt.rt_nr_running to detect whether an RT task is currently running on the CPU and to set frequency to max if necessary. cpufreq_update_util() is called in enqueue/dequeue_top_rt_rq() but rq->rt.rt_nr_running has not been updated yet when dequeue_top_rt_rq() is called so schedutil still considers that an RT task is running when the last task is dequeued. The update of rq->rt.rt_nr_running happens later in dequeue_rt_stack(). In fact, we can take advantage of the sequence that the dequeue then re-enqueue rt entities when a rt task is enqueued or dequeued; As a result enqueue_top_rt_rq() is always called when a task is enqueued or dequeued and also when groups are throttled or unthrottled. The only place that not use enqueue_top_rt_rq() is when root rt_rq is throttled. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: juri.lelli@redhat.com Cc: patrick.bellasi@arm.com Cc: viresh.kumar@linaro.org Fixes: 8f111bc357aa ('cpufreq/schedutil: Rewrite CPUFREQ_RT support') Link: http://lkml.kernel.org/r/1530021202-21695-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/rt.c | 16 ++++++++++------ kernel/sched/sched.h | 5 +++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3cde46483f0a..c907fde01eaa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - if (rq->rt.rt_nr_running) + if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 47556b0c9a95..572567078b60 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (!rt_se) + if (!rt_se) { dequeue_top_rt_rq(rt_rq); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); + } else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se, 0); } @@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, 0); } static void @@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq->rt_queued) return; - if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + + if (rt_rq_throttled(rt_rq)) return; - add_nr_running(rq, rt_rq->rt_nr_running); - rt_rq->rt_queued = 1; + if (rt_rq->rt_nr_running) { + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; + } /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6601baf2361c..27ddec334601 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -609,6 +609,11 @@ struct rt_rq { #endif }; +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit From 512ac999d2755d2b7109e996a76b6fb8b888631d Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:33 +0800 Subject: sched/fair: Fix bandwidth timer clock drift condition I noticed that cgroup task groups constantly get throttled even if they have low CPU usage, this causes some jitters on the response time to some of our business containers when enabling CPU quotas. It's very simple to reproduce: mkdir /sys/fs/cgroup/cpu/test cd /sys/fs/cgroup/cpu/test echo 100000 > cpu.cfs_quota_us echo $$ > tasks then repeat: cat cpu.stat | grep nr_throttled # nr_throttled will increase steadily After some analysis, we found that cfs_rq::runtime_remaining will be cleared by expire_cfs_rq_runtime() due to two equal but stale "cfs_{b|q}->runtime_expires" after period timer is re-armed. The current condition to judge clock drift in expire_cfs_rq_runtime() is wrong, the two runtime_expires are actually the same when clock drift happens, so this condtion can never hit. The orginal design was correctly done by this commit: a9cf55b28610 ("sched: Expire invalid runtime") ... but was changed to be the current implementation due to its locking bug. This patch introduces another way, it adds a new field in both structures cfs_rq and cfs_bandwidth to record the expiration update sequence, and uses them to figure out if clock drift happens (true if they are equal). Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period") Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++------ kernel/sched/sched.h | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..791707c56886 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4628,6 +4630,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 27ddec334601..c7742dcc136c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -334,9 +334,10 @@ struct cfs_bandwidth { u64 runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle; - int period_active; + short idle; + short period_active; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -551,6 +552,7 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; -- cgit From f1d1be8aee6c461652aea8f58bedebaa73d7f4d3 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:34 +0800 Subject: sched/fair: Advance global expiration when period timer is restarted When period gets restarted after some idle time, start_cfs_bandwidth() doesn't update the expiration information, expire_cfs_rq_runtime() will see cfs_rq->runtime_expires smaller than rq clock and go to the clock drift logic, wasting needless CPU cycles on the scheduler hot path. Update the global expiration in start_cfs_bandwidth() to avoid frequent expire_cfs_rq_runtime() calls once a new period begins. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180620101834.24455-2-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 791707c56886..840b92ee6f89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5204,13 +5204,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + u64 overrun; + lockdep_assert_held(&cfs_b->lock); - if (!cfs_b->period_active) { - cfs_b->period_active = 1; - hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); - } + if (cfs_b->period_active) + return; + + cfs_b->period_active = 1; + overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -- cgit From 3482d98bbc730758b63a5d1cf41d05ea17481412 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 14 Jun 2018 12:33:00 +0200 Subject: sched/util_est: Fix util_est_dequeue() for throttled cfs_rq When a cfs_rq is throttled, parent cfs_rq->nr_running is decreased and everything happens at cfs_rq level. Currently util_est stays unchanged in such case and it keeps accounting the utilization of throttled tasks. This can somewhat make sense as we don't dequeue tasks but only throttled cfs_rq. If a task of another group is enqueued/dequeued and root cfs_rq becomes idle during the dequeue, util_est will be cleared whereas it was accounting util_est of throttled tasks before. So the behavior of util_est is not always the same regarding throttled tasks and depends of side activity. Furthermore, util_est will not be updated when the cfs_rq is unthrottled as everything happens at cfs_rq level. Main results is that util_est will stay null whereas we now have running tasks. We have to wait for the next dequeue/enqueue of the previously throttled tasks to get an up to date util_est. Remove the assumption that cfs_rq's estimated utilization of a CPU is 0 if there is no running task so the util_est of a task remains until the latter is dequeued even if its cfs_rq has been throttled. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Patrick Bellasi Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT") Link: http://lkml.kernel.org/r/1528972380-16268-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 840b92ee6f89..2f0a0be4d344 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!sched_feat(UTIL_EST)) return; - /* - * Update root cfs_rq's estimated utilization - * - * If *p is the last task then the root cfs_rq's estimated utilization - * of a CPU is 0 by definition. - */ - ue.enqueued = 0; - if (cfs_rq->nr_running) { - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + /* Update root cfs_rq's estimated utilization */ + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* -- cgit From 1cef1150ef40ec52f507436a14230cbc2623299c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 11:45:49 +0200 Subject: kthread, sched/core: Fix kthread_parkme() (again...) Gaurav reports that commit: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") isn't working for him. Because of the following race: > controller Thread CPUHP Thread > takedown_cpu > kthread_park > kthread_parkme > Set KTHREAD_SHOULD_PARK > smpboot_thread_fn > set Task interruptible > > > wake_up_process > if (!(p->state & state)) > goto out; > > Kthread_parkme > SET TASK_PARKED > schedule > raw_spin_lock(&rq->lock) > ttwu_remote > waiting for __task_rq_lock > context_switch > > finish_lock_switch > > > > Case TASK_PARKED > kthread_park_complete > > > SET Running Furthermore, Oleg noticed that the whole scheduler TASK_PARKED handling is buggered because the TASK_DEAD thing is done with preemption disabled, the current code can still complete early on preemption :/ So basically revert that earlier fix and go with a variant of the alternative mentioned in the commit. Promote TASK_PARKED to special state to avoid the store-store issue on task->state leading to the WARN in kthread_unpark() -> __kthread_bind(). But in addition, add wait_task_inactive() to kthread_park() to ensure the task really is PARKED when we return from kthread_park(). This avoids the whole kthread still gets migrated nonsense -- although it would be really good to get this done differently. Reported-by: Gaurav Kohli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") Signed-off-by: Ingo Molnar --- include/linux/kthread.h | 1 - include/linux/sched.h | 2 +- kernel/kthread.c | 30 ++++++++++++++++++++++++------ kernel/sched/core.c | 31 +++++++++++-------------------- 4 files changed, 36 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2803264c512f..c1961761311d 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_park_complete(struct task_struct *k); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9256118bd40c..43731fe51c97 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,7 +118,7 @@ struct task_group; * the comment with set_special_state(). */ #define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD)) + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #define __set_current_state(state_value) \ do { \ diff --git a/kernel/kthread.c b/kernel/kthread.c index 481951bf091d..750cb8082694 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { for (;;) { - set_current_state(TASK_PARKED); + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + + complete_all(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -191,11 +202,6 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -void kthread_park_complete(struct task_struct *k) -{ - complete_all(&to_kthread(k)->parked); -} - static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k) reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. + */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k) set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); + /* + * Wait for __kthread_parkme() to complete(), this means we + * _will_ have TASK_PARKED and are about to call schedule(). + */ wait_for_completion(&kthread->parked); + /* + * Now wait for that schedule() to complete and the task to + * get scheduled out. + */ + WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22fce36426c0..fe365c9a08e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,7 +7,6 @@ */ #include "sched.h" -#include #include #include @@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { - switch (prev_state) { - case TASK_DEAD: - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - put_task_struct(prev); - break; + /* Task is done with its stack. */ + put_task_stack(prev); - case TASK_PARKED: - kthread_park_complete(prev); - break; - } + put_task_struct(prev); } tick_nohz_task_switch(); -- cgit From 9cf57731b63e37ed995b46690adc604891a9a28f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 10:52:03 +0200 Subject: watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work Oleg suggested to replace the "watchdog/%u" threads with cpu_stop_work. That removes one thread per CPU while at the same time fixes softlockup vs SCHED_DEADLINE. But more importantly, it does away with the single smpboot_update_cpumask_percpu_thread() user, which allows cleanups/shrinkage of the smpboot interface. Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/cpuhotplug.h | 1 + include/linux/nmi.h | 5 ++ kernel/cpu.c | 5 ++ kernel/watchdog.c | 137 ++++++++++++++++++++------------------------- 4 files changed, 71 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8796ba387152..4cf06a64bc02 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -164,6 +164,7 @@ enum cpuhp_state { CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index b8d868d23e79..80664bbeca43 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -33,10 +33,15 @@ extern int sysctl_hardlockup_all_cpu_backtrace; #define sysctl_hardlockup_all_cpu_backtrace 0 #endif /* !CONFIG_SMP */ +extern int lockup_detector_online_cpu(unsigned int cpu); +extern int lockup_detector_offline_cpu(unsigned int cpu); + #else /* CONFIG_LOCKUP_DETECTOR */ static inline void lockup_detector_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } static inline void lockup_detector_cleanup(void) { } +#define lockup_detector_online_cpu NULL +#define lockup_detector_offline_cpu NULL #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..191097c45fb1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_AP_WATCHDOG_ONLINE] = { + .name = "lockup_detector:online", + .startup.single = lockup_detector_online_cpu, + .teardown.single = lockup_detector_offline_cpu, + }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 576d18045811..b81f777838d5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -18,18 +18,14 @@ #include #include #include -#include -#include -#include #include -#include #include #include #include +#include #include #include -#include static DEFINE_MUTEX(watchdog_mutex); @@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -static bool softlockup_threads_initialized __read_mostly; +static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); @@ -335,6 +330,25 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every sample_period seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static int softlockup_fn(void *data) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); + + return 0; +} + +static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -350,7 +364,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - wake_up_process(__this_cpu_read(softlockup_watchdog)); + stop_one_cpu_nowait(smp_processor_id(), + softlockup_fn, NULL, + this_cpu_ptr(&softlockup_stop_work)); /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -448,17 +464,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_set_prio(unsigned int policy, unsigned int prio) -{ - struct sched_param param = { .sched_priority = prio }; - - sched_setscheduler(current, policy, ¶m); -} - static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + WARN_ON_ONCE(cpu != smp_processor_id()); + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -473,15 +484,14 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); - - watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - watchdog_set_prio(SCHED_NORMAL, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + /* * Disable the perf event first. That prevents that a large delay * between disabling the timer and disabling the perf event causes @@ -491,77 +501,63 @@ static void watchdog_disable(unsigned int cpu) hrtimer_cancel(hrtimer); } -static void watchdog_cleanup(unsigned int cpu, bool online) +static int softlockup_stop_fn(void *data) { - watchdog_disable(cpu); + watchdog_disable(smp_processor_id()); + return 0; } -static int watchdog_should_run(unsigned int cpu) +static void softlockup_stop_all(void) { - return __this_cpu_read(hrtimer_interrupts) != - __this_cpu_read(soft_lockup_hrtimer_cnt); + int cpu; + + if (!softlockup_initialized) + return; + + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); + + cpumask_clear(&watchdog_allowed_mask); } -/* - * The watchdog thread function - touches the timestamp. - * - * It only runs once every sample_period seconds (4 seconds by - * default) to reset the softlockup timestamp. If this gets delayed - * for more than 2*watchdog_thresh seconds then the debug-printout - * triggers in watchdog_timer_fn(). - */ -static void watchdog(unsigned int cpu) +static int softlockup_start_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); - __touch_watchdog(); + watchdog_enable(smp_processor_id()); + return 0; } -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .cleanup = watchdog_cleanup, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - -static void softlockup_update_smpboot_threads(void) +static void softlockup_start_all(void) { - lockdep_assert_held(&watchdog_mutex); - - if (!softlockup_threads_initialized) - return; + int cpu; - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_allowed_mask); + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); } -/* Temporarily park all watchdog threads */ -static void softlockup_park_all_threads(void) +int lockup_detector_online_cpu(unsigned int cpu) { - cpumask_clear(&watchdog_allowed_mask); - softlockup_update_smpboot_threads(); + watchdog_enable(cpu); + return 0; } -/* Unpark enabled threads */ -static void softlockup_unpark_threads(void) +int lockup_detector_offline_cpu(unsigned int cpu) { - cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); - softlockup_update_smpboot_threads(); + watchdog_disable(cpu); + return 0; } static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); - softlockup_park_all_threads(); + + softlockup_stop_all(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) - softlockup_unpark_threads(); + softlockup_start_all(); + watchdog_nmi_start(); cpus_read_unlock(); /* @@ -580,8 +576,6 @@ static void lockup_detector_reconfigure(void) */ static __init void lockup_detector_setup(void) { - int ret; - /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. @@ -592,24 +586,13 @@ static __init void lockup_detector_setup(void) !(watchdog_enabled && watchdog_thresh)) return; - ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_allowed_mask); - if (ret) { - pr_err("Failed to initialize soft lockup detector threads\n"); - return; - } - mutex_lock(&watchdog_mutex); - softlockup_threads_initialized = true; lockup_detector_reconfigure(); + softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static inline int watchdog_park_threads(void) { return 0; } -static inline void watchdog_unpark_threads(void) { } -static inline int watchdog_enable_all_cpus(void) { return 0; } -static inline void watchdog_disable_all_cpus(void) { } static void lockup_detector_reconfigure(void) { cpus_read_lock(); -- cgit From 167a88677b05d6a810f23b871cfb2b5db1808e60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 10:53:01 +0200 Subject: smpboot: Remove cpumask from the API Now that the sole use of the whole smpboot_*cpumask() API is gone, remove it. Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/smpboot.h | 15 +------------- kernel/smpboot.c | 54 +++++-------------------------------------------- 2 files changed, 6 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index c174844cf663..d0884b525001 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -25,8 +25,6 @@ struct smpboot_thread_data; * parked (cpu offline) * @unpark: Optional unpark function, called when the thread is * unparked (cpu online) - * @cpumask: Internal state. To update which threads are unparked, - * call smpboot_update_cpumask_percpu_thread(). * @selfparking: Thread is not parked by the park function. * @thread_comm: The base name of the thread */ @@ -40,23 +38,12 @@ struct smp_hotplug_thread { void (*cleanup)(unsigned int cpu, bool online); void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); - cpumask_var_t cpumask; bool selfparking; const char *thread_comm; }; -int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, - const struct cpumask *cpumask); - -static inline int -smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) -{ - return smpboot_register_percpu_thread_cpumask(plug_thread, - cpu_possible_mask); -} +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); -void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *); #endif diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 5043e7433f4b..c230c2dd48e1 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - if (cpumask_test_cpu(cpu, cur->cpumask)) - smpboot_unpark_thread(cur, cpu); + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); return 0; } @@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * smpboot_register_percpu_thread - Register a per_cpu thread related * to hotplug * @plug_thread: Hotplug thread descriptor - * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, - const struct cpumask *cpumask) +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) { unsigned int cpu; int ret = 0; - if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) - return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpumask); - get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); - free_cpumask_var(plug_thread->cpumask); goto out; } - if (cpumask_test_cpu(cpu, cpumask)) - smpboot_unpark_thread(plug_thread, cpu); + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -315,7 +306,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug @@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); -/** - * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked - * @plug_thread: Hotplug thread descriptor - * @new: Revised mask to use - * - * The cpumask field in the smp_hotplug_thread must not be updated directly - * by the client, but only by calling this function. - * This function can only be called on a registered smp_hotplug_thread. - */ -void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) -{ - struct cpumask *old = plug_thread->cpumask; - static struct cpumask tmp; - unsigned int cpu; - - lockdep_assert_cpus_held(); - mutex_lock(&smpboot_threads_lock); - - /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(&tmp, old, new); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_park_thread(plug_thread, cpu); - - /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(&tmp, new, old); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_unpark_thread(plug_thread, cpu); - - cpumask_copy(old, new); - - mutex_unlock(&smpboot_threads_lock); -} - static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* -- cgit From f83ee19be4272564ad592ef90145db7295229490 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 10:55:56 +0200 Subject: kthread: Simplify kthread_park() completion Oleg explains the reason we could hit park+park is that smpboot_update_cpumask_percpu_thread()'s for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_kthread(); turns into: for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) smpboot_park_kthread(); on UP, ignoring the mask. But since we just completely removed that function, this is no longer relevant. So revert commit: b1f5b378e126 ("kthread: Allow kthread_park() on a parked kthread") Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/kthread.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 750cb8082694..11b591ee51ab 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self) if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; - complete_all(&self->parked); + complete(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -465,7 +465,6 @@ void kthread_unpark(struct task_struct *k) if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); - reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. @@ -493,6 +492,9 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; + if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) + return -EBUSY; + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); -- cgit From 55f036ca7e74b85e34958af3d22121c656796413 Mon Sep 17 00:00:00 2001 From: Peter Ziljstra Date: Fri, 15 Jun 2018 10:07:12 +0200 Subject: locking: WW mutex cleanup Make the WW mutex code more readable by adding comments, splitting up functions and pointing out that we're actually using the Wait-Die algorithm. Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Gustavo Padovan Cc: Maarten Lankhorst Cc: Sean Paul Cc: David Airlie Cc: Davidlohr Bueso Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Greg Kroah-Hartman Cc: linux-doc@vger.kernel.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Co-authored-by: Thomas Hellstrom Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Hellstrom Acked-by: Ingo Molnar --- Documentation/locking/ww-mutex-design.txt | 12 +- include/linux/ww_mutex.h | 28 ++--- kernel/locking/mutex.c | 202 ++++++++++++++++++------------ 3 files changed, 145 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt index 34c3a1b50b9a..2fd7f2a2af21 100644 --- a/Documentation/locking/ww-mutex-design.txt +++ b/Documentation/locking/ww-mutex-design.txt @@ -32,10 +32,10 @@ the oldest task) wins, and the one with the higher reservation id (i.e. the younger task) unlocks all of the buffers that it has already locked, and then tries again. -In the RDBMS literature this deadlock handling approach is called wait/wound: +In the RDBMS literature this deadlock handling approach is called wait/die: The older tasks waits until it can acquire the contended lock. The younger tasks needs to back off and drop all the locks it is currently holding, i.e. the -younger task is wounded. +younger task dies. Concepts -------- @@ -56,9 +56,9 @@ Furthermore there are three different class of w/w lock acquire functions: * Normal lock acquisition with a context, using ww_mutex_lock. -* Slowpath lock acquisition on the contending lock, used by the wounded task - after having dropped all already acquired locks. These functions have the - _slow postfix. +* Slowpath lock acquisition on the contending lock, used by the task that just + killed its transaction after having dropped all already acquired locks. + These functions have the _slow postfix. From a simple semantics point-of-view the _slow functions are not strictly required, since simply calling the normal ww_mutex_lock functions on the @@ -220,7 +220,7 @@ mutexes are a natural fit for such a case for two reasons: Note that this approach differs in two important ways from the above methods: - Since the list of objects is dynamically constructed (and might very well be - different when retrying due to hitting the -EDEADLK wound condition) there's + different when retrying due to hitting the -EDEADLK die condition) there's no need to keep any object on a persistent list when it's not locked. We can therefore move the list_head into the object itself. - On the other hand the dynamic object list construction also means that the -EALREADY return diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 39fda195bf78..f82fce2229c8 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -6,7 +6,7 @@ * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar * - * Wound/wait implementation: + * Wait/Die implementation: * Copyright (C) 2013 Canonical Ltd. * * This file contains the main data structure and API definitions. @@ -28,9 +28,9 @@ struct ww_class { struct ww_acquire_ctx { struct task_struct *task; unsigned long stamp; - unsigned acquired; + unsigned int acquired; #ifdef CONFIG_DEBUG_MUTEXES - unsigned done_acquire; + unsigned int done_acquire; struct ww_class *ww_class; struct ww_mutex *contending_lock; #endif @@ -38,8 +38,8 @@ struct ww_acquire_ctx { struct lockdep_map dep_map; #endif #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH - unsigned deadlock_inject_interval; - unsigned deadlock_inject_countdown; + unsigned int deadlock_inject_interval; + unsigned int deadlock_inject_countdown; #endif }; @@ -102,7 +102,7 @@ static inline void ww_mutex_init(struct ww_mutex *lock, * * Context-based w/w mutex acquiring can be done in any order whatsoever within * a given lock class. Deadlocks will be detected and handled with the - * wait/wound logic. + * wait/die logic. * * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can * result in undetected deadlocks and is so forbidden. Mixing different contexts @@ -195,13 +195,13 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) * Lock the w/w mutex exclusively for this task. * * Deadlocks within a given w/w class of locks are detected and handled with the - * wait/wound algorithm. If the lock isn't immediately avaiable this function + * wait/die algorithm. If the lock isn't immediately available this function * will either sleep until it is (wait case). Or it selects the current context - * for backing off by returning -EDEADLK (wound case). Trying to acquire the + * for backing off by returning -EDEADLK (die case). Trying to acquire the * same lock with the same context twice is also detected and signalled by * returning -EALREADY. Returns 0 if the mutex was successfully acquired. * - * In the wound case the caller must release all currently held w/w mutexes for + * In the die case the caller must release all currently held w/w mutexes for * the given context and then wait for this contending lock to be available by * calling ww_mutex_lock_slow. Alternatively callers can opt to not acquire this * lock and proceed with trying to acquire further w/w mutexes (e.g. when @@ -226,14 +226,14 @@ extern int /* __must_check */ ww_mutex_lock(struct ww_mutex *lock, struct ww_acq * Lock the w/w mutex exclusively for this task. * * Deadlocks within a given w/w class of locks are detected and handled with the - * wait/wound algorithm. If the lock isn't immediately avaiable this function + * wait/die algorithm. If the lock isn't immediately available this function * will either sleep until it is (wait case). Or it selects the current context - * for backing off by returning -EDEADLK (wound case). Trying to acquire the + * for backing off by returning -EDEADLK (die case). Trying to acquire the * same lock with the same context twice is also detected and signalled by * returning -EALREADY. Returns 0 if the mutex was successfully acquired. If a * signal arrives while waiting for the lock then this function returns -EINTR. * - * In the wound case the caller must release all currently held w/w mutexes for + * In the die case the caller must release all currently held w/w mutexes for * the given context and then wait for this contending lock to be available by * calling ww_mutex_lock_slow_interruptible. Alternatively callers can opt to * not acquire this lock and proceed with trying to acquire further w/w mutexes @@ -256,7 +256,7 @@ extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, * @lock: the mutex to be acquired * @ctx: w/w acquire context * - * Acquires a w/w mutex with the given context after a wound case. This function + * Acquires a w/w mutex with the given context after a die case. This function * will sleep until the lock becomes available. * * The caller must have released all w/w mutexes already acquired with the @@ -290,7 +290,7 @@ ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * @lock: the mutex to be acquired * @ctx: w/w acquire context * - * Acquires a w/w mutex with the given context after a wound case. This function + * Acquires a w/w mutex with the given context after a die case. This function * will sleep until the lock becomes available and returns 0 when the lock has * been acquired. If a signal arrives while waiting for the lock then this * function returns -EINTR. diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index f44f658ae629..cfe48419b7d0 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -244,6 +244,17 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { @@ -282,26 +293,53 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); #endif ww_ctx->acquired++; + ww->ctx = ww_ctx; } +/* + * Determine if context @a is 'after' context @b. IOW, @a is a younger + * transaction than @b and depending on algorithm either needs to wait for + * @b or die. + */ static inline bool __sched __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) { - return a->stamp - b->stamp <= LONG_MAX && - (a->stamp != b->stamp || a > b); + + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a younger waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool __sched +__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (waiter->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, waiter); + wake_up_process(waiter->task); + } + + return true; } /* - * Wake up any waiters that may have to back off when the lock is held by the - * given context. + * We just acquired @lock under @ww_ctx, if there are later contexts waiting + * behind us on the wait-list, check if they need to die. * - * Due to the invariants on the wait list, this can only affect the first - * waiter with a context. + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. * * The current task must not be on the wait list. */ static void __sched -__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { struct mutex_waiter *cur; @@ -311,30 +349,23 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (cur->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - - break; + if (__ww_mutex_die(lock, cur, ww_ctx)) + break; } } /* - * After acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; - /* * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be + * the WAITERS check is done, otherwise contended waiters might be * missed. The contended waiters will either see ww_ctx == NULL * and keep spinning, or it will acquire wait_lock, add itself * to waiter list and sleep. @@ -348,29 +379,14 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return; /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die. */ spin_lock(&lock->base.wait_lock); - __ww_mutex_wakeup_for_backoff(&lock->base, ctx); + __ww_mutex_check_waiters(&lock->base, ctx); spin_unlock(&lock->base.wait_lock); } -/* - * After acquiring lock in the slowpath set ctx. - * - * Unlike for the fast path, the caller ensures that waiters are woken up where - * necessary. - * - * Callers must hold the mutex wait_lock. - */ -static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; -} - #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline @@ -646,37 +662,73 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) } EXPORT_SYMBOL(ww_mutex_unlock); + +static __always_inline int __sched +__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + + +/* + * Check whether we need to kill the transaction for the current lock acquire. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. + */ static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) +__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); struct mutex_waiter *cur; + if (ctx->acquired == 0) + return 0; + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - goto deadlock; + return __ww_mutex_kill(lock, ctx); /* * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must back off. + * stamp is earlier than ours and we must kill ourself. */ cur = waiter; list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (cur->ww_ctx) - goto deadlock; + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); } return 0; - -deadlock: -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; } +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting. + */ static inline int __sched __ww_mutex_add_waiter(struct mutex_waiter *waiter, struct mutex *lock, @@ -693,7 +745,7 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, /* * Add the waiter before the first waiter with a higher stamp. * Waiters without a context are skipped to avoid starving - * them. + * them. Wait-Die waiters may die here. */ pos = &lock->wait_list; list_for_each_entry_reverse(cur, &lock->wait_list, list) { @@ -701,34 +753,27 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, continue; if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* Back off immediately if necessary. */ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + int ret = __ww_mutex_kill(lock, ww_ctx); - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; - } + if (ret) + return ret; break; } pos = &cur->list; - /* - * Wake up the waiter so that it gets a chance to back - * off. - */ - if (cur->ww_ctx->acquired > 0) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); } list_add_tail(&waiter->list, pos); + return 0; } @@ -772,7 +817,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ if (__mutex_trylock(lock)) { if (use_ww_ctx && ww_ctx) - __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx); goto skip_wait; } @@ -790,10 +835,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, waiter.ww_ctx = MUTEX_POISON_WW_CTX; #endif } else { - /* Add in stamp order, waking up waiters that must back off. */ + /* + * Add in stamp order, waking up waiters that must kill + * themselves. + */ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) - goto err_early_backoff; + goto err_early_kill; waiter.ww_ctx = ww_ctx; } @@ -815,7 +863,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto acquired; /* - * Check for signals and wound conditions while holding + * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ @@ -824,8 +872,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); + if (use_ww_ctx && ww_ctx) { + ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; } @@ -870,7 +918,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); if (use_ww_ctx && ww_ctx) - ww_mutex_set_context_slowpath(ww, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); spin_unlock(&lock->wait_lock); preempt_enable(); @@ -879,7 +927,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); -err_early_backoff: +err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); -- cgit From 08295b3b5beec9aac0f7a9db86f0fc3792039da3 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 15 Jun 2018 10:17:38 +0200 Subject: locking: Implement an algorithm choice for Wound-Wait mutexes The current Wound-Wait mutex algorithm is actually not Wound-Wait but Wait-Die. Implement also Wound-Wait as a per-ww-class choice. Wound-Wait is, contrary to Wait-Die a preemptive algorithm and is known to generate fewer backoffs. Testing reveals that this is true if the number of simultaneous contending transactions is small. As the number of simultaneous contending threads increases, Wait-Wound becomes inferior to Wait-Die in terms of elapsed time. Possibly due to the larger number of held locks of sleeping transactions. Update documentation and callers. Timings using git://people.freedesktop.org/~thomash/ww_mutex_test tag patch-18-06-15 Each thread runs 100000 batches of lock / unlock 800 ww mutexes randomly chosen out of 100000. Four core Intel x86_64: Algorithm #threads Rollbacks time Wound-Wait 4 ~100 ~17s. Wait-Die 4 ~150000 ~19s. Wound-Wait 16 ~360000 ~109s. Wait-Die 16 ~450000 ~82s. Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Gustavo Padovan Cc: Maarten Lankhorst Cc: Sean Paul Cc: David Airlie Cc: Davidlohr Bueso Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Greg Kroah-Hartman Cc: linux-doc@vger.kernel.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Co-authored-by: Peter Zijlstra Signed-off-by: Thomas Hellstrom Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar --- Documentation/locking/ww-mutex-design.txt | 57 +++++++++-- drivers/dma-buf/reservation.c | 2 +- drivers/gpu/drm/drm_modeset_lock.c | 2 +- include/linux/ww_mutex.h | 17 ++- kernel/locking/locktorture.c | 2 +- kernel/locking/mutex.c | 165 +++++++++++++++++++++++++++--- kernel/locking/test-ww_mutex.c | 2 +- lib/locking-selftest.c | 2 +- 8 files changed, 213 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt index 2fd7f2a2af21..f0ed7c30e695 100644 --- a/Documentation/locking/ww-mutex-design.txt +++ b/Documentation/locking/ww-mutex-design.txt @@ -1,4 +1,4 @@ -Wait/Wound Deadlock-Proof Mutex Design +Wound/Wait Deadlock-Proof Mutex Design ====================================== Please read mutex-design.txt first, as it applies to wait/wound mutexes too. @@ -32,10 +32,26 @@ the oldest task) wins, and the one with the higher reservation id (i.e. the younger task) unlocks all of the buffers that it has already locked, and then tries again. -In the RDBMS literature this deadlock handling approach is called wait/die: -The older tasks waits until it can acquire the contended lock. The younger tasks -needs to back off and drop all the locks it is currently holding, i.e. the -younger task dies. +In the RDBMS literature, a reservation ticket is associated with a transaction. +and the deadlock handling approach is called Wait-Die. The name is based on +the actions of a locking thread when it encounters an already locked mutex. +If the transaction holding the lock is younger, the locking transaction waits. +If the transaction holding the lock is older, the locking transaction backs off +and dies. Hence Wait-Die. +There is also another algorithm called Wound-Wait: +If the transaction holding the lock is younger, the locking transaction +wounds the transaction holding the lock, requesting it to die. +If the transaction holding the lock is older, it waits for the other +transaction. Hence Wound-Wait. +The two algorithms are both fair in that a transaction will eventually succeed. +However, the Wound-Wait algorithm is typically stated to generate fewer backoffs +compared to Wait-Die, but is, on the other hand, associated with more work than +Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive +algorithm in that transactions are wounded by other transactions, and that +requires a reliable way to pick up up the wounded condition and preempt the +running transaction. Note that this is not the same as process preemption. A +Wound-Wait transaction is considered preempted when it dies (returning +-EDEADLK) following a wound. Concepts -------- @@ -47,10 +63,12 @@ Acquire context: To ensure eventual forward progress it is important the a task trying to acquire locks doesn't grab a new reservation id, but keeps the one it acquired when starting the lock acquisition. This ticket is stored in the acquire context. Furthermore the acquire context keeps track of debugging state -to catch w/w mutex interface abuse. +to catch w/w mutex interface abuse. An acquire context is representing a +transaction. W/w class: In contrast to normal mutexes the lock class needs to be explicit for -w/w mutexes, since it is required to initialize the acquire context. +w/w mutexes, since it is required to initialize the acquire context. The lock +class also specifies what algorithm to use, Wound-Wait or Wait-Die. Furthermore there are three different class of w/w lock acquire functions: @@ -90,6 +108,12 @@ provided. Usage ----- +The algorithm (Wait-Die vs Wound-Wait) is chosen by using either +DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die) +As a rough rule of thumb, use Wound-Wait iff you +expect the number of simultaneous competing transactions to be typically small, +and you want to reduce the number of rollbacks. + Three different ways to acquire locks within the same w/w class. Common definitions for methods #1 and #2: @@ -312,12 +336,23 @@ Design: We maintain the following invariants for the wait list: (1) Waiters with an acquire context are sorted by stamp order; waiters without an acquire context are interspersed in FIFO order. - (2) Among waiters with contexts, only the first one can have other locks - acquired already (ctx->acquired > 0). Note that this waiter may come - after other waiters without contexts in the list. + (2) For Wait-Die, among waiters with contexts, only the first one can have + other locks acquired already (ctx->acquired > 0). Note that this waiter + may come after other waiters without contexts in the list. + + The Wound-Wait preemption is implemented with a lazy-preemption scheme: + The wounded status of the transaction is checked only when there is + contention for a new lock and hence a true chance of deadlock. In that + situation, if the transaction is wounded, it backs off, clears the + wounded status and retries. A great benefit of implementing preemption in + this way is that the wounded transaction can identify a contending lock to + wait for before restarting the transaction. Just blindly restarting the + transaction would likely make the transaction end up in a situation where + it would have to back off again. In general, not much contention is expected. The locks are typically used to - serialize access to resources for devices. + serialize access to resources for devices, and optimization focus should + therefore be directed towards the uncontended cases. Lockdep: Special care has been taken to warn for as many cases of api abuse diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c index 314eb1071cce..20bf90f4ee63 100644 --- a/drivers/dma-buf/reservation.c +++ b/drivers/dma-buf/reservation.c @@ -46,7 +46,7 @@ * write-side updates. */ -DEFINE_WW_CLASS(reservation_ww_class); +DEFINE_WD_CLASS(reservation_ww_class); EXPORT_SYMBOL(reservation_ww_class); struct lock_class_key reservation_seqcount_class; diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index 8a5100685875..638be2eb67b4 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -70,7 +70,7 @@ * lists and lookup data structures. */ -static DEFINE_WW_CLASS(crtc_ww_class); +static DEFINE_WD_CLASS(crtc_ww_class); /** * drm_modeset_lock_all - take all modeset locks diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index f82fce2229c8..3af7c0e03be5 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -8,6 +8,8 @@ * * Wait/Die implementation: * Copyright (C) 2013 Canonical Ltd. + * Choice of algorithm: + * Copyright (C) 2018 WMWare Inc. * * This file contains the main data structure and API definitions. */ @@ -23,12 +25,15 @@ struct ww_class { struct lock_class_key mutex_key; const char *acquire_name; const char *mutex_name; + unsigned int is_wait_die; }; struct ww_acquire_ctx { struct task_struct *task; unsigned long stamp; unsigned int acquired; + unsigned short wounded; + unsigned short is_wait_die; #ifdef CONFIG_DEBUG_MUTEXES unsigned int done_acquire; struct ww_class *ww_class; @@ -58,17 +63,21 @@ struct ww_mutex { # define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) #endif -#define __WW_CLASS_INITIALIZER(ww_class) \ +#define __WW_CLASS_INITIALIZER(ww_class, _is_wait_die) \ { .stamp = ATOMIC_LONG_INIT(0) \ , .acquire_name = #ww_class "_acquire" \ - , .mutex_name = #ww_class "_mutex" } + , .mutex_name = #ww_class "_mutex" \ + , .is_wait_die = _is_wait_die } #define __WW_MUTEX_INITIALIZER(lockname, class) \ { .base = __MUTEX_INITIALIZER(lockname.base) \ __WW_CLASS_MUTEX_INITIALIZER(lockname, class) } +#define DEFINE_WD_CLASS(classname) \ + struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 1) + #define DEFINE_WW_CLASS(classname) \ - struct ww_class classname = __WW_CLASS_INITIALIZER(classname) + struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 0) #define DEFINE_WW_MUTEX(mutexname, ww_class) \ struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class) @@ -123,6 +132,8 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, ctx->task = current; ctx->stamp = atomic_long_inc_return_relaxed(&ww_class->stamp); ctx->acquired = 0; + ctx->wounded = false; + ctx->is_wait_die = ww_class->is_wait_die; #ifdef CONFIG_DEBUG_MUTEXES ctx->ww_class = ww_class; ctx->done_acquire = 0; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8402b3349dca..c28224347d69 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -365,7 +365,7 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include -static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WD_CLASS(torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cfe48419b7d0..1a81a1257b3f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -173,6 +173,21 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; } +/* + * Add @waiter to a given location in the lock wait_list and set the + * FLAG_WAITERS flag if it's the first waiter. + */ +static void __sched +__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct list_head *list) +{ + debug_mutex_add_waiter(lock, waiter, current); + + list_add_tail(&waiter->list, list); + if (__mutex_waiter_is_first(lock, waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); +} + /* * Give up ownership to a specific task, when @task = NULL, this is equivalent * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves @@ -249,6 +264,11 @@ EXPORT_SYMBOL(mutex_lock); * The newer transactions are killed when: * It (the new transaction) makes a request for a lock being held * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. */ /* @@ -320,6 +340,9 @@ static bool __sched __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, struct ww_acquire_ctx *ww_ctx) { + if (!ww_ctx->is_wait_die) + return false; + if (waiter->ww_ctx->acquired > 0 && __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { debug_mutex_wake_waiter(lock, waiter); @@ -329,13 +352,65 @@ __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, return true; } +/* + * Wound-Wait; wound a younger @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with older transactions than + * the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __mutex_owner(lock); + + lockdep_assert_held(&lock->wait_lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). + */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_lock_check_stamp() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; +} + /* * We just acquired @lock under @ww_ctx, if there are later contexts waiting - * behind us on the wait-list, check if they need to die. + * behind us on the wait-list, check if they need to die, or wound us. * * See __ww_mutex_add_waiter() for the list-order construction; basically the * list is ordered by stamp, smallest (oldest) first. * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. + * * The current task must not be on the wait list. */ static void __sched @@ -349,7 +424,8 @@ __ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (__ww_mutex_die(lock, cur, ww_ctx)) + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) break; } } @@ -370,17 +446,23 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * and keep spinning, or it will acquire wait_lock, add itself * to waiter list and sleep. */ - smp_mb(); /* ^^^ */ + smp_mb(); /* See comments above and below. */ /* - * Check if lock is contended, if not there is nobody to wake up + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. */ if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* * Uh oh, we raced in fastpath, check if any of the waiters need to - * die. + * die or wound us. */ spin_lock(&lock->base.wait_lock); __ww_mutex_check_waiters(&lock->base, ctx); @@ -682,7 +764,9 @@ __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) /* - * Check whether we need to kill the transaction for the current lock acquire. + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. * * Wait-Die: If we're trying to acquire a lock already held by an older * context, kill ourselves. @@ -701,6 +785,13 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, if (ctx->acquired == 0) return 0; + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) return __ww_mutex_kill(lock, ctx); @@ -727,7 +818,8 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, * Waiters without context are interspersed in FIFO order. * * Furthermore, for Wait-Die kill ourself immediately when possible (there are - * older contexts already waiting) to avoid unnecessary waiting. + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. */ static inline int __sched __ww_mutex_add_waiter(struct mutex_waiter *waiter, @@ -736,16 +828,21 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, { struct mutex_waiter *cur; struct list_head *pos; + bool is_wait_die; if (!ww_ctx) { - list_add_tail(&waiter->list, &lock->wait_list); + __mutex_add_waiter(lock, waiter, &lock->wait_list); return 0; } + is_wait_die = ww_ctx->is_wait_die; + /* * Add the waiter before the first waiter with a higher stamp. * Waiters without a context are skipped to avoid starving - * them. Wait-Die waiters may die here. + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. */ pos = &lock->wait_list; list_for_each_entry_reverse(cur, &lock->wait_list, list) { @@ -758,10 +855,12 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, * is no point in queueing behind it, as we'd have to * die the moment it would acquire the lock. */ - int ret = __ww_mutex_kill(lock, ww_ctx); + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); - if (ret) - return ret; + if (ret) + return ret; + } break; } @@ -772,7 +871,23 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, __ww_mutex_die(lock, cur, ww_ctx); } - list_add_tail(&waiter->list, pos); + __mutex_add_waiter(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + + /* + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. + */ + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); + } return 0; } @@ -796,6 +911,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; } preempt_disable(); @@ -829,7 +952,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, &lock->wait_list); + #ifdef CONFIG_DEBUG_MUTEXES waiter.ww_ctx = MUTEX_POISON_WW_CTX; @@ -848,9 +972,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, waiter.task = current; - if (__mutex_waiter_is_first(lock, &waiter)) - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - set_current_state(state); for (;;) { /* @@ -907,6 +1028,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, acquired: __set_current_state(TASK_RUNNING); + if (use_ww_ctx && ww_ctx) { + /* + * Wound-Wait; we stole the lock (!first_waiter), check the + * waiters as anyone might want to wound us. + */ + if (!ww_ctx->is_wait_die && + !__mutex_waiter_is_first(lock, &waiter)) + __ww_mutex_check_waiters(lock, ww_ctx); + } + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0e4cd64ad2c0..5b915b370d5a 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -26,7 +26,7 @@ #include #include -static DEFINE_WW_CLASS(ww_class); +static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; struct test_mutex { diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index b5c1293ce147..1e1bbf171eca 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -29,7 +29,7 @@ */ static unsigned int debug_locks_verbose; -static DEFINE_WW_CLASS(ww_lockdep); +static DEFINE_WD_CLASS(ww_lockdep); static int __init setup_debug_locks_verbose(char *str) { -- cgit From c72051d5778a9c0e5df31d6553a6fa3507b3685c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:58:24 +0200 Subject: audit: use ktime_get_coarse_ts64() for time access The API got renamed for consistency with the other time accessors, this changes the audit caller as well. Signed-off-by: Arnd Bergmann Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0f3222e4edde..e17bc697d11c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1721,7 +1721,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = current_kernel_time64(); + ktime_get_coarse_ts64(t); *serial = audit_serial(); } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d762e0b8160e..f6a0cb32d76e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1540,10 +1540,10 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[2] = a3; context->argv[3] = a4; context->serial = 0; - context->ctime = current_kernel_time64(); context->in_syscall = 1; context->current_state = state; context->ppid = 0; + ktime_get_coarse_ts64(&context->ctime); } /** -- cgit From ed2b82c03dc187018307c7c6bf9299705f3db383 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Fri, 29 Jun 2018 14:48:20 +0200 Subject: bpf: hash map: decrement counter on error Decrement the number of elements in the map in case the allocation of a new node fails. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: Mauricio Vasquez B Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ca2198a6d22..513d9dfcf4ee 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, * old element will be freed immediately. * Otherwise return an error */ - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + l_new = ERR_PTR(-E2BIG); + goto dec_count; } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) - return ERR_PTR(-ENOMEM); + if (!l_new) { + l_new = ERR_PTR(-ENOMEM); + goto dec_count; + } } memcpy(l_new->key, key, key_size); @@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); - return ERR_PTR(-ENOMEM); + l_new = ERR_PTR(-ENOMEM); + goto dec_count; } } @@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; +dec_count: + atomic_dec(&htab->count); + return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, -- cgit From cf4d418e653afc84c9c873236033e06be5d58f1c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Mar 2018 16:09:10 +0200 Subject: tracing: Avoid string overflow 'err' is used as a NUL-terminated string, but using strncpy() with the length equal to the buffer size may result in lack of the termination: kernel/trace/trace_events_hist.c: In function 'hist_err_event': kernel/trace/trace_events_hist.c:396:3: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation] strncpy(err, var, MAX_FILTER_STR_VAL); This changes it to use the safer strscpy() instead. Link: http://lkml.kernel.org/r/20180328140920.2842153-1-arnd@arndb.de Cc: stable@vger.kernel.org Fixes: f404da6e1d46 ("tracing: Add 'last error' error facility for hist triggers") Acked-by: Tom Zanussi Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 046c716a6536..aae18af94c94 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var) else if (system) snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); else - strncpy(err, var, MAX_FILTER_STR_VAL); + strscpy(err, var, MAX_FILTER_STR_VAL); hist_err(str, err); } -- cgit From f90658725ba7ebb031054866aff4cda0d099a3b1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Jul 2018 11:41:38 -0400 Subject: tracing: Make create_filter() code match the comments The comment in create_filter() states that the passed in filter pointer (filterp) will either be NULL or contain an error message stating why the filter failed. But it also expects the filter pointer to point to NULL when passed in. If it is not, the function create_filter_start() will warn and return an error message without updating the filter pointer. This is not what the comment states. As we always expect the pointer to point to NULL, if it is not, trigger a WARN_ON(), set it to NULL, and then continue the path as the rest will work as the comment states. Also update the comment to state it must point to NULL. Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0dceb77d1d42..893a206bcba4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) + * Must be a pointer that references a NULL pointer. * * Creates a filter for @call with @filter_str. If @set_str is %true, * @filter_str is copied and recorded in the new filter. @@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call, struct filter_parse_error *pe = NULL; int err; + /* filterp must point to NULL */ + if (WARN_ON(*filterp)) + *filterp = NULL; + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; -- cgit From f26808ba7227a921e0e8549c7d3c52332b920085 Mon Sep 17 00:00:00 2001 From: yuan linyu Date: Sun, 8 Apr 2018 19:36:31 +0800 Subject: tracing: Optimize trace_buffer_iter() logic Simplify and optimize the logic in trace_buffer_iter() to use a conditional operation instead of an if conditional. Link: http://lkml.kernel.org/r/20180408113631.3947-1-cugyly@163.com Signed-off-by: yuan linyu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 630c5a24b2b2..f8f86231ad90 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit) static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { - if (iter->buffer_iter && iter->buffer_iter[cpu]) - return iter->buffer_iter[cpu]; - return NULL; + return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } int tracer_init(struct tracer *t, struct trace_array *tr); -- cgit From 26b68dd2f48fe7699a89f0cfbb9f4a650dc1c837 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 8 Mar 2018 21:58:43 +0100 Subject: tracing: Use __printf markup to silence compiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silence warnings (triggered at W=1) by adding relevant __printf attributes. CC kernel/trace/trace.o kernel/trace/trace.c: In function ‘__trace_array_vprintk’: kernel/trace/trace.c:2979:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format] len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); ^~~ AR kernel/trace/built-in.o Link: http://lkml.kernel.org/r/20180308205843.27447-1-malat@debian.org Signed-off-by: Mathieu Malaterre Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0079b4c7a49..f054bd6a1c66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2953,6 +2953,7 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); +__printf(3, 0) static int __trace_array_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args) @@ -3007,12 +3008,14 @@ out_nobuffer: return len; } +__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); } +__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3028,6 +3031,7 @@ int trace_array_printk(struct trace_array *tr, return ret; } +__printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3043,6 +3047,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, return ret; } +__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); -- cgit From 5ccba64a560fa6ca06008d4001f5d46ebeb34b41 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 2 Feb 2018 10:14:49 +0800 Subject: ftrace: Nuke clear_ftrace_function clear_ftrace_function is not used outside of ftrace.c and is not help to use a function, so nuke it per Steve's suggestion. Link: http://lkml.kernel.org/r/1517537689-34947-1-git-send-email-xieyisheng1@huawei.com Suggested-by: Steven Rostedt Signed-off-by: Yisheng Xie Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 -- kernel/trace/ftrace.c | 13 +------------ 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8154f4920fcb..ebb77674be90 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,7 +223,6 @@ extern enum ftrace_tracing_type_t ftrace_tracing_type; */ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); -void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct pt_regs *regs); @@ -239,7 +238,6 @@ static inline int ftrace_nr_registered_ops(void) { return 0; } -static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index efed9c1cfb7e..caf9cbf35816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -6689,7 +6678,7 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; - clear_ftrace_function(); + ftrace_trace_function = ftrace_stub; } /** -- cgit From 1fe4293f4b8de75824935f8d8e9a99c7fc6873da Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 31 Jan 2018 23:48:49 +0800 Subject: tracing: Fix missing return symbol in function_graph output The function_graph tracer does not show the interrupt return marker for the leaf entry. On leaf entries, we see an unbalanced interrupt marker (the interrupt was entered, but nevern left). Before: 1) | SyS_write() { 1) | __fdget_pos() { 1) 0.061 us | __fget_light(); 1) 0.289 us | } 1) | vfs_write() { 1) 0.049 us | rw_verify_area(); 1) + 15.424 us | __vfs_write(); 1) ==========> | 1) 6.003 us | smp_apic_timer_interrupt(); 1) 0.055 us | __fsnotify_parent(); 1) 0.073 us | fsnotify(); 1) + 23.665 us | } 1) + 24.501 us | } After: 0) | SyS_write() { 0) | __fdget_pos() { 0) 0.052 us | __fget_light(); 0) 0.328 us | } 0) | vfs_write() { 0) 0.057 us | rw_verify_area(); 0) | __vfs_write() { 0) ==========> | 0) 8.548 us | smp_apic_timer_interrupt(); 0) <========== | 0) + 36.507 us | } /* __vfs_write */ 0) 0.049 us | __fsnotify_parent(); 0) 0.066 us | fsnotify(); 0) + 50.064 us | } 0) + 50.952 us | } Link: http://lkml.kernel.org/r/1517413729-20411-1-git-send-email-changbin.du@intel.com Cc: stable@vger.kernel.org Fixes: f8b755ac8e0cc ("tracing/function-graph-tracer: Output arrows signal on hardirq call/return") Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23c0b0cb5fb9..169b3c44ee97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; + int cpu = iter->cpu; int i; graph_ret = &ret_entry->ret; @@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; cpu_data = per_cpu_ptr(data->cpu_data, cpu); @@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_printf(s, "%ps();\n", (void *)call->func); + print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, + cpu, iter->ent->pid, flags); + return trace_handle_return(s); } -- cgit From 26acfb666a473d960f0fd971fe68f3e3ad16c70b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Jun 2018 11:29:53 -0400 Subject: x86/KVM: Warn user if KVM is loaded SMT and L1TF CPU bug being present If the L1TF CPU bug is present we allow the KVM module to be loaded as the major of users that use Linux and KVM have trusted guests and do not want a broken setup. Cloud vendors are the ones that are uncomfortable with CVE 2018-3620 and as such they are the ones that should set nosmt to one. Setting 'nosmt' means that the system administrator also needs to disable SMT (Hyper-threading) in the BIOS, or via the 'nosmt' command line parameter, or via the /sys/devices/system/cpu/smt/control. See commit 05736e4ac13c ("cpu/hotplug: Provide knobs to control SMT"). Other mitigations are to use task affinity, cpu sets, interrupt binding, etc - anything to make sure that _only_ the same guests vCPUs are running on sibling threads. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/kvm/vmx.c | 13 +++++++++++++ kernel/cpu.c | 1 + 3 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5da7b0b30432..298f1b38dc89 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1946,6 +1946,12 @@ [KVM,ARM] Allow use of GICv4 for direct injection of LPIs. + kvm-intel.nosmt=[KVM,Intel] If the L1TF CPU bug is present (CVE-2018-3620) + and the system has SMT (aka Hyper-Threading) enabled then + don't allow guests to be created. + + Default is 0 (allow guests to be created). + kvm-intel.ept= [KVM,Intel] Disable extended page tables (virtualized MMU) support on capable Intel chips. Default is 1 (enabled) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 559a12b6184d..f2e7b6d016c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); +static bool __read_mostly nosmt; +module_param(nosmt, bool, S_IRUGO); + static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); @@ -10370,10 +10373,20 @@ free_vcpu: return ERR_PTR(err); } +#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n" + static int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && cpu_smt_control == CPU_SMT_ENABLED) { + if (nosmt) { + pr_err(L1TF_MSG); + return -EOPNOTSUPP; + } + pr_warn(L1TF_MSG); + } return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 6f3a3cde8b83..5a00ebdf98c6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -345,6 +345,7 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +EXPORT_SYMBOL_GPL(cpu_smt_control); static int __init smt_cmdline_disable(char *str) { -- cgit From 547b3aa451ae2739585547db9fbdee11a43ff999 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:05:56 -0700 Subject: bpf: sockmap, error path can not release psock in multi-map case The current code, in the error path of sock_hash_ctx_update_elem, checks if the sock has a psock in the user data and if so decrements the reference count of the psock. However, if the error happens early in the error path we may have never incremented the psock reference count and if the psock exists because the sock is in another map then we may inadvertently decrement the reference count. Fix this by making the error path only call smap_release_sock if the error happens after the increment. Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf7b6a6dbd1f..3847a7ce7dae 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1896,7 +1896,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); if (!e) { err = -ENOMEM; - goto out_progs; + goto out_free; } } @@ -2342,7 +2342,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (err) goto err; - /* bpf_map_update_elem() can be called in_irq() */ + /* psock is valid here because otherwise above *ctx_update_elem would + * have thrown an error. It is safe to skip error check. + */ + psock = smap_psock_sk(sock); raw_spin_lock_bh(&b->lock); l_old = lookup_elem_raw(head, hash, key, key_size); if (l_old && map_flags == BPF_NOEXIST) { @@ -2360,12 +2363,6 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - psock = smap_psock_sk(sock); - if (unlikely(!psock)) { - err = -EINVAL; - goto bucket_err; - } - rcu_assign_pointer(e->hash_link, l_new); rcu_assign_pointer(e->htab, container_of(map, struct bpf_htab, map)); @@ -2388,12 +2385,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, raw_spin_unlock_bh(&b->lock); return 0; bucket_err: + smap_release_sock(psock, sock); raw_spin_unlock_bh(&b->lock); err: kfree(e); - psock = smap_psock_sk(sock); - if (psock) - smap_release_sock(psock, sock); return err; } -- cgit From 1d1ef005dbc6de673c62cbd2562290ada3090463 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:06:01 -0700 Subject: bpf: sockmap, hash table is RCU so readers do not need locks This removes locking from readers of RCU hash table. Its not necessary. Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 3847a7ce7dae..00fb2e328d1b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2467,10 +2467,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_bh(&b->lock); l = lookup_elem_raw(head, hash, key, key_size); sk = l ? l->sk : NULL; - raw_spin_unlock_bh(&b->lock); return sk; } -- cgit From 99ba2b5aba24e022683a7db63204f9e306fe7ab9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:04 -0700 Subject: bpf: sockhash, disallow bpf_tcp_close and update in parallel After latest lock updates there is no longer anything preventing a close and recvmsg call running in parallel. Additionally, we can race update with close if we close a socket and simultaneously update if via the BPF userspace API (note the cgroup ops are already run with sock_lock held). To resolve this take sock_lock in close and update paths. Reported-by: syzbot+b680e42077a0d7c9a0c4@syzkaller.appspotmail.com Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 15 +++++++++++++++ kernel/bpf/syscall.c | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 00fb2e328d1b..9c67e96fe336 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -312,10 +312,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) struct smap_psock *psock; struct sock *osk; + lock_sock(sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) { rcu_read_unlock(); + release_sock(sk); return sk->sk_prot->close(sk, timeout); } @@ -371,6 +373,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); } rcu_read_unlock(); + release_sock(sk); close_fun(sk, timeout); } @@ -2069,7 +2072,13 @@ static int sock_map_update_elem(struct bpf_map *map, return -EOPNOTSUPP; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_map_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2410,7 +2419,13 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_hash_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d10ecd78105f..a31a1ba0f8ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_update_elem(map, key, value, attr->flags); goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } -- cgit From 7ebc14d507b4b55105da8d1a1eda323381529cc7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:10 -0700 Subject: bpf: sockmap, consume_skb in close path Currently, when a sock is closed and the bpf_tcp_close() callback is used we remove memory but do not free the skb. Call consume_skb() if the skb is attached to the buffer. Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 9c67e96fe336..dfc8a8a07c1f 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -571,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) while (sg[i].length) { free += sg[i].length; sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (!md->skb) + put_page(sg_page(&sg[i])); sg[i].length = 0; sg[i].page_link = 0; sg[i].offset = 0; @@ -580,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + if (md->skb) + consume_skb(md->skb); return free; } -- cgit From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:15 -0700 Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb In commit 'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512) we added the routine bpf_compute_data_end_sk_skb() to compute the correct data_end values, but this has since been lost. In kernel v4.14 this was correct and the above patch was applied in it entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree we lost the piece that renamed bpf_compute_data_pointers to the new function bpf_compute_data_end_sk_skb. This was done here, e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") When it conflicted with the following rename patch, 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Finally, after a refactor I thought even the function bpf_compute_data_end_sk_skb() was no longer needed and it was erroneously removed. However, we never reverted the sk_skb_convert_ctx_access() usage of tcp_skb_cb which had been committed and survived the merge conflict. Here we fix this by adding back the helper and *_data_end_sk_skb() usage. Using the bpf_skc_data_end mapping is not correct because it expects a qdisc_skb_cb object but at the sock layer this is not the case. Even though it happens to work here because we don't overwrite any data in-use at the socket layer and the cb structure is cleared later this has potential to create some subtle issues. But, even more concretely the filter.c access check uses tcp_skb_cb. And by some act of chance though, struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; /* 0 28 */ /* XXX 4 bytes hole, try to pack */ void * data_meta; /* 32 8 */ void * data_end; /* 40 8 */ /* size: 48, cachelines: 1, members: 3 */ /* sum members: 44, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; and then tcp_skb_cb, struct tcp_skb_cb { [...] struct { __u32 flags; /* 24 4 */ struct sock * sk_redir; /* 32 8 */ void * data_end; /* 40 8 */ } bpf; /* 24 */ }; So when we use offset_of() to track down the byte offset we get 40 in either case and everything continues to work. Fix this mess and use correct structures its unclear how long this might actually work for until someone moves the structs around. Reported-by: Martin KaFai Lau Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 +++ kernel/bpf/sockmap.c | 4 +-- net/core/filter.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/net/tcp.h b/include/net/tcp.h index 800582b5dd54..af3ec72d5d41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -828,6 +828,10 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index dfc8a8a07c1f..98fb7938beea 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/core/filter.c b/net/core/filter.c index 3095f1ba7015..470268024a40 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1762,6 +1762,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end_sk_skb(skb); + return err; +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -2864,8 +2895,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); @@ -2901,6 +2932,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; @@ -2915,8 +2953,26 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 new_len = skb->len + head_room; @@ -2942,8 +2998,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + bpf_compute_data_pointers(skb); - return 0; + return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2955,6 +3019,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -4618,9 +4699,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || + func == sk_skb_change_head || func == bpf_skb_change_tail || + func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || + func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || @@ -4872,11 +4956,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; + return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &bpf_skb_change_tail_proto; + return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &bpf_skb_change_head_proto; + return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: -- cgit From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 6 Jul 2018 11:49:00 +0900 Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU Otherwise we end up with attempting to send packets from down devices or to send oversized packets, which may cause unexpected driver/device behaviour. Generic XDP has already done this check, so reuse the logic in native XDP. Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function") Signed-off-by: Toshiaki Makita Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 6 +++--- kernel/bpf/devmap.c | 7 ++++++- net/core/filter.c | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 300baad62c88..c73dd7396886 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); -static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, - struct net_device *fwd) +static inline int xdp_ok_fwd_dev(const struct net_device *fwd, + unsigned int pktlen) { unsigned int len; @@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) + if (pktlen > len) return -EMSGSIZE; return 0; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/net/core/filter.c b/net/core/filter.c index 470268024a40..5fa66a33927f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3128,12 +3128,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3367,7 +3371,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; -- cgit From 215af5499d9e2b55f111d2431ea20218115f29b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 7 Jul 2018 11:40:18 +0200 Subject: cpu/hotplug: Online siblings when SMT control is turned on Writing 'off' to /sys/devices/system/cpu/smt/control offlines all SMT siblings. Writing 'on' merily enables the abilify to online them, but does not online them automatically. Make 'on' more useful by onlining all offline siblings. Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5a00ebdf98c6..d79e24df2420 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1979,6 +1979,15 @@ static void cpuhp_offline_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2011,11 +2020,24 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static void cpuhp_smt_enable(void) +static int cpuhp_smt_enable(void) { + int cpu, ret = 0; + cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } cpu_maps_update_done(); + return ret; } static ssize_t @@ -2046,7 +2068,7 @@ store_smt_control(struct device *dev, struct device_attribute *attr, if (ctrlval != cpu_smt_control) { switch (ctrlval) { case CPU_SMT_ENABLED: - cpuhp_smt_enable(); + ret = cpuhp_smt_enable(); break; case CPU_SMT_DISABLED: case CPU_SMT_FORCE_DISABLED: -- cgit From 8599dc7dec874b137384714bfe3ddbbf5d4a614e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 17:06:41 +0200 Subject: printk: Clean up syslog_print_all() syslog_print_all() is called twice. Once with a valid buffer and once just to set the indexes. Both variants are already handled separately. This patch just makes it more obvious. It does not change the existing behavior. Link: http://lkml.kernel.org/r/20180627150641.p56xyy6mdzvnfpig@pathway.suse.cz Cc: Sergey Senozhatsky Cc: Namit Gupta Cc: linux-kernel@vger.kernel.org Cc: pankaj.m@samsung.com Cc: a.sahrawat@samsung.com Cc: himanshu.m@samsung.com Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 16b02cc51a14..fcc1992c040a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1356,16 +1356,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; u32 idx; - if (!buf) { - if (clear) { - logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; - logbuf_unlock_irq(); - } - return 0; - } - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -1437,6 +1427,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return len; } +static void syslog_clear(void) +{ + logbuf_lock_irq(); + clear_seq = log_next_seq; + clear_idx = log_next_idx; + logbuf_unlock_irq(); +} + int do_syslog(int type, char __user *buf, int len, int source) { bool clear = false; @@ -1481,7 +1479,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: - syslog_print_all(NULL, 0, true); + syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: -- cgit From ba552399954dde1b388f7749fecad5c349216981 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 16:08:15 +0200 Subject: printk: Split the code for storing a message into the log buffer It is just a preparation step. The patch does not change the existing behavior. Link: http://lkml.kernel.org/r/20180627140817.27764-2-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..a844c611b17c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1824,28 +1824,16 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } -asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, va_list args) +/* Must be called under logbuf_lock. */ +int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; enum log_flags lflags = 0; - unsigned long flags; - int printed_len; - bool in_sched = false; - - if (level == LOGLEVEL_SCHED) { - level = LOGLEVEL_DEFAULT; - in_sched = true; - } - - boot_delay_msec(level); - printk_delay(); - /* This stops the holder of console_sem just where we want him */ - logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1886,8 +1874,29 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); + return log_output(facility, level, lflags, + dict, dictlen, text, text_len); +} +asmlinkage int vprintk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) +{ + int printed_len; + bool in_sched = false; + unsigned long flags; + + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; + in_sched = true; + } + + boot_delay_msec(level); + printk_delay(); + + /* This stops the holder of console_sem just where we want him */ + logbuf_lock_irqsave(flags); + printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ -- cgit From a338f84dc196f44b63ba0863d2f34fd9b1613572 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 16:08:16 +0200 Subject: printk: Create helper function to queue deferred console handling It is just a preparation step. The patch does not change the existing behavior. Link: http://lkml.kernel.org/r/20180627140817.27764-3-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a844c611b17c..1d1513215c22 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2887,16 +2887,20 @@ void wake_up_klogd(void) preempt_enable(); } -int vprintk_deferred(const char *fmt, va_list args) +void defer_console_output(void) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); - preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); +} + +int vprintk_deferred(const char *fmt, va_list args) +{ + int r; + + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + defer_console_output(); return r; } -- cgit From 03fc7f9c99c1e7ae2925d459e8487f1a6f199f79 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 16:20:28 +0200 Subject: printk/nmi: Prevent deadlock when accessing the main log buffer in NMI The commit 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI when logbuf_lock is available") brought back the possible deadlocks in printk() and NMI. The check of logbuf_lock is done only in printk_nmi_enter() to prevent mixed output. But another CPU might take the lock later, enter NMI, and: + Both NMIs might be serialized by yet another lock, for example, the one in nmi_cpu_backtrace(). + The other CPU might get stopped in NMI, see smp_send_stop() in panic(). The only safe solution is to use trylock when storing the message into the main log-buffer. It might cause reordering when some lines go to the main lock buffer directly and others are delayed via the per-CPU buffer. It means that it is not useful in general. This patch replaces the problematic NMI deferred context with NMI direct context. It can be used to mark a code that might produce many messages in NMI and the risk of losing them is more critical than problems with eventual reordering. The context is then used when dumping trace buffers on oops. It was the primary motivation for the original fix. Also the reordering is even smaller issue there because some traces have their own time stamps. Finally, nmi_cpu_backtrace() need not longer be serialized because it will always us the per-CPU buffers again. Fixes: 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI when logbuf_lock is available") Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20180627142028.11259-1-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/printk.h | 4 ++++ kernel/printk/internal.h | 9 ++++++- kernel/printk/printk_safe.c | 58 +++++++++++++++++++++++++++++---------------- kernel/trace/trace.c | 4 +++- lib/nmi_backtrace.c | 3 --- 5 files changed, 52 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 6d7e800affd8..3ede9f46a494 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -148,9 +148,13 @@ void early_printk(const char *s, ...) { } #ifdef CONFIG_PRINTK_NMI extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); +extern void printk_nmi_direct_enter(void); +extern void printk_nmi_direct_exit(void); #else static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } +static inline void printk_nmi_direct_enter(void) { } +static inline void printk_nmi_direct_exit(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2a7d04049af4..0f1898820cba 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -19,11 +19,16 @@ #ifdef CONFIG_PRINTK #define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 #define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; +__printf(5, 0) +int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args); + __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); __printf(1, 0) int vprintk_func(const char *fmt, va_list args); @@ -54,6 +59,8 @@ void __printk_safe_exit(void); local_irq_enable(); \ } while (0) +void defer_console_output(void); + #else __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d7d091309054..a0a74c533e4b 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -308,24 +308,33 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void printk_nmi_enter(void) { - /* - * The size of the extra per-CPU buffer is limited. Use it only when - * the main one is locked. If this CPU is not in the safe context, - * the lock must be taken on another CPU and we could wait for it. - */ - if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && - raw_spin_is_locked(&logbuf_lock)) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); - } else { - this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); - } + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); } void printk_nmi_exit(void) { - this_cpu_and(printk_context, - ~(PRINTK_NMI_CONTEXT_MASK | - PRINTK_NMI_DEFERRED_CONTEXT_MASK)); + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} + +/* + * Marks a code that might produce many messages in NMI context + * and the risk of losing them is more critical than eventual + * reordering. + * + * It has effect only when called in NMI context. Then printk() + * will try to store the messages into the main logbuf directly + * and use the per-CPU buffers only as a fallback when the lock + * is not available. + */ +void printk_nmi_direct_enter(void) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); +} + +void printk_nmi_direct_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); } #else @@ -363,6 +372,20 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { + /* + * Try to use the main logbuf even in NMI. But avoid calling console + * drivers that might have their own locks. + */ + if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && + raw_spin_trylock(&logbuf_lock)) { + int len; + + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + raw_spin_unlock(&logbuf_lock); + defer_console_output(); + return len; + } + /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); @@ -371,13 +394,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) return vprintk_safe(fmt, args); - /* - * Use the main logbuf when logbuf_lock is available in NMI. - * But avoid calling console drivers that might have their own locks. - */ - if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) - return vprintk_deferred(fmt, args); - /* No obstacles. */ return vprintk_default(fmt, args); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bcd93031d042..f106ad12f72f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8265,6 +8265,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tracing_off(); local_irq_save(flags); + printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -8344,7 +8345,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); + atomic_dec(&dump_running); + printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 61a6b5aab07e..15ca78e1c7d4 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -87,11 +87,9 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, bool nmi_cpu_backtrace(struct pt_regs *regs) { - static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - arch_spin_lock(&lock); if (regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); @@ -102,7 +100,6 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) else dump_stack(); } - arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit From e1a413245a564683697a3d02ec197b72cf009b89 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2018 09:56:08 +0800 Subject: Blktrace: bail out early if block debugfs is not configured Since @blk_debugfs_root couldn't be configured dynamically, we can save a few memory allocation if it's not there. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae283..b951aa1fac61 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; + if (!blk_debugfs_root) + return -ENOENT; + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - if (!blk_debugfs_root) - goto err; - dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); -- cgit From 5b5ccbc2b041f98f26b984e013d303b7f9e6fb8e Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Jul 2018 16:45:35 +0100 Subject: Revert "tick: Prefer a lower rating device only if it's CPU local device" This reverts commit 1332a90558013ae4242e3dd7934bdcdeafb06c0d. The original issue was not because of incorrect checking of cpumask for both new and old tick device. It was incorrectly analysed was due to the misunderstanding of the comment and misinterpretation of the return value from tick_check_preferred. The main issue is with the clockevent driver that sets the cpumask to cpu_all_mask instead of cpu_possible_mask. Signed-off-by: Sudeep Holla Signed-off-by: Thomas Gleixner Tested-by: Kevin Hilman Tested-by: Martin Blumenstingl Cc: linux-arm-kernel@lists.infradead.org Cc: Marc Zyngier Link: https://lkml.kernel.org/r/1531151136-18297-1-git-send-email-sudeep.holla@arm.com --- kernel/time/tick-common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7005dd21ec1..14de3727b18e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, */ return !curdev || newdev->rating > curdev->rating || - (!cpumask_equal(curdev->cpumask, newdev->cpumask) && - !tick_check_percpu(curdev, newdev, smp_processor_id())); + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* -- cgit From e96d71359e9bbea846a2111e4469a03a055dfa6f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:50 -0400 Subject: rseq: Use __u64 for rseq_cs fields, validate user inputs Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather that ignoring the 32 upper bits on 32-bit kernels. This ensures we have a consistent behavior for a 32-bit binary executed on 32-bit kernels and in compat mode on 64-bit kernels. Validating the value of abort_ip field to be below TASK_SIZE ensures the kernel don't return to an invalid address when returning to userspace after an abort. I don't fully trust each architecture code to consistently deal with invalid return addresses. Validating the value of the start_ip and post_commit_offset fields prevents overflow on arithmetic performed on those values, used to check whether abort_ip is within the rseq critical section. If validation fails, the process is killed with a segmentation fault. When the signature encountered before abort_ip does not match the expected signature, return -EINVAL rather than -EPERM to be consistent with other input validation return codes from rseq_get_rseq_cs(). Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-2-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 6 +++--- kernel/rseq.c | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index d620fa43756c..519ad6e176d1 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -52,10 +52,10 @@ struct rseq_cs { __u32 version; /* enum rseq_cs_flags */ __u32 flags; - LINUX_FIELD_u32_u64(start_ip); + __u64 start_ip; /* Offset from start_ip. */ - LINUX_FIELD_u32_u64(post_commit_offset); - LINUX_FIELD_u32_u64(abort_ip); + __u64 post_commit_offset; + __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); /* diff --git a/kernel/rseq.c b/kernel/rseq.c index 22b6acf1ad63..16b38c5342f9 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -130,14 +130,20 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) urseq_cs = (struct rseq_cs __user *)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; - if (rseq_cs->version > 0) - return -EINVAL; + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; - usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; @@ -146,7 +152,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); - return -EPERM; + return -EINVAL; } return 0; } -- cgit From 8f28177014925f968baf45fc833c25848faf8c1c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:51 -0400 Subject: rseq: Use get_user/put_user rather than __get_user/__put_user __get_user()/__put_user() is used to read values for address ranges that were already checked with access_ok() on rseq registration. It has been recognized that __get_user/__put_user are optimizing the wrong thing. Replace them by get_user/put_user across rseq instead. If those end up showing up in benchmarks, the proper approach would be to use user_access_begin() / unsafe_{get,put}_user() / user_access_end() anyway. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/20180709195155.7654-3-mathieu.desnoyers@efficios.com --- kernel/rseq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 16b38c5342f9..2c8463acb50d 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t) { u32 cpu_id = raw_smp_processor_id(); - if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + if (put_user(cpu_id, &t->rseq->cpu_id_start)) return -EFAULT; - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; trace_rseq_update(t); return 0; @@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) /* * Reset cpu_id_start to its initial state (0). */ - if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; return 0; } @@ -120,7 +120,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; - ret = __get_user(ptr, &t->rseq->rseq_cs); + ret = get_user(ptr, &t->rseq->rseq_cs); if (ret) return ret; if (!ptr) { @@ -163,7 +163,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) int ret; /* Get thread flags. */ - ret = __get_user(flags, &t->rseq->flags); + ret = get_user(flags, &t->rseq->flags); if (ret) return ret; @@ -203,7 +203,7 @@ static int clear_rseq_cs(struct task_struct *t) * * Set rseq_cs to NULL with single-copy atomicity. */ - return __put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &t->rseq->rseq_cs); } /* -- cgit From 0fb9a1abc8c97f858997e962694eb36b4517144e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:52 -0400 Subject: rseq: uapi: Update uapi comments Update rseq uapi header comments to reflect that user-space need to do thread-local loads/stores from/to the struct rseq fields. As a consequence of this added requirement, the kernel does not need to perform loads/stores with single-copy atomicity. Update the comment associated to the "flags" fields to describe more accurately that it's only useful to facilitate single-stepping through rseq critical sections with debuggers. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-4-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 69 ++++++++++++++++++++++++----------------------- kernel/rseq.c | 2 +- 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 519ad6e176d1..bf4188c13bec 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -67,28 +67,30 @@ struct rseq_cs { struct rseq { /* * Restartable sequences cpu_id_start field. Updated by the - * kernel, and read by user-space with single-copy atomicity - * semantics. Aligned on 32-bit. Always contains a value in the - * range of possible CPUs, although the value may not be the - * actual current CPU (e.g. if rseq is not initialized). This - * CPU number value should always be compared against the value - * of the cpu_id field before performing a rseq commit or - * returning a value read from a data structure indexed using - * the cpu_id_start value. + * kernel. Read by user-space with single-copy atomicity + * semantics. This field should only be read by the thread which + * registered this data structure. Aligned on 32-bit. Always + * contains a value in the range of possible CPUs, although the + * value may not be the actual current CPU (e.g. if rseq is not + * initialized). This CPU number value should always be compared + * against the value of the cpu_id field before performing a rseq + * commit or returning a value read from a data structure indexed + * using the cpu_id_start value. */ __u32 cpu_id_start; /* - * Restartable sequences cpu_id field. Updated by the kernel, - * and read by user-space with single-copy atomicity semantics. - * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and - * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the - * former means "rseq uninitialized", and latter means "rseq - * initialization failed". This value is meant to be read within - * rseq critical sections and compared with the cpu_id_start - * value previously read, before performing the commit instruction, - * or read and compared with the cpu_id_start value before returning - * a value loaded from a data structure indexed using the - * cpu_id_start value. + * Restartable sequences cpu_id field. Updated by the kernel. + * Read by user-space with single-copy atomicity semantics. This + * field should only be read by the thread which registered this + * data structure. Aligned on 32-bit. Values + * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED + * have a special semantic: the former means "rseq uninitialized", + * and latter means "rseq initialization failed". This value is + * meant to be read within rseq critical sections and compared + * with the cpu_id_start value previously read, before performing + * the commit instruction, or read and compared with the + * cpu_id_start value before returning a value loaded from a data + * structure indexed using the cpu_id_start value. */ __u32 cpu_id; /* @@ -105,27 +107,28 @@ struct rseq { * targeted by the rseq_cs. Also needs to be set to NULL by user-space * before reclaiming memory that contains the targeted struct rseq_cs. * - * Read and set by the kernel with single-copy atomicity semantics. - * Set by user-space with single-copy atomicity semantics. Aligned - * on 64-bit. + * Read and set by the kernel. Set by user-space with single-copy + * atomicity semantics. This field should only be updated by the + * thread which registered this data structure. Aligned on 64-bit. */ LINUX_FIELD_u32_u64(rseq_cs); /* - * - RSEQ_DISABLE flag: + * Restartable sequences flags field. + * + * This field should only be updated by the thread which + * registered this data structure. Read by the kernel. + * Mainly used for single-stepping through rseq critical sections + * with debuggers. * - * Fallback fast-track flag for single-stepping. - * Set by user-space if lack of progress is detected. - * Cleared by user-space after rseq finish. - * Read by the kernel. * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart and event - * counter increment on preemption for this thread. + * Inhibit instruction sequence block restart on preemption + * for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart and event - * counter increment on signal delivery for this thread. + * Inhibit instruction sequence block restart on signal + * delivery for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart and event - * counter increment on migration for this thread. + * Inhibit instruction sequence block restart on migration for + * this thread. */ __u32 flags; } __attribute__((aligned(4 * sizeof(__u64)))); diff --git a/kernel/rseq.c b/kernel/rseq.c index 2c8463acb50d..2a7748675be7 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -201,7 +201,7 @@ static int clear_rseq_cs(struct task_struct *t) * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * - * Set rseq_cs to NULL with single-copy atomicity. + * Set rseq_cs to NULL. */ return put_user(0UL, &t->rseq->rseq_cs); } -- cgit From ec9c82e03a744e5698bd95eab872855861a821fa Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:53 -0400 Subject: rseq: uapi: Declare rseq_cs field as union, update includes Declaring the rseq_cs field as a union between __u64 and two __u32 allows both 32-bit and 64-bit kernels to read the full __u64, and therefore validate that a 32-bit user-space cleared the upper 32 bits, thus ensuring a consistent behavior between native 32-bit kernels and 32-bit compat tasks on 64-bit kernels. Check that the rseq_cs value read is < TASK_SIZE. The asm/byteorder.h header needs to be included by rseq.h, now that it is not using linux/types_32_64.h anymore. Considering that only __32 and __u64 types are declared in linux/rseq.h, the linux/types.h header should always be included for both kernel and user-space code: including stdint.h is just for u64 and u32, which are not used in this header at all. Use copy_from_user()/clear_user() to interact with a 64-bit field, because arm32 does not implement 64-bit __get_user, and ppc32 does not 64-bit get_user. Considering that the rseq_cs pointer does not need to be loaded/stored with single-copy atomicity from the kernel anymore, we can simply use copy_from_user()/clear_user(). Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-5-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 27 +++++++++++++++++++-------- kernel/rseq.c | 15 +++++++++------ tools/testing/selftests/rseq/rseq.h | 11 ++++++++++- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index bf4188c13bec..9a402fdb60e9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -10,13 +10,8 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#ifdef __KERNEL__ -# include -#else -# include -#endif - -#include +#include +#include enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, @@ -111,7 +106,23 @@ struct rseq { * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. */ - LINUX_FIELD_u32_u64(rseq_cs); + union { + __u64 ptr64; +#ifdef __LP64__ + __u64 ptr; +#else + struct { +#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) + __u32 padding; /* Initialized to zero. */ + __u32 ptr32; +#else /* LITTLE */ + __u32 ptr32; + __u32 padding; /* Initialized to zero. */ +#endif /* ENDIAN */ + } ptr; +#endif + } rseq_cs; + /* * Restartable sequences flags field. * diff --git a/kernel/rseq.c b/kernel/rseq.c index 2a7748675be7..c6242d8594dc 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -115,19 +115,20 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; - unsigned long ptr; + u64 ptr; u32 __user *usig; u32 sig; int ret; - ret = get_user(ptr, &t->rseq->rseq_cs); - if (ret) - return ret; + if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + return -EFAULT; if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } - urseq_cs = (struct rseq_cs __user *)ptr; + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; @@ -203,7 +204,9 @@ static int clear_rseq_cs(struct task_struct *t) * * Set rseq_cs to NULL. */ - return put_user(0UL, &t->rseq->rseq_cs); + if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + return -EFAULT; + return 0; } /* diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index a4684112676c..f2073cfa4448 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -133,6 +133,15 @@ static inline uint32_t rseq_current_cpu(void) return cpu; } +static inline void rseq_clear_rseq_cs(void) +{ +#ifdef __LP64__ + __rseq_abi.rseq_cs.ptr = 0; +#else + __rseq_abi.rseq_cs.ptr.ptr32 = 0; +#endif +} + /* * rseq_prepare_unload() should be invoked by each thread using rseq_finish*() * at least once between their last rseq_finish*() and library unload of the @@ -143,7 +152,7 @@ static inline uint32_t rseq_current_cpu(void) */ static inline void rseq_prepare_unload(void) { - __rseq_abi.rseq_cs = 0; + rseq_clear_rseq_cs(); } #endif /* RSEQ_H_ */ -- cgit From ffaa619af1b062d5a37871df6363aa52356007a7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 10 Jul 2018 10:44:53 +0200 Subject: printk: Fix warning about unused suppress_message_printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit suppress_message_printing() is not longer called in console_unlock(). Therefore it is not longer needed with disabled CONFIG_PRINTK. This fixes the warning: kernel/printk/printk.c:2033:13: warning: ‘suppress_message_printing’ defined but not used [-Wunused-function] static bool suppress_message_printing(int level) { return false; } Reported-by: Stephen Rothwell Reported-by: Arnd Bergmann Suggested-by: Maninder Singh Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fcc1992c040a..e2cb0fc18e2d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2021,7 +2021,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ -- cgit From e4f8d81c738db6d3ffdabfb8329aa2feaa310699 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 9 Jul 2018 17:48:54 -0400 Subject: cgroup/tracing: Move taking of spin lock out of trace event handlers It is unwise to take spin locks from the handlers of trace events. Mainly, because they can introduce lockups, because it introduces locks in places that are normally not tested. Worse yet, because trace events are tucked away in the include/trace/events/ directory, locks that are taken there are forgotten about. As a general rule, I tell people never to take any locks in a trace event handler. Several cgroup trace event handlers call cgroup_path() which eventually takes the kernfs_rename_lock spinlock. This injects the spinlock in the code without people realizing it. It also can cause issues for the PREEMPT_RT patch, as the spinlock becomes a mutex, and the trace event handlers are called with preemption disabled. By moving the calculation of the cgroup_path() out of the trace event handlers and into a macro (surrounded by a trace_cgroup_##type##_enabled()), then we could place the cgroup_path into a string, and pass that to the trace event. Not only does this remove the taking of the spinlock out of the trace event handler, but it also means that the cgroup_path() only needs to be called once (it is currently called twice, once to get the length to reserver the buffer for, and once again to get the path itself. Now it only needs to be done once. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Tejun Heo --- include/trace/events/cgroup.h | 47 ++++++++++++++++++++--------------------- kernel/cgroup/cgroup-internal.h | 26 +++++++++++++++++++++++ kernel/cgroup/cgroup-v1.c | 4 ++-- kernel/cgroup/cgroup.c | 12 ++++++----- 4 files changed, 58 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index d74722c2ac8b..a401ff5e7847 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -53,24 +53,22 @@ DEFINE_EVENT(cgroup_root, cgroup_remount, DECLARE_EVENT_CLASS(cgroup, - TP_PROTO(struct cgroup *cgrp), + TP_PROTO(struct cgroup *cgrp, const char *path), - TP_ARGS(cgrp), + TP_ARGS(cgrp, path), TP_STRUCT__entry( __field( int, root ) __field( int, id ) __field( int, level ) - __dynamic_array(char, path, - cgroup_path(cgrp, NULL, 0) + 1) + __string( path, path ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgrp->id; __entry->level = cgrp->level; - cgroup_path(cgrp, __get_dynamic_array(path), - __get_dynamic_array_len(path)); + __assign_str(path, path); ), TP_printk("root=%d id=%d level=%d path=%s", @@ -79,45 +77,45 @@ DECLARE_EVENT_CLASS(cgroup, DEFINE_EVENT(cgroup, cgroup_mkdir, - TP_PROTO(struct cgroup *cgroup), + TP_PROTO(struct cgroup *cgrp, const char *path), - TP_ARGS(cgroup) + TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rmdir, - TP_PROTO(struct cgroup *cgroup), + TP_PROTO(struct cgroup *cgrp, const char *path), - TP_ARGS(cgroup) + TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_release, - TP_PROTO(struct cgroup *cgroup), + TP_PROTO(struct cgroup *cgrp, const char *path), - TP_ARGS(cgroup) + TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rename, - TP_PROTO(struct cgroup *cgroup), + TP_PROTO(struct cgroup *cgrp, const char *path), - TP_ARGS(cgroup) + TP_ARGS(cgrp, path) ); DECLARE_EVENT_CLASS(cgroup_migrate, - TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + TP_PROTO(struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup), - TP_ARGS(dst_cgrp, task, threadgroup), + TP_ARGS(dst_cgrp, path, task, threadgroup), TP_STRUCT__entry( __field( int, dst_root ) __field( int, dst_id ) __field( int, dst_level ) - __dynamic_array(char, dst_path, - cgroup_path(dst_cgrp, NULL, 0) + 1) __field( int, pid ) + __string( dst_path, path ) __string( comm, task->comm ) ), @@ -125,8 +123,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, __entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_id = dst_cgrp->id; __entry->dst_level = dst_cgrp->level; - cgroup_path(dst_cgrp, __get_dynamic_array(dst_path), - __get_dynamic_array_len(dst_path)); + __assign_str(dst_path, path); __entry->pid = task->pid; __assign_str(comm, task->comm); ), @@ -138,16 +135,18 @@ DECLARE_EVENT_CLASS(cgroup_migrate, DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, - TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + TP_PROTO(struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup), - TP_ARGS(dst_cgrp, task, threadgroup) + TP_ARGS(dst_cgrp, path, task, threadgroup) ); DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, - TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + TP_PROTO(struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup), - TP_ARGS(dst_cgrp, task, threadgroup) + TP_ARGS(dst_cgrp, path, task, threadgroup) ); #endif /* _TRACE_CGROUP_H */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 77ff1cd6a252..75568fcf2180 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -8,6 +8,32 @@ #include #include +#define TRACE_CGROUP_PATH_LEN 1024 +extern spinlock_t trace_cgroup_path_lock; +extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; + +/* + * cgroup_path() takes a spin lock. It is good practice not to take + * spin locks within trace point handlers, as they are mostly hidden + * from normal view. As cgroup_path() can take the kernfs_rename_lock + * spin lock, it is best to not call that function from the trace event + * handler. + * + * Note: trace_cgroup_##type##_enabled() is a static branch that will only + * be set when the trace event is enabled. + */ +#define TRACE_CGROUP_PATH(type, cgrp, ...) \ + do { \ + if (trace_cgroup_##type##_enabled()) { \ + spin_lock(&trace_cgroup_path_lock); \ + cgroup_path(cgrp, trace_cgroup_path, \ + TRACE_CGROUP_PATH_LEN); \ + trace_cgroup_##type(cgrp, trace_cgroup_path, \ + ##__VA_ARGS__); \ + spin_unlock(&trace_cgroup_path_lock); \ + } \ + } while (0) + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8b4f0768efd6..51063e7a93c2 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -135,7 +135,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) - trace_cgroup_transfer_tasks(to, task, false); + TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); @@ -865,7 +865,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) - trace_cgroup_rename(cgrp); + TRACE_CGROUP_PATH(rename, cgrp); mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 077370bf8964..4a439de621bd 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -83,6 +83,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif +DEFINE_SPINLOCK(trace_cgroup_path_lock); +char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; + /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. @@ -2638,7 +2641,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, cgroup_migrate_finish(&mgctx); if (!ret) - trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } @@ -4634,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup *tcgrp; /* cgroup release path */ - trace_cgroup_release(cgrp); + TRACE_CGROUP_PATH(release, cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); @@ -4977,7 +4980,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (ret) goto out_destroy; - trace_cgroup_mkdir(cgrp); + TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(kn); @@ -5165,9 +5168,8 @@ int cgroup_rmdir(struct kernfs_node *kn) return 0; ret = cgroup_destroy_locked(cgrp); - if (!ret) - trace_cgroup_rmdir(cgrp); + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; -- cgit From 0fc8c3581dd42bc8f530314ca86db2d861485731 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 9 Jul 2018 16:19:06 +0200 Subject: tracing/kprobe: Release kprobe print_fmt properly We don't release tk->tp.call.print_fmt when destroying local uprobe. Also there's missing print_fmt kfree in create_local_trace_kprobe error path. Link: http://lkml.kernel.org/r/20180709141906.2390-1-jolsa@kernel.org Cc: stable@vger.kernel.org Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Acked-by: Song Liu Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..21f718472942 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1480,8 +1480,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, } ret = __register_trace_kprobe(tk); - if (ret < 0) + if (ret < 0) { + kfree(tk->tp.call.print_fmt); goto error; + } return &tk->tp.call; error: @@ -1501,6 +1503,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) } __unregister_trace_kprobe(tk); + + kfree(tk->tp.call.print_fmt); free_trace_kprobe(tk); } #endif /* CONFIG_PERF_EVENTS */ -- cgit From b65f370d0671c4980ffe866c41e327b88893245c Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Tue, 10 Jul 2018 14:33:07 -0700 Subject: bpf: btf: Fix bitfield extraction for big endian When extracting bitfield from a number, btf_int_bits_seq_show() builds a mask and accesses least significant byte of the number in a way specific to little-endian. This patch fixes that by checking endianness of the machine and then shifting left and right the unneeded bits. Thanks to Martin Lau for the help in navigating potential pitfalls when dealing with endianess and for the final solution. Fixes: b00b8daec828 ("bpf: btf: Add pretty print capability for data with BTF type info") Signed-off-by: Okash Khawaja Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2d49d18b793a..e016ac3afa24 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -991,16 +991,13 @@ static void btf_int_bits_seq_show(const struct btf *btf, void *data, u8 bits_offset, struct seq_file *m) { + u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); u16 nr_bits = BTF_INT_BITS(int_data); u16 total_bits_offset; u16 nr_copy_bytes; u16 nr_copy_bits; - u8 nr_upper_bits; - union { - u64 u64_num; - u8 u8_nums[8]; - } print_num; + u64 print_num; total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1008,21 +1005,20 @@ static void btf_int_bits_seq_show(const struct btf *btf, nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num.u64_num = 0; - memcpy(&print_num.u64_num, data, nr_copy_bytes); + print_num = 0; + memcpy(&print_num, data, nr_copy_bytes); - /* Ditch the higher order bits */ - nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); - if (nr_upper_bits) { - /* We need to mask out some bits of the upper byte. */ - u8 mask = (1 << nr_upper_bits) - 1; +#ifdef __BIG_ENDIAN_BITFIELD + left_shift_bits = bits_offset; +#else + left_shift_bits = BITS_PER_U64 - nr_copy_bits; +#endif + right_shift_bits = BITS_PER_U64 - nr_bits; - print_num.u8_nums[nr_copy_bytes - 1] &= mask; - } - - print_num.u64_num >>= bits_offset; + print_num <<= left_shift_bits; + print_num >>= right_shift_bits; - seq_printf(m, "0x%llx", print_num.u64_num); + seq_printf(m, "0x%llx", print_num); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, -- cgit From fcc63543650150629c8a873cbef3578770acecd9 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 15 Jun 2018 12:06:31 -0700 Subject: rcu: Make expedited GPs handle CPU 0 being offline Currently, the parallelized initialization of expedited grace periods uses the workqueue associated with each rcu_node structure's ->grplo field. This works fine unless that CPU is offline. This commit therefore uses the CPU corresponding to the lowest-numbered online CPU, or just queues the work on WORK_CPU_UNBOUND if there are no online CPUs corresponding to this rcu_node structure. Note that this patch uses cpu_is_offline() instead of the usual approach of checking bits in the rcu_node structure's ->qsmaskinitnext field. This is safe because preemption is disabled across both the cpu_is_offline() check and the call to queue_work_on(). Signed-off-by: Boqun Feng [ paulmck: Disable preemption to close offline race window. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply Peter Zijlstra feedback on CPU selection. ] Tested-by: Aneesh Kumar K.V --- kernel/rcu/tree_exp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index c6385ee1af65..b3df3b770afb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -472,6 +472,7 @@ retry_ipi: static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, smp_call_func_t func) { + int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); @@ -493,7 +494,13 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_disable(); + cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi)) + cpu = WORK_CPU_UNBOUND; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_enable(); rnp->exp_need_flush = true; } -- cgit From c7a897843224a92209f306c984975b704969b89d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Jul 2018 21:44:28 +0200 Subject: bpf: don't leave partial mangled prog in jit_subprogs error path syzkaller managed to trigger the following bug through fault injection: [...] [ 141.043668] verifier bug. No program starts at insn 3 [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [ 141.047355] CPU: 3 PID: 4072 Comm: a.out Not tainted 4.18.0-rc4+ #51 [ 141.048446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS 1.10.2-1 04/01/2014 [ 141.049877] Call Trace: [ 141.050324] __dump_stack lib/dump_stack.c:77 [inline] [ 141.050324] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 [ 141.050950] ? dump_stack_print_info.cold.2+0x52/0x52 lib/dump_stack.c:60 [ 141.051837] panic+0x238/0x4e7 kernel/panic.c:184 [ 141.052386] ? add_taint.cold.5+0x16/0x16 kernel/panic.c:385 [ 141.053101] ? __warn.cold.8+0x148/0x1ba kernel/panic.c:537 [ 141.053814] ? __warn.cold.8+0x117/0x1ba kernel/panic.c:530 [ 141.054506] ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.054506] ? fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.054506] ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [ 141.055163] __warn.cold.8+0x163/0x1ba kernel/panic.c:538 [ 141.055820] ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.055820] ? fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.055820] ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [...] What happens in jit_subprogs() is that kcalloc() for the subprog func buffer is failing with NULL where we then bail out. Latter is a plain return -ENOMEM, and this is definitely not okay since earlier in the loop we are walking all subprogs and temporarily rewrite insn->off to remember the subprog id as well as insn->imm to temporarily point the call to __bpf_call_base + 1 for the initial JIT pass. Thus, bailing out in such state and handing this over to the interpreter is troublesome since later/subsequent e.g. find_subprog() lookups are based on wrong insn->imm. Therefore, once we hit this point, we need to jump to out_free path where we undo all changes from earlier loop, so that interpreter can work on unmodified insn->{off,imm}. Another point is that should find_subprog() fail in jit_subprogs() due to a verifier bug, then we also should not simply defer the program to the interpreter since also here we did partial modifications. Instead we should just bail out entirely and return an error to the user who is trying to load the program. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: syzbot+7d427828b2ea6e592804@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..63aaac52a265 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; + /* Upon error here we cannot fall back to interpreter but + * need a hard reject of the program. Thus -EFAULT is + * propagated in any case. + */ subprog = find_subprog(env, i + insn->imm + 1); if (subprog < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", @@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) - return -ENOMEM; + goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; @@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -EFAULT; + err = -ENOTSUPP; goto out_free; } cond_resched(); @@ -5552,6 +5556,7 @@ out_free: if (func[i]) bpf_jit_free(func[i]); kfree(func); +out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) err = jit_subprogs(env); if (err == 0) return 0; + if (err == -EFAULT) + return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { -- cgit From 26d950a9451336a6b5abc1c8ca6c21df58e8d89f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Apr 2018 20:44:11 -0700 Subject: rcu: Diagnostics for grace-period startup hangs This commit causes a splat if RCU is idle and a request for a new grace period is ignored for more than one second. This splat normally indicates that some code path asked for a new grace period, but failed to wake up the RCU grace-period kthread. Signed-off-by: Paul E. McKenney [ paulmck: Fix bug located by Dan Carpenter and his static checker. ] [ paulmck: Fix self-deadlock bug located 0day test robot. ] [ paulmck: Disable unless CONFIG_PROVE_RCU=y. ] --- kernel/rcu/tree.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 ++ 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b1fffa21b9e4..6ce82c009195 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1681,6 +1681,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, } trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); goto unlock_out; @@ -2113,6 +2114,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Advance CBs to reduce false positives below. */ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } @@ -2744,6 +2746,65 @@ static void force_quiescent_state(struct rcu_state *rsp) rcu_gp_kthread_wake(rsp); } +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void +rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(rsp); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || + rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp))) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + time_before(j, rsp->gp_req_activity + HZ) || + time_before(j, rsp->gp_activity + HZ) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + pr_alert("%s: g%lu %d%d%d%d gar:%lu ga:%lu f%#x %s->state:%#lx\n", + __func__, READ_ONCE(rsp->gpnum), + need_future_gp_element(rcu_get_root(rsp), 0), + need_future_gp_element(rcu_get_root(rsp), 1), + need_future_gp_element(rcu_get_root(rsp), 2), + need_future_gp_element(rcu_get_root(rsp), 3), + j - rsp->gp_req_activity, j - rsp->gp_activity, + rsp->gp_flags, rsp->name, + rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to @@ -2755,7 +2816,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; WARN_ON_ONCE(!rdp->beenonline); @@ -2769,7 +2830,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { local_irq_restore(flags); } else { - rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2778,6 +2838,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) } } + rcu_check_gp_start_stall(rsp, rnp, rdp); + /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7365ac53fdd9..3c1942174c56 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -374,6 +374,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ + unsigned long gp_req_activity; /* Time of last GP request */ + /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit From 18390aeae7010ee56b132dcfb663ba362a099d99 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Apr 2018 15:06:05 -0700 Subject: rcu: Make rcu_gp_cleanup() write only once to ->gp_flags At the end of rcu_gp_cleanup(), if another grace period is needed, but not via rcu_accelerate_cbs(), the ->gp_flags field is written twice, once when making the new grace-period request, and once when clearing all other types of requests. This commit therefore adds an else-clause to avoid this double write. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ce82c009195..a9a4a260ea7d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2117,8 +2117,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_req_activity = jiffies; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + } else { + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); } - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } -- cgit From de30ad512a668b56e7ad7a5a7c379d7c5d138a94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2018 11:52:09 -0700 Subject: rcu: Introduce grace-period sequence numbers This commit adds grace-period sequence numbers (->gp_seq) to the rcu_state, rcu_node, and rcu_data structures, and updates them. It also checks for consistency between rsp->gpnum and rsp->gp_seq. These ->gp_seq counters will eventually replace the existing ->gpnum and ->completed counters, allowing a single memory access to determine whether or not a grace period is in progress and if so, which one. This in turn will enable changes that will reduce ->lock contention on the leaf rcu_node structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 +++++++++++++++- kernel/rcu/tree.h | 3 +++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a9a4a260ea7d..467cd8e5c6ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,6 +97,7 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ + .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -1849,6 +1850,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); } + if (rdp->gp_seq != rnp->gp_seq) + rdp->gp_seq = rnp->gp_seq; return ret; } @@ -1910,7 +1913,10 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); /* Record GP times before starting GP, hence smp_store_release(). */ + WARN_ON_ONCE(rsp->gpnum << RCU_SEQ_CTR_SHIFT != rsp->gp_seq); smp_store_release(&rsp->gpnum, rsp->gpnum + 1); + smp_mb(); /* Pairs with barriers in stall-warning code. */ + rcu_seq_start(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1984,6 +1990,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rnp->gpnum, rsp->gpnum); if (WARN_ON_ONCE(rnp->completed != rsp->completed)) WRITE_ONCE(rnp->completed, rsp->completed); + WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -2050,6 +2057,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; + unsigned long new_gp_seq; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2079,12 +2087,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * all of the rcu_node structures before the beginning of the next * grace period is recorded in any of the rcu_node structures. */ + new_gp_seq = rsp->gp_seq; + rcu_seq_end(&new_gp_seq); rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); + WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; @@ -2098,10 +2109,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); + rcu_seq_end(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ @@ -3612,6 +3624,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; + rdp->gp_seq = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; @@ -3991,6 +4004,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) &rcu_fqs_class[i], fqs[i]); rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; + rnp->gp_seq = rsp->gp_seq; rnp->completedqs = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3c1942174c56..50a28d1cf5a1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -87,6 +87,7 @@ struct rcu_node { unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -211,6 +212,7 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ @@ -343,6 +345,7 @@ struct rcu_state { /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ + unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ -- cgit From dee4f42298bba030e84035aca5c114f9fee8fa6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2018 15:30:28 -0700 Subject: rcu: Move rcu_gp_slow() to ->gp_seq This commit moves rcu_gp_slow() to ->gp_seq. This function only uses the grace-period number to modulate delay, so rcu_seq_ctr(rsp->gp_seq) gets the same effect, at least in cases where the delay is to happen more than four times per wrap of an unsigned long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 467cd8e5c6ff..3c3af7e2758f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1879,7 +1879,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_gp_slow(struct rcu_state *rsp, int delay) { if (delay > 0 && - !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + !(rcu_seq_ctr(rsp->gp_seq) % + (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_uninterruptible(delay); } -- cgit From 17ef2fe97c8c8e754e4a702c42f8e5b0ffadf4dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 11:39:34 -0700 Subject: rcu: Make rcutorture's batches-completed API use ->gp_seq The rcutorture test invokes rcu_batches_started(), rcu_batches_completed(), rcu_batches_started_bh(), rcu_batches_completed_bh(), rcu_batches_started_sched(), and rcu_batches_completed_sched() to do grace-period consistency checks, and rcuperf uses the _completed variants for statistics. These functions use ->gpnum and ->completed. This commit therefore replaces them with rcu_get_gp_seq(), rcu_bh_get_gp_seq(), and rcu_sched_get_gp_seq(), adjusting rcutorture and rcuperf to make use of them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 18 ++++++----------- kernel/rcu/rcuperf.c | 26 +++++++++---------------- kernel/rcu/rcutorture.c | 50 +++++++++++++++++------------------------------- kernel/rcu/tree.c | 51 ++++++++++++------------------------------------- 4 files changed, 45 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index db0870acfdff..f0907f9f6cd0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -463,12 +463,9 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU -static inline unsigned long rcu_batches_started(void) { return 0; } -static inline unsigned long rcu_batches_started_bh(void) { return 0; } -static inline unsigned long rcu_batches_started_sched(void) { return 0; } -static inline unsigned long rcu_batches_completed(void) { return 0; } -static inline unsigned long rcu_batches_completed_bh(void) { return 0; } -static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_bh_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_sched_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } static inline unsigned long @@ -480,12 +477,9 @@ static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -unsigned long rcu_batches_started(void); -unsigned long rcu_batches_started_bh(void); -unsigned long rcu_batches_started_sched(void); -unsigned long rcu_batches_completed(void); -unsigned long rcu_batches_completed_bh(void); -unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_get_gp_seq(void); +unsigned long rcu_bh_get_gp_seq(void); +unsigned long rcu_sched_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index df29119b2013..2b5a613afcf3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -138,8 +138,7 @@ struct rcu_perf_ops { void (*cleanup)(void); int (*readlock)(void); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -179,8 +178,7 @@ static struct rcu_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -209,8 +207,7 @@ static struct rcu_perf_ops rcu_bh_ops = { .init = rcu_sync_perf_init, .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, @@ -266,8 +263,7 @@ static struct rcu_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -295,8 +291,7 @@ static struct rcu_perf_ops srcud_ops = { .cleanup = srcu_sync_perf_cleanup, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -325,8 +320,7 @@ static struct rcu_perf_ops sched_ops = { .init = rcu_sync_perf_init, .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -353,8 +347,7 @@ static struct rcu_perf_ops tasks_ops = { .init = rcu_sync_perf_init, .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -447,8 +440,7 @@ rcu_perf_writer(void *arg) b_rcu_perf_writer_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = - cur_ops->completed(); + b_rcu_perf_writer_started = cur_ops->get_gp_seq(); } } @@ -505,7 +497,7 @@ retry: cur_ops->exp_completed() / 2; } else { b_rcu_perf_writer_finished = - cur_ops->completed(); + cur_ops->get_gp_seq(); } if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5604bfac8df4..1f66597c7783 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -264,8 +264,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -305,10 +304,10 @@ static void rcu_read_delay(struct torture_random_state *rrsp) * force_quiescent_state. */ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); mdelay(longdelay_ms); - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); do_trace_rcu_torture_read(cur_ops->name, NULL, ts, started, completed); } @@ -400,8 +399,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -442,8 +440,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -486,8 +483,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, @@ -575,8 +571,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -613,8 +608,7 @@ static struct rcu_torture_ops srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -651,8 +645,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -690,8 +683,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = tasks_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, @@ -1104,10 +1096,7 @@ static void rcu_torture_timer(struct timer_list *unused) unsigned long long ts; idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1131,7 +1120,7 @@ static void rcu_torture_timer(struct timer_list *unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); @@ -1139,8 +1128,8 @@ static void rcu_torture_timer(struct timer_list *unused) } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; - if (cur_ops->started) - completed++; + if (completed > ULONG_MAX >> 1) + completed = 0; /* Not all gp_seq have full range. */ if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1187,10 +1176,7 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1212,7 +1198,7 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); @@ -1220,8 +1206,8 @@ rcu_torture_reader(void *arg) } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; - if (cur_ops->started) - completed++; + if (completed > ULONG_MAX >> 1) + completed = 0; /* Not all gp_seq have full range. */ if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c3af7e2758f..547112bec26a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -530,58 +530,31 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU batches started thus far for debug & stats. + * Return the number of RCU GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started(void) +unsigned long rcu_get_gp_seq(void) { - return rcu_state_p->gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_state_p->gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started); +EXPORT_SYMBOL_GPL(rcu_get_gp_seq); /* - * Return the number of RCU-sched batches started thus far for debug & stats. + * Return the number of RCU-sched GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_sched(void) +unsigned long rcu_sched_get_gp_seq(void) { - return rcu_sched_state.gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_sched_state.gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started_sched); +EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); /* - * Return the number of RCU BH batches started thus far for debug & stats. + * Return the number of RCU-bh GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_bh(void) +unsigned long rcu_bh_get_gp_seq(void) { - return rcu_bh_state.gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_bh_state.gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started_bh); - -/* - * Return the number of RCU batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_state_p->completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU-sched batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_sched(void) -{ - return rcu_sched_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); - -/* - * Return the number of RCU BH batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_bh(void) -{ - return rcu_bh_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); /* * Return the number of RCU expedited batches completed thus far for -- cgit From 78c5a67f1788f59d991c8e78d674ad3c8542f0ac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 13:32:28 -0700 Subject: rcu: Convert rcu_check_gp_kthread_starvation() to GP sequence number This commit switches rcu_check_gp_kthread_starvation() from printing ->gpnum and ->completed to printing ->gp_seq upon detecting a starving RCU grace-period kthread during an RCU CPU stall warning. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 547112bec26a..002f85357226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1276,9 +1276,9 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, + (long)rcu_seq_current(&rsp->gp_seq), rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0, -- cgit From c9a24e2d0c7d33b141167f5fa13f95cf6d35cb1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 14:54:46 -0700 Subject: rcu: Make quiescent-state reporting use ->gp_seq This commit switches the functions reporting quiescent states from use of ->gpnum to ->gp_seq. In either case, the point is to handle races where a given grace period ends before a quiescent state can be reported. Failing to catch these races would result in too-short grace periods, hence the checking. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 002f85357226..a54587dc13f0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2242,7 +2242,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * must be represented by the same rcu_node structure (which need not be a * leaf rcu_node structure, though it often will be). The gps parameter * is the grace-period snapshot, which means that the quiescent states - * are valid only if rnp->gpnum is equal to gps. That structure's lock + * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. */ static void @@ -2257,7 +2257,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { + if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the @@ -2335,8 +2335,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy, tracking current ->gpnum. */ - gps = rnp->gpnum; + /* Report up the rest of the hierarchy, tracking current ->gp_seq. */ + gps = rnp->gp_seq; mask = rnp->grpmask; raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ @@ -2357,8 +2357,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2383,7 +2383,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); @@ -2688,8 +2688,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) } } if (mask != 0) { - /* Idle/offline CPUs, report (releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* Idle/offline CPUs, report (releases rnp->lock). */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit From e4be81a2ed3a7356a2c22c7571af622ceb57eb2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 15:16:50 -0700 Subject: rcu: Convert conditional grace-period primitives to ->gp_seq This commit converts get_state_synchronize_rcu(), cond_synchronize_rcu(), get_state_synchronize_sched(), and cond_synchronize_sched() from ->gpnum and ->completed to ->gp_seq. Note that this also introduces a full memory barrier in the already-done paths off cond_synchronize_rcu() and cond_synchronize_sched(), as work with LKMM indicates that the earlier smp_load_acquire() were insufficiently strong in some situations where these two functions were called just as the grace period ended. In such cases, these two functions would not gain the benefit of memory ordering at the end of the grace period. Please note that the performance impact is negligible, as you shouldn't be using either function anywhere near a fastpath in any case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a54587dc13f0..fd2f582a6db0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3183,16 +3183,10 @@ unsigned long get_state_synchronize_rcu(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_rcu() - * and cond_synchronize_rcu(). - */ - return smp_load_acquire(&rcu_state_p->gpnum); + return rcu_seq_snap(&rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3212,15 +3206,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_state_p->completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) synchronize_rcu(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); @@ -3235,16 +3224,10 @@ unsigned long get_state_synchronize_sched(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_sched() - * and cond_synchronize_sched(). - */ - return smp_load_acquire(&rcu_sched_state.gpnum); + return rcu_seq_snap(&rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_sched); @@ -3264,15 +3247,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_sched); */ void cond_synchronize_sched(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_sched_state.completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) synchronize_sched(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -- cgit From 67e14c1e39d2d956300b3d6ad00f7708e3285531 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 16:01:46 -0700 Subject: rcu: Move RCU's grace-period-change code to ->gp_seq This commit moves __note_gp_changes(), note_gp_changes(), and __rcu_pending() to ->gp_seq, creating new rcu_seq_completed_gp() and rcu_seq_new_gp() functions for this purpose. Signed-off-by: Paul E. McKenney [ paulmck: Reinstate "cpuend: trace as suggested by Joel Fernandes. ] --- kernel/rcu/rcu.h | 17 +++++++++++++++++ kernel/rcu/tree.c | 39 ++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f0907f9f6cd0..7568a3fd0815 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -116,6 +116,23 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } +/* + * Has a grace period completed since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT(old, new & ~RCU_SEQ_STATE_MASK); +} + +/* + * Has a grace period started since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK, + new); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fd2f582a6db0..bcd659e65dfd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1790,24 +1790,23 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, raw_lockdep_assert_held_rcu_node(rnp); - /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed && - !unlikely(READ_ONCE(rdp->gpwrap))) { - - /* No grace period end, so just accelerate recent callbacks. */ - ret = rcu_accelerate_cbs(rsp, rnp, rdp); - - } else { - - /* Advance callbacks. */ - ret = rcu_advance_cbs(rsp, rnp, rdp); + if (rdp->gp_seq == rnp->gp_seq) + return false; /* Nothing to do. */ + /* Handle the ends of any preceding grace periods first. */ + if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); + } else { + ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ } - if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { + /* Now handle the beginnings of any new-to-this-CPU grace periods. */ + if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1823,8 +1822,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); } - if (rdp->gp_seq != rnp->gp_seq) - rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ return ret; } @@ -1836,8 +1834,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; - if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && - rdp->completed == READ_ONCE(rnp->completed) && + if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); @@ -3286,12 +3283,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; - /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ - return 1; - - /* Has a new RCU grace period started? */ - if (READ_ONCE(rnp->gpnum) != rdp->gpnum || + /* Have RCU grace period completed or started? */ + if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq || unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; -- cgit From a66ae8ae35de60a85d2d89689d8c2d6a4dc15d85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 18:06:08 -0700 Subject: rcu: Convert rcu_gpnum_ovf() to ->gp_seq This commit converts rcu_gpnum_ovf() to use ->gp_seq instead of ->gpnum. Same size unsigned long, so same approach. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcd659e65dfd..de2e2c5d64bb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1089,14 +1089,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* * We are reporting a quiescent state on behalf of some other CPU, so * it is our responsibility to check for and handle potential overflow - * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no * code whatsoever. */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, + rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; -- cgit From e05720b0977bd50707ea6cf296f99e709de3f760 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 18:58:58 -0700 Subject: rcu: Move rcu_implicit_dynticks_qs() to ->gp_seq This commit makes rcu_implicit_dynticks_qs() use ->gp_seq, with the exception of tracing, which will be converted later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index de2e2c5d64bb..b3af3d24286c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1178,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && - READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; -- cgit From 03c8cb765a747c02fd8d3fade1efe9d529ad54bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 19:56:16 -0700 Subject: rcu: Move rcu_try_advance_all_cbs() to ->gp_seq This commit makes rcu_try_advance_all_cbs() use ->gp_seq, with the exception of tracing, which will be converted later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e387ea712758..d893899b72f4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1467,7 +1467,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if ((rdp->completed != rnp->completed || + if ((rcu_seq_completed_gp(rdp->gp_seq, + rcu_seq_current(&rnp->gp_seq)) || unlikely(READ_ONCE(rdp->gpwrap))) && rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); -- cgit From e0da2374c3881cb9a512e2718f9ca655a48de9db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 20:51:36 -0700 Subject: rcu: Move rcu_nocb_gp_get() to ->gp_seq This commit makes rcu_try_advance_all_cbs() use ->gp_seq. It uses rcu_seq_ctr() in order to shift away the state bits, so that the low-order bits of the result may safely be used to index ->nocb_gp_wq[]. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d893899b72f4..5b10904669c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1852,7 +1852,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { - return &rnp->nocb_gp_wq[rnp->completed & 0x1]; + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; } static void rcu_init_one_nocb(struct rcu_node *rnp) -- cgit From ba04107fc901ddce49686944f7e038b4f0b0a359 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 21:25:01 -0700 Subject: rcu: Move rcu_gp_in_progress() to ->gp_seq This commit makes rcu_gp_in_progress() use ->gp_seq instead of ->completed and ->gpnum. The READ_ONCE() invocations are buried in rcu_seq_current(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3af3d24286c..56445f4c09a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,7 +219,7 @@ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) */ static int rcu_gp_in_progress(struct rcu_state *rsp) { - return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum); + return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); } /* -- cgit From 8aa670cdacc1820cb0597e4b4b413ef91ede2dd9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Apr 2018 14:15:40 -0700 Subject: rcu: Convert ->rcu_iw_gpnum to ->gp_seq This commit switches the interrupt-disabled detection mechanism to ->gp_seq. This mechanism is used as part of RCU CPU stall warnings, and detects cases where the stall is due to a CPU having interrupts disabled. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 56445f4c09a8..2ddbd1cfb31a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1099,8 +1099,8 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); - if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) - rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; + if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) + rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } /* @@ -1134,7 +1134,7 @@ static void rcu_iw_handler(struct irq_work *iwp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; rdp->rcu_iw_pending = false; } raw_spin_unlock_rcu_node(rnp); @@ -1231,11 +1231,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); if (IS_ENABLED(CONFIG_IRQ_WORK) && - !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); rdp->rcu_iw_pending = true; - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } } @@ -3575,7 +3575,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gpnum = rnp->gpnum - 1; + rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 50a28d1cf5a1..6d6cbc8b3a9c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -286,7 +286,7 @@ struct rcu_data { /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ - unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ + unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5b10904669c5..bc32e1f434a6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1763,7 +1763,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], -- cgit From d43a5d32e125db1e34641922e52baaa4fee3510a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Apr 2018 18:50:06 -0700 Subject: rcu: Convert ->completedqs to ->gp_seq This commit switches the quiescent-state no-backtracking checks from ->gpnum and ->completed to ->gp_seq. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2ddbd1cfb31a..da2ca9cc078b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2278,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->completedqs = rnp->gpnum; + rnp->completedqs = rnp->gp_seq; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -3951,7 +3951,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; - rnp->completedqs = rsp->completed; + rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bc32e1f434a6..2a81d243521f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -262,7 +262,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; - WARN_ON_ONCE(rnp->completedqs == rnp->gpnum); + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; @@ -537,7 +537,7 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - WARN_ON_ONCE(rnp->completedqs == rnp->gpnum && + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit From 29365e563b1e4e5bfde211280d37dc6127c019ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2018 10:57:36 -0700 Subject: rcu: Convert grace-period requests to ->gp_seq This commit converts the grace-period request code paths from ->completed and ->gpnum to ->gp_seq. The need_future_gp_element() macro encapsulates the shift operation required to use ->gp_seq as an index to the ->need_future_gp[] array. The rcu_cbs_completed() function is removed in favor of the rcu_seq_snap() function. The rcu_start_this_gp() gets some temporary consistency checks and uses rcu_seq_done(), rcu_seq_current(), rcu_seq_state(), and rcu_gp_in_progress() in place of the earlier open-coded comparisons of ->gpnum and ->completed. The rcu_future_gp_cleanup() function replaces use of ->completed with ->gp_seq. The rcu_accelerate_cbs() function replaces a call to rcu_cbs_completed() with one to rcu_seq_snap(). The rcu_advance_cbs() function replaces an access to >completed with one to ->gp_seq and adds some temporary warnings. The rcu_nocb_wait_gp() function replaces a call to rcu_cbs_completed() with one to rcu_seq_snap() and an open-coded comparison with rcu_seq_done(). The temporary warnings will be removed when the various ->gpnum and ->completed fields are removed. Their purpose is to locate code who might still be using ->gpnum and ->completed. (Much easier that way than trying to trace down the causes of too-short grace periods and grace-period hangs!) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 84 ++++++++++++------------------------------------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 ++-- 3 files changed, 24 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da2ca9cc078b..ffa45b6175d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1548,52 +1548,6 @@ void rcu_cpu_stall_reset(void) WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } -/* - * Determine the value that ->completed will have at the end of the - * next subsequent grace period. This is used to tag callbacks so that - * a CPU can invoke callbacks in a timely fashion even if that CPU has - * been dyntick-idle for an extended period with callbacks under the - * influence of RCU_FAST_NO_HZ. - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static unsigned long rcu_cbs_completed(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * If RCU is idle, we just wait for the next grace period. - * But we can only be sure that RCU is idle if we are looking - * at the root rcu_node structure -- otherwise, a new grace - * period might have started, but just not yet gotten around - * to initializing the current non-root rcu_node structure. - */ - if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) - return rnp->completed + 1; - - /* - * If the current rcu_node structure believes that RCU is - * idle, and if the rcu_state structure does not yet reflect - * the start of a new grace period, then the next grace period - * will suffice. The memory barrier is needed to accurately - * sample the rsp->gpnum, and pairs with the second lock - * acquisition in rcu_gp_init(), which is augmented with - * smp_mb__after_unlock_lock() for this purpose. - */ - if (rnp->gpnum == rnp->completed) { - smp_mb(); /* See above block comment. */ - if (READ_ONCE(rsp->gpnum) == rnp->completed) - return rnp->completed + 1; - } - - /* - * Otherwise, wait for a possible partial grace period and - * then the subsequent full grace period. - */ - return rnp->completed + 2; -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long c, const char *s) @@ -1629,16 +1583,16 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); + WARN_ON_ONCE(c & 0x2); /* Catch any lingering use of ->gpnum. */ + WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq)); /* Catch any ->completed/->gp_seq mismatches. */ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + - need_future_gp_mask(), c)); if (need_future_gp_element(rnp_root, c) || - ULONG_CMP_GE(rnp_root->gpnum, c) || + rcu_seq_done(&rnp_root->gp_seq, c) || (rnp != rnp_root && - rnp_root->gpnum != rnp_root->completed)) { + rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); goto unlock_out; } @@ -1650,7 +1604,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* If GP already in progress, just leave, otherwise start one. */ - if (rnp_root->gpnum != rnp_root->completed) { + if (rcu_gp_in_progress(rsp)) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } @@ -1675,7 +1629,7 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->completed; + unsigned long c = rnp->gp_seq; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); @@ -1703,14 +1657,14 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) } /* - * If there is room, assign a ->completed number to any callbacks on - * this CPU that have not already been assigned. Also accelerate any - * callbacks that were previously assigned a ->completed number that has - * since proven to be too conservative, which can happen if callbacks get - * assigned a ->completed number while RCU is idle, but with reference to - * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. Returns an flag saying that we should - * awaken the RCU grace-period kthread. + * If there is room, assign a ->gp_seq number to any callbacks on this + * CPU that have not already been assigned. Also accelerate any callbacks + * that were previously assigned a ->gp_seq number that has since proven + * to be too conservative, which can happen if callbacks get assigned a + * ->gp_seq number while RCU is idle, but with reference to a non-root + * rcu_node structure. This function is idempotent, so it does not hurt + * to call it repeatedly. Returns an flag saying that we should awaken + * the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ @@ -1736,7 +1690,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - c = rcu_cbs_completed(rsp, rnp); + c = rcu_seq_snap(&rsp->gp_seq); if (rcu_segcblist_accelerate(&rdp->cblist, c)) ret = rcu_start_this_gp(rnp, rdp, c); @@ -1751,7 +1705,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and - * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... * Returns true if the RCU grace-period kthread needs to be awakened. @@ -1768,10 +1722,10 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return false; /* - * Find all callbacks whose ->completed numbers indicate that they + * Find all callbacks whose ->gp_seq numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - rcu_segcblist_advance(&rdp->cblist, rnp->completed); + rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1889,6 +1843,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) smp_store_release(&rsp->gpnum, rsp->gpnum + 1); smp_mb(); /* Pairs with barriers in stall-warning code. */ rcu_seq_start(&rsp->gp_seq); + if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. */ + pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6d6cbc8b3a9c..a21d403a6010 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -174,7 +174,7 @@ struct rcu_node { #define need_future_gp_mask() \ (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) #define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) + ((rnp)->need_future_gp[(c >> RCU_SEQ_CTR_SHIFT) & need_future_gp_mask()]) #define need_any_future_gp(rnp) \ ({ \ int __i; \ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2a81d243521f..2036dc7426ac 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2105,7 +2105,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - c = rcu_cbs_completed(rdp->rsp, rnp); + c = rcu_seq_snap(&rdp->rsp->gp_seq); needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) @@ -2118,8 +2118,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( - rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); + rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], + (d = rcu_seq_done(&rnp->gp_seq, c))); if (likely(d)) break; WARN_ON(signal_pending(current)); -- cgit From 471f87c3d91bd0884451c0e3071473476c165630 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2018 13:09:17 -0700 Subject: rcu: Make RCU CPU stall warnings use ->gp_seq This commit makes the RCU CPU stall-warning code in print_other_cpu_stall(), print_cpu_stall(), and check_cpu_stall() use ->gp_seq instead of ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 ++++++++++++++++++++++++------------------------ kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ffa45b6175d9..9e619c4878d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1340,7 +1340,7 @@ static inline void panic_on_rcu_stall(void) panic("RCU Stall\n"); } -static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) { int cpu; unsigned long flags; @@ -1350,6 +1350,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + WARN_ON_ONCE(gp_seq & 0x2); /* Remove when ->gpnum removed. */ + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(rsp); if (rcu_cpu_stall_suppress) @@ -1380,17 +1382,16 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); } else { - if (READ_ONCE(rsp->gpnum) != gpnum || - READ_ONCE(rsp->completed) == gpnum) { + if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; @@ -1442,9 +1443,9 @@ static void print_cpu_stall(struct rcu_state *rsp) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rsp->gp_start, - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); rcu_check_gp_kthread_starvation(rsp); @@ -1471,8 +1472,8 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - unsigned long completed; - unsigned long gpnum; + unsigned long gs1; + unsigned long gs2; unsigned long gps; unsigned long j; unsigned long jn; @@ -1488,28 +1489,28 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Lots of memory barriers to reject false positives. * - * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, - * then rsp->gp_start, and finally rsp->completed. These values - * are updated in the opposite order with memory barriers (or - * equivalent) during grace-period initialization and cleanup. - * Now, a false positive can occur if we get an new value of - * rsp->gp_start and a old value of rsp->jiffies_stall. But given - * the memory barriers, the only way that this can happen is if one - * grace period ends and another starts between these two fetches. - * Detect this by comparing rsp->completed with the previous fetch - * from rsp->gpnum. + * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, + * then rsp->gp_start, and finally another copy of rsp->gp_seq. + * These values are updated in the opposite order with memory + * barriers (or equivalent) during grace-period initialization + * and cleanup. Now, a false positive can occur if we get an new + * value of rsp->gp_start and a old value of rsp->jiffies_stall. + * But given the memory barriers, the only way that this can happen + * is if one grace period ends and another starts between these + * two fetches. This is detected by comparing the second fetch + * of rsp->gp_seq with the previous fetch from rsp->gp_seq. * * Given this check, comparisons of jiffies, rsp->jiffies_stall, * and rsp->gp_start suffice to forestall false positives. */ - gpnum = READ_ONCE(rsp->gpnum); - smp_rmb(); /* Pick up ->gpnum first... */ + gs1 = READ_ONCE(rsp->gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ js = READ_ONCE(rsp->jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ gps = READ_ONCE(rsp->gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->completed. */ - completed = READ_ONCE(rsp->completed); - if (ULONG_CMP_GE(completed, gpnum) || + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rsp->gp_seq); + if (gs1 != gs2 || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ @@ -1527,7 +1528,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp, gpnum); + print_other_cpu_stall(rsp, gs2); } } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2036dc7426ac..f4a88e3c388d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1755,12 +1755,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) */ touch_nmi_watchdog(); - if (rsp->gpnum == rdp->gpnum) { + ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; - } else { - ticks_title = "GPs behind"; - ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); -- cgit From aebc82644b2c8eafa15e8c481fbafc1b41f4fbf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 06:42:51 -0700 Subject: rcutorture: Convert rcutorture_get_gp_data() to ->gp_seq SRCU has long used ->srcu_gp_seq, and now RCU uses ->gp_seq. This commit therefore moves the rcutorture_get_gp_data() function from a ->gpnum / ->completed pair to ->gp_seq. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 17 ++++++----------- kernel/rcu/rcutorture.c | 24 ++++++++++-------------- kernel/rcu/srcutree.c | 5 ++--- kernel/rcu/tree.c | 5 ++--- 4 files changed, 20 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7568a3fd0815..003671825d62 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -425,7 +425,7 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned long *gp_seq); void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, @@ -435,13 +435,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, - unsigned long *gpnum, - unsigned long *completed) + int *flags, unsigned long *gp_seq) { *flags = 0; - *gpnum = 0; - *completed = 0; + *gp_seq = 0; } static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } @@ -461,21 +458,19 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = sp->srcu_idx; - *gpnum = *completed; + *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned long *gp_seq); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1f66597c7783..81fb43530d64 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1313,18 +1313,16 @@ rcu_torture_stats_print(void) if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags = 0; - unsigned long __maybe_unused gpnum = 0; - unsigned long __maybe_unused completed = 0; + unsigned long __maybe_unused gp_seq = 0; rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gpnum, &completed); + &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); + &flags, &gp_seq); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", + pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - gpnum, completed, flags, + rcu_torture_writer_state, gp_seq, flags, wtp == NULL ? ~0UL : wtp->state, wtp == NULL ? -1 : (int)task_cpu(wtp)); if (!splatted && wtp) { @@ -1605,8 +1603,7 @@ static void rcu_torture_cleanup(void) { int flags = 0; - unsigned long gpnum = 0; - unsigned long completed = 0; + unsigned long gp_seq = 0; int i; rcutorture_record_test_transition(); @@ -1637,11 +1634,10 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); - pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", - cur_ops->name, gpnum, completed, flags); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + pr_alert("%s: End-test grace-period state: g%lu f%#x\n", + cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5a1a9a07b407..d6d6ea9738c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1249,13 +1249,12 @@ static void process_srcu(struct work_struct *work) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = rcu_seq_ctr(sp->srcu_gp_seq); - *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); + *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9e619c4878d3..4a528a062cd4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -638,7 +638,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { struct rcu_state *rsp = NULL; @@ -658,8 +658,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, if (rsp == NULL) return; *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + *gp_seq = rcu_seq_current(&rsp->gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -- cgit From 7a1d0f23ad70cd4813bf4b72735ea2c26a4f53fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 10:26:57 -0700 Subject: rcu: Move from ->need_future_gp[] to ->gp_seq_needed One problem with the ->need_future_gp[] array is that the grace-period assignment of each element changes as the grace periods complete. This means that it is necessary to hold a lock when checking this array to learn if a given grace period has already been requested. This increase lock contention, which is the opposite of helpful. This commit therefore replaces the ->need_future_gp[] with a single ->gp_seq_needed value and keeps it updated in the rcu_data structure. This will enable reliable lockless checking of whether or not a given grace period has already been requested. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 34 ++++++++++++++++++---------------- kernel/rcu/tree.h | 19 ++----------------- 2 files changed, 20 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a528a062cd4..1ede51690e4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1560,7 +1560,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp[] field. Returns true if there + * rcu_node structure's ->gp_seq_needed field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which @@ -1589,14 +1589,14 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - if (need_future_gp_element(rnp_root, c) || + if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || rcu_seq_done(&rnp_root->gp_seq, c) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); goto unlock_out; } - need_future_gp_element(rnp_root, c) = true; + rnp_root->gp_seq_needed = c; if (rnp_root != rnp && rnp_root->parent != NULL) raw_spin_unlock_rcu_node(rnp_root); if (!rnp_root->parent) @@ -1633,8 +1633,9 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - need_future_gp_element(rnp, c) = false; - needmore = need_any_future_gp(rnp); + needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); + if (!needmore) + rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ trace_rcu_this_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; @@ -2046,7 +2047,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (need_any_future_gp(rnp)) { + if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, TPS("CleanupMore")); needgp = true; @@ -2700,8 +2701,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_node *rnp_root = rcu_get_root(rsp); static atomic_t warned = ATOMIC_INIT(0); - if (!IS_ENABLED(CONFIG_PROVE_RCU) || - rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp))) + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. */ if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || @@ -2711,7 +2712,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; - if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || atomic_read(&warned)) { @@ -2723,7 +2725,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; - if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || time_before(j, rsp->gp_req_activity + HZ) || time_before(j, rsp->gp_activity + HZ) || atomic_xchg(&warned, 1)) { @@ -2731,12 +2734,9 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%lu %d%d%d%d gar:%lu ga:%lu f%#x %s->state:%#lx\n", - __func__, READ_ONCE(rsp->gpnum), - need_future_gp_element(rcu_get_root(rsp), 0), - need_future_gp_element(rcu_get_root(rsp), 1), - need_future_gp_element(rcu_get_root(rsp), 2), - need_future_gp_element(rcu_get_root(rsp), 3), + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", + __func__, (long)READ_ONCE(rsp->gp_seq), + (long)READ_ONCE(rnp_root->gp_seq_needed), j - rsp->gp_req_activity, j - rsp->gp_activity, rsp->gp_flags, rsp->name, rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); @@ -3527,6 +3527,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; @@ -3907,6 +3908,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; + rnp->gp_seq_needed = rsp->gp_seq; rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a21d403a6010..9329c1ff695f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -88,6 +88,7 @@ struct rcu_node { /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -160,7 +161,6 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; @@ -170,22 +170,6 @@ struct rcu_node { bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; -/* Accessors for ->need_future_gp[] array. */ -#define need_future_gp_mask() \ - (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) -#define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c >> RCU_SEQ_CTR_SHIFT) & need_future_gp_mask()]) -#define need_any_future_gp(rnp) \ -({ \ - int __i; \ - bool __nonzero = false; \ - \ - for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ - __nonzero = __nonzero || \ - READ_ONCE((rnp)->need_future_gp[__i]); \ - __nonzero; \ -}) - /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -213,6 +197,7 @@ struct rcu_data { unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ -- cgit From ab5e869c1f7aa30a1210f5e8a277758b0599609f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 11:07:23 -0700 Subject: rcu: Make rcu_nocb_wait_gp() check if GP already requested This commit makes rcu_nocb_wait_gp() check rdp->gp_seq_needed to see if the current CPU already knows about the needed grace period having already been requested. If so, it avoids acquiring the corresponding leaf rcu_node structure's ->lock, thus decreasing contention. This optimization is intended for cases where either multiple leader rcuo kthreads are running on the same CPU or these kthreads are running on a non-offloaded (e.g., housekeeping) CPU. Signed-off-by: Paul E. McKenney [ paulmck: Move lock release past "if" as suggested by Joel Fernandes. ] [ paulmck: Fix caching of furthest-future requested grace period. ] --- kernel/rcu/tree.c | 5 +++++ kernel/rcu/tree_plugin.h | 15 ++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1ede51690e4a..4826598867c3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1618,6 +1618,11 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: + /* Push furthest requested GP to leaf node and rcu_data structure. */ + if (ULONG_CMP_LT(c, rnp_root->gp_seq_needed)) { + rnp->gp_seq_needed = rnp_root->gp_seq_needed; + rdp->gp_seq_needed = rnp_root->gp_seq_needed; + } if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); return ret; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f4a88e3c388d..ca73931f7b30 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2104,12 +2104,17 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + local_irq_save(flags); c = rcu_seq_snap(&rdp->rsp->gp_seq); - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rdp->rsp); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + local_irq_restore(flags); + } else { + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake = rcu_start_this_gp(rnp, rdp, c); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); + } /* * Wait for the grace period. Do so interruptibly to avoid messing -- cgit From 477351f7829d2268769c5d545511081555066529 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 12:54:11 -0700 Subject: rcu: Convert rcu_grace_period tracepoint to gp_seq This commit makes the rcu_grace_period tracepoint use gp_seq instead of ->gpnum or ->completed. It also introduces a "cpuofl-bgp" string to less obscurely indicate when a CPU has gone offline while a grace period is waiting on it. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 13 +++++++------ kernel/rcu/tree.c | 43 +++++++++++++++++++++---------------------- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5936aac357ab..cd229e82ec8b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -52,6 +52,7 @@ TRACE_EVENT(rcu_utilization, * "cpuqs": CPU passes through a quiescent state. * "cpuonl": CPU comes online. * "cpuofl": CPU goes offline. + * "cpuofl-bgp": CPU goes offline while blocking a grace period. * "reqwait": GP kthread sleeps waiting for grace-period request. * "reqwaitsig": GP kthread awakened by signal from reqwait state. * "fqswait": GP kthread waiting until time to force quiescent states. @@ -63,24 +64,24 @@ TRACE_EVENT(rcu_utilization, */ TRACE_EVENT(rcu_grace_period, - TP_PROTO(const char *rcuname, unsigned long gpnum, const char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, const char *gpevent), - TP_ARGS(rcuname, gpnum, gpevent), + TP_ARGS(rcuname, gp_seq, gpevent), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(const char *, gpevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->gpevent = gpevent; ), TP_printk("%s %lu %s", - __entry->rcuname, __entry->gpnum, __entry->gpevent) + __entry->rcuname, __entry->gp_seq, __entry->gpevent) ); /* @@ -753,7 +754,7 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ -#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) +#define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4826598867c3..7ce85ad39af6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -234,7 +234,7 @@ void rcu_sched_qs(void) if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), + __this_cpu_read(rcu_sched_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) @@ -249,7 +249,7 @@ void rcu_bh_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), - __this_cpu_read(rcu_bh_data.gpnum), + __this_cpu_read(rcu_bh_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } @@ -1615,7 +1615,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1702,9 +1702,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); return ret; } @@ -1774,7 +1774,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * go looking for one. */ rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); @@ -1851,7 +1851,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_seq_start(&rsp->gp_seq); if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. */ pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); - trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1928,7 +1928,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); @@ -2048,7 +2048,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); rcu_seq_end(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); @@ -2061,7 +2061,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); } else { WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); @@ -2087,7 +2087,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & @@ -2100,7 +2100,7 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwaitsig")); } @@ -2119,7 +2119,7 @@ static int __noreturn rcu_gp_kthread(void *arg) jiffies + 3 * j); } trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout(rsp->gp_wq, @@ -2134,12 +2134,12 @@ static int __noreturn rcu_gp_kthread(void *arg) if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsstart")); rcu_gp_fqs(rsp, first_gp_fqs); first_gp_fqs = false; trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); @@ -2158,7 +2158,7 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. */ j = jiffies; @@ -2388,17 +2388,16 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask;) + RCU_TRACE(bool blkd;) RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask;) - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - !!(rnp->qsmask & mask), - TPS("cpuofl")); + RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + trace_rcu_grace_period(rsp->name, rnp->gp_seq, + blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); } /* @@ -3538,7 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ca73931f7b30..aca9d187c509 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -301,7 +301,7 @@ static void rcu_preempt_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_data_p->gpnum), + __this_cpu_read(rcu_data_p->gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ -- cgit From abd13fdd9516e5baae2257721b921684ecb090d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:08:46 -0700 Subject: rcu: Convert rcu_future_grace_period tracepoint to gp_seq This commit makes the rcu_future_grace_period tracepoint use gp_seq instead of ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 22 +++++++++------------- kernel/rcu/tree.c | 7 +++---- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index cd229e82ec8b..286047d22314 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -103,16 +103,14 @@ TRACE_EVENT(rcu_grace_period, */ TRACE_EVENT(rcu_future_grace_period, - TP_PROTO(const char *rcuname, unsigned long gpnum, unsigned long completed, - unsigned long c, u8 level, int grplo, int grphi, - const char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long c, + u8 level, int grplo, int grphi, const char *gpevent), - TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent), + TP_ARGS(rcuname, gp_seq, c, level, grplo, grphi, gpevent), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) - __field(unsigned long, completed) + __field(unsigned long, gp_seq) __field(unsigned long, c) __field(u8, level) __field(int, grplo) @@ -122,8 +120,7 @@ TRACE_EVENT(rcu_future_grace_period, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; - __entry->completed = completed; + __entry->gp_seq = gp_seq; __entry->c = c; __entry->level = level; __entry->grplo = grplo; @@ -131,10 +128,9 @@ TRACE_EVENT(rcu_future_grace_period, __entry->gpevent = gpevent; ), - TP_printk("%s %lu %lu %lu %u %d %d %s", - __entry->rcuname, __entry->gpnum, __entry->completed, - __entry->c, __entry->level, __entry->grplo, __entry->grphi, - __entry->gpevent) + TP_printk("%s %lu %lu %u %d %d %s", + __entry->rcuname, __entry->gp_seq, __entry->c, __entry->level, + __entry->grplo, __entry->grphi, __entry->gpevent) ); /* @@ -755,7 +751,7 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0) -#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ +#define trace_rcu_future_grace_period(rcuname, gp_seq, c, \ level, grplo, grphi, event) \ do { } while (0) #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7ce85ad39af6..066dbaacec30 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1552,9 +1552,8 @@ void rcu_cpu_stall_reset(void) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long c, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c, + rnp->level, rnp->grplo, rnp->grphi, s); } /* @@ -2053,7 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { - trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; } -- cgit From 598ce09480efb6b48799df60c66bac70bea5ef54 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_preempt_task tracepoint to ->gp_seq This commit makes the rcu_preempt_task tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 892016ad0647..38dbd97d65a3 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -298,24 +298,24 @@ TRACE_EVENT(rcu_nocb_wake, */ TRACE_EVENT(rcu_preempt_task, - TP_PROTO(const char *rcuname, int pid, unsigned long gpnum), + TP_PROTO(const char *rcuname, int pid, unsigned long gp_seq), - TP_ARGS(rcuname, pid, gpnum), + TP_ARGS(rcuname, pid, gp_seq), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(int, pid) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->pid = pid; ), TP_printk("%s %lu %d", - __entry->rcuname, __entry->gpnum, __entry->pid) + __entry->rcuname, __entry->gp_seq, __entry->pid) ); /* @@ -761,7 +761,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) -#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) +#define trace_rcu_preempt_task(rcuname, pid, gp_seq) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index aca9d187c509..02ca3b4e6a8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -350,8 +350,8 @@ static void rcu_preempt_note_context_switch(bool preempt) trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) - ? rnp->gpnum - : rnp->gpnum + 1); + ? rnp->gp_seq + : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { -- cgit From 865aa1e08d8aefdfd1f5d30ecfce1b8ef8cd520a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_unlock_preempted_task tracepoint to ->gp_seq This commit makes the rcu_unlock_preempted_task tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 38dbd97d65a3..95b7491196aa 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -325,23 +325,23 @@ TRACE_EVENT(rcu_preempt_task, */ TRACE_EVENT(rcu_unlock_preempted_task, - TP_PROTO(const char *rcuname, unsigned long gpnum, int pid), + TP_PROTO(const char *rcuname, unsigned long gp_seq, int pid), - TP_ARGS(rcuname, gpnum, pid), + TP_ARGS(rcuname, gp_seq, pid), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(int, pid) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->pid = pid; ), - TP_printk("%s %lu %d", __entry->rcuname, __entry->gpnum, __entry->pid) + TP_printk("%s %lu %d", __entry->rcuname, __entry->gp_seq, __entry->pid) ); /* @@ -762,7 +762,7 @@ TRACE_EVENT(rcu_barrier, do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gp_seq) do { } while (0) -#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) +#define trace_rcu_unlock_preempted_task(rcuname, gp_seq, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ while (0) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ca3b4e6a8f..a10b0e26ce19 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -545,7 +545,7 @@ void rcu_read_unlock_special(struct task_struct *t) list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -708,7 +708,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); } WARN_ON_ONCE(rnp->qsmask); } -- cgit From db023296f0115d2fe01fdabad54678f2b806da23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_quiescent_state_report tracepoint to ->gp_seq This commit makes the rcu_quiescent_state_report tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 95b7491196aa..ac4d9d4a1ebf 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -354,15 +354,15 @@ TRACE_EVENT(rcu_unlock_preempted_task, */ TRACE_EVENT(rcu_quiescent_state_report, - TP_PROTO(const char *rcuname, unsigned long gpnum, + TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long mask, unsigned long qsmask, u8 level, int grplo, int grphi, int gp_tasks), - TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks), + TP_ARGS(rcuname, gp_seq, mask, qsmask, level, grplo, grphi, gp_tasks), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(unsigned long, mask) __field(unsigned long, qsmask) __field(u8, level) @@ -373,7 +373,7 @@ TRACE_EVENT(rcu_quiescent_state_report, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->mask = mask; __entry->qsmask = qsmask; __entry->level = level; @@ -383,7 +383,7 @@ TRACE_EVENT(rcu_quiescent_state_report, ), TP_printk("%s %lu %lx>%lx %u %d %d %u", - __entry->rcuname, __entry->gpnum, + __entry->rcuname, __entry->gp_seq, __entry->mask, __entry->qsmask, __entry->level, __entry->grplo, __entry->grphi, __entry->gp_tasks) ); @@ -763,7 +763,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gp_seq) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gp_seq, pid) do { } while (0) -#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ +#define trace_rcu_quiescent_state_report(rcuname, gp_seq, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 066dbaacec30..7c6c11d5479c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2229,7 +2229,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, !!rnp->gp_tasks); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a10b0e26ce19..c8a2c7760121 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -566,7 +566,7 @@ void rcu_read_unlock_special(struct task_struct *t) empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), - rnp->gpnum, + rnp->gp_seq, 0, rnp->qsmask, rnp->level, rnp->grplo, -- cgit From fee5997c17562e95fb1fecc142efb2da0934baa4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_fqs tracepoint to ->gp_seq This commit makes the rcu_fqs tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++------ kernel/rcu/tree.c | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ac4d9d4a1ebf..7d3650cc9d30 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -398,26 +398,26 @@ TRACE_EVENT(rcu_quiescent_state_report, */ TRACE_EVENT(rcu_fqs, - TP_PROTO(const char *rcuname, unsigned long gpnum, int cpu, const char *qsevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, int cpu, const char *qsevent), - TP_ARGS(rcuname, gpnum, cpu, qsevent), + TP_ARGS(rcuname, gp_seq, cpu, qsevent), TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpnum) + __field(unsigned long, gp_seq) __field(int, cpu) __field(const char *, qsevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpnum = gpnum; + __entry->gp_seq = gp_seq; __entry->cpu = cpu; __entry->qsevent = qsevent; ), TP_printk("%s %lu %d %s", - __entry->rcuname, __entry->gpnum, + __entry->rcuname, __entry->gp_seq, __entry->cpu, __entry->qsevent) ); @@ -766,7 +766,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_quiescent_state_report(rcuname, gp_seq, mask, qsmask, level, \ grplo, grphi, gp_tasks) do { } \ while (0) -#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) +#define trace_rcu_fqs(rcuname, gp_seq, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting, dyntick) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen_lazy, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen_lazy, qlen) \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c6c11d5479c..42e89d4f1e33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1111,7 +1111,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } @@ -1161,7 +1161,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; @@ -1178,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { @@ -1188,7 +1188,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) /* Check for the CPU being offline. */ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; -- cgit From ff3bb6f4d06247508489345ee90a8a9b6f3ffd3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 14:34:08 -0700 Subject: rcu: Remove ->gpnum and ->completed Now that everything has been converted to use ->gp_seq instead of ->gpnum and ->completed, this commit removes ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 +++++++++++---------------------------------- kernel/rcu/tree.h | 14 +------------- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 13 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 42e89d4f1e33..0aeddc908181 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -95,8 +95,6 @@ struct rcu_state sname##_state = { \ .rda = &sname##_data, \ .call = cr, \ .gp_state = RCU_GP_IDLE, \ - .gpnum = 0UL - 300UL, \ - .completed = 0UL - 300UL, \ .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ @@ -1349,8 +1347,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; - WARN_ON_ONCE(gp_seq & 0x2); /* Remove when ->gpnum removed. */ - /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(rsp); if (rcu_cpu_stall_suppress) @@ -1582,8 +1578,6 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); - WARN_ON_ONCE(c & 0x2); /* Catch any lingering use of ->gpnum. */ - WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq)); /* Catch any ->completed/->gp_seq mismatches. */ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) @@ -1757,8 +1751,6 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ - /* Remember that we saw this grace-period completion. */ - rdp->completed = rnp->completed; trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); } else { ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ @@ -1772,7 +1764,6 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; @@ -1843,13 +1834,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - /* Record GP times before starting GP, hence smp_store_release(). */ - WARN_ON_ONCE(rsp->gpnum << RCU_SEQ_CTR_SHIFT != rsp->gp_seq); - smp_store_release(&rsp->gpnum, rsp->gpnum + 1); - smp_mb(); /* Pairs with barriers in stall-warning code. */ + /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rsp->gp_seq); - if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. */ - pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1920,9 +1906,6 @@ static bool rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - WRITE_ONCE(rnp->gpnum, rsp->gpnum); - if (WARN_ON_ONCE(rnp->completed != rsp->completed)) - WRITE_ONCE(rnp->completed, rsp->completed); WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); @@ -2012,13 +1995,13 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); /* - * Propagate new ->completed value to rcu_node structures so - * that other CPUs don't have to wait until the start of the next - * grace period to process their callbacks. This also avoids - * some nasty RCU grace-period initialization races by forcing - * the end of the current grace period to be completely recorded in - * all of the rcu_node structures before the beginning of the next - * grace period is recorded in any of the rcu_node structures. + * Propagate new ->gp_seq value to rcu_node structures so that + * other CPUs don't have to wait until the start of the next grace + * period to process their callbacks. This also avoids some nasty + * RCU grace-period initialization races by forcing the end of + * the current grace period to be completely recorded in all of + * the rcu_node structures before the beginning of the next grace + * period is recorded in any of the rcu_node structures. */ new_gp_seq = rsp->gp_seq; rcu_seq_end(&new_gp_seq); @@ -2027,7 +2010,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); - WRITE_ONCE(rnp->completed, rsp->gpnum); WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) @@ -2045,7 +2027,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ - WRITE_ONCE(rsp->completed, rsp->gpnum); rcu_seq_end(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; @@ -3496,9 +3477,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* * Initialize a CPU's per-CPU RCU data. Note that only one online or - * offline event can be happening at a given time. Note also that we - * can accept some slop in the rsp->completed access due to the fact - * that this CPU cannot possibly have any RCU callbacks in flight yet. + * offline event can be happening at a given time. Note also that we can + * accept some slop in the rsp->gp_seq access due to the fact that this + * CPU cannot possibly have any RCU callbacks in flight yet. */ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) @@ -3527,8 +3508,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. */ - rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ - rdp->completed = rnp->completed; rdp->gp_seq = rnp->gp_seq; rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -3908,8 +3887,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; rnp->gp_seq_needed = rsp->gp_seq; rnp->completedqs = rsp->gp_seq; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9329c1ff695f..3def94fc9c74 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -81,12 +81,6 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gpnum; /* Current grace period for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ - unsigned long completed; /* Last GP completed for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ unsigned long completedqs; /* All QSes done for this node. */ @@ -192,10 +186,6 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long completed; /* Track rsp->completed gp number */ - /* in order to detect GP end. */ - unsigned long gpnum; /* Highest gp number that this CPU */ - /* is aware of having started. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ @@ -203,7 +193,7 @@ struct rcu_data { union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool gpwrap; /* Possible gpnum/completed wrap. */ + bool gpwrap; /* Possible ->gp_seq wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -328,8 +318,6 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ - unsigned long gpnum; /* Current gp number. */ - unsigned long completed; /* # of last completed gp. */ unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8a2c7760121..3a6e04103de1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -690,7 +690,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock * must be held by the caller. * * Also, if there are blocked tasks on the list, they automatically -- cgit From e44e73ca47b47510dac491329d453d82aea1d8d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 16:29:47 -0700 Subject: rcu: Make simple callback acceleration refer to rdp->gp_seq_needed Now that the rcu_data structure contains ->gp_seq_needed, create an rcu_accelerate_cbs_unlocked() helper function that locklessly checks to see if new callbacks' required grace period has already been requested. If so, update the callback list locally and again locklessly. (Though interrupts must be and are disabled to avoid racing with conflicting updates in interrupt handlers.) Otherwise, call rcu_accelerate_cbs() as before. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0aeddc908181..5643c135fb06 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1701,6 +1701,34 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return ret; } +/* + * Similar to rcu_accelerate_cbs(), but does not require that the leaf + * rcu_node structure's ->lock be held. It consults the cached value + * of ->gp_seq_needed in the rcu_data structure, and if that indicates + * that a new grace-period request be made, invokes rcu_accelerate_cbs() + * while holding the leaf rcu_node structure's ->lock. + */ +static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + bool needwake; + + lockdep_assert_irqs_disabled(); + c = rcu_seq_snap(&rsp->gp_seq); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + /* Old request still live, so mark recent callbacks. */ + (void)rcu_segcblist_accelerate(&rdp->cblist, c); + return; + } + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); +} + /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and @@ -2739,7 +2767,6 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; - bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); struct rcu_node *rnp = rdp->mynode; @@ -2752,15 +2779,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (!rcu_gp_in_progress(rsp) && rcu_segcblist_is_enabled(&rdp->cblist)) { local_irq_save(flags); - if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { - local_irq_restore(flags); - } else { - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) + rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); + local_irq_restore(flags); } rcu_check_gp_start_stall(rsp, rnp, rdp); @@ -2818,8 +2839,6 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { - bool needwake; - /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2846,13 +2865,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp = rdp->mynode; - - raw_spin_lock_rcu_node(rnp); - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_rcu_node(rnp); - if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; -- cgit From a2165e416878b325747f871df4b236b49bf61486 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 May 2018 07:42:20 -0700 Subject: rcu: Don't funnel-lock above leaf node if GP in progress The old grace-period start code would acquire only the leaf's rcu_node structure's ->lock if that structure believed that a grace period was in progress. The new code advances to the leaf's parent in this case, needlessly acquiring then leaf's parent's ->lock. This commit therefore checks the grace-period state after marking the leaf with the need for the specified grace period, and if the leaf believes that a grace period is in progress, takes an early exit. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney [ paulmck: Add "Startedleaf" tracing as suggested by Joel Fernandes. ] --- include/trace/events/rcu.h | 4 ++-- kernel/rcu/tree.c | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 7d3650cc9d30..5ab1df0a2801 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -91,8 +91,8 @@ TRACE_EVENT(rcu_grace_period, * * "Startleaf": Request a grace period based on leaf-node data. * "Prestarted": Someone beat us to the request - * "Startedleaf": Leaf-node start proved sufficient. - * "Startedleafroot": Leaf-node start proved sufficient after checking root. + * "Startedleaf": Leaf node marked for future GP. + * "Startedleafroot": All nodes from leaf to root marked for future GP. * "Startedroot": Requested a nocb grace period based on root-node data. * "NoGPkthread": The RCU grace-period kthread has not yet started. * "StartWait": Start waiting for the requested grace period. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5643c135fb06..24a79e85b81f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1590,6 +1590,15 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, goto unlock_out; } rnp_root->gp_seq_needed = c; + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + /* + * We just marked the leaf, and a grace period + * is in progress, which means that rcu_gp_cleanup() + * will see the marking. Bail to reduce contention. + */ + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); + goto unlock_out; + } if (rnp_root != rnp && rnp_root->parent != NULL) raw_spin_unlock_rcu_node(rnp_root); if (!rnp_root->parent) -- cgit From 5b55072f22ba2ed136b7a1b6c5beea9ace8415a7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 13 May 2018 20:15:40 -0700 Subject: rcu: Produce last "CleanupMore" trace only if late-breaking request Currently Tree RCU's clean-up code emits a "CleanupMore" trace event in response to late-arriving grace-period requests even if the grace period was already requested. This makes "CleanupMore" show up an extra time (in addition to once for each rcu_node structure that was previously marked with the request), and for no good reason. This commit therefore avoids emitting this trace message unless the the only request for this next grace period arrived during or after the cleanup scan of the rcu_node structures. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 24a79e85b81f..73a33b82cfcd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2069,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { + if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; -- cgit From 5ca0905f6787e930bc2a626cf1d8f69fab52acef Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 13 May 2018 20:15:41 -0700 Subject: rcu: Fix cpustart tracepoint gp_seq number The "cpustart" trace event shows a stale gp_seq. This is because it uses rdp->gp_seq, which is updated only at the end of the __note_gp_changes() function. This commit therefore instead uses rnp->gp_seq. An alternative fix would be to update rdp->gp_seq earlier, but this would break RCU's detection of the beginning of a new-to-this-CPU grace period. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73a33b82cfcd..973250503d98 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1801,7 +1801,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); -- cgit From 2e3e5e55010105f9d4351f68e15dbc43402a7794 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 11:53:41 -0700 Subject: rcu: Make rcu_start_this_gp() check for grace period already started In the old days of ->gpnum and ->completed, the code requesting a new grace period checked to see if that grace period had already started, bailing early if so. The new-age ->gp_seq approach instead checks whether the grace period has already finished. A compensating change pushed the requested grace period down to the bottom of the tree, thus reducing lock contention and even eliminating it in some cases. But why not further reduce contention, especially on large systems, by doing both, especially given that the cost of doing both is extremely small? This commit therefore adds a new rcu_seq_started() function that checks whether a specified grace period has already started. It then uses this new function in place of rcu_seq_done() in the rcu_start_this_gp() function's funnel locking code. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 9 +++++++++ kernel/rcu/tree.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 003671825d62..1c5cbd9d7c97 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -107,6 +107,15 @@ static inline unsigned long rcu_seq_current(unsigned long *sp) return READ_ONCE(*sp); } +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not the + * corresponding update-side operation has started. + */ +static inline bool rcu_seq_started(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_LT((s - 1) & ~RCU_SEQ_STATE_MASK, READ_ONCE(*sp)); +} + /* * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 973250503d98..cbf2bcde5e60 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1583,7 +1583,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || - rcu_seq_done(&rnp_root->gp_seq, c) || + rcu_seq_started(&rnp_root->gp_seq, c) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); -- cgit From d72193123c81ae6123d108b3be2096f3f13b25a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 15:24:41 -0700 Subject: rcutorture: Correctly handle grace-period sequence wrap The new ->gq_seq grace-period sequence numbers must be shifted down, which give artifacts when these numbers wrap. This commit therefore enables rcutorture and rcuperf to handle grace-period sequence numbers even if they do wrap. It does this by allowing a special subtraction function to be specified, and this function subtracts before shifting. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 9 +++++++++ kernel/rcu/rcuperf.c | 18 ++++++++++++++++-- kernel/rcu/rcutorture.c | 19 +++++++++++++------ kernel/rcu/tree.c | 6 +++--- 4 files changed, 41 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1c5cbd9d7c97..aa215d6355f8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -142,6 +142,15 @@ static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) new); } +/* + * Roughly how many full grace periods have elapsed between the collection + * of the two specified grace periods? + */ +static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) +{ + return (new - old) >> RCU_SEQ_CTR_SHIFT; +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 2b5a613afcf3..b080bc4a4f45 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -139,6 +139,7 @@ struct rcu_perf_ops { int (*readlock)(void); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -179,6 +180,7 @@ static struct rcu_perf_ops rcu_ops = { .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -208,6 +210,7 @@ static struct rcu_perf_ops rcu_bh_ops = { .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, @@ -264,6 +267,7 @@ static struct rcu_perf_ops srcu_ops = { .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -292,6 +296,7 @@ static struct rcu_perf_ops srcud_ops = { .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -321,6 +326,7 @@ static struct rcu_perf_ops sched_ops = { .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -348,6 +354,7 @@ static struct rcu_perf_ops tasks_ops = { .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -355,6 +362,13 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -577,8 +591,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - b_rcu_perf_writer_finished - - b_rcu_perf_writer_started); + rcuperf_seq_diff(b_rcu_perf_writer_finished, + b_rcu_perf_writer_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 81fb43530d64..0481c7286875 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -265,6 +265,7 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -400,6 +401,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -441,6 +443,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -646,6 +649,7 @@ static struct rcu_torture_ops sched_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -695,6 +699,13 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -1127,9 +1138,7 @@ static void rcu_torture_timer(struct timer_list *unused) rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (completed > ULONG_MAX >> 1) - completed = 0; /* Not all gp_seq have full range. */ + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1205,9 +1214,7 @@ rcu_torture_reader(void *arg) rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (completed > ULONG_MAX >> 1) - completed = 0; /* Not all gp_seq have full range. */ + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cbf2bcde5e60..fa219eea0ae7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -532,7 +532,7 @@ static int rcu_pending(void); */ unsigned long rcu_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_state_p->gp_seq)); + return READ_ONCE(rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(rcu_get_gp_seq); @@ -541,7 +541,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_seq); */ unsigned long rcu_sched_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_sched_state.gp_seq)); + return READ_ONCE(rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); @@ -550,7 +550,7 @@ EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); */ unsigned long rcu_bh_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_bh_state.gp_seq)); + return READ_ONCE(rcu_bh_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); -- cgit From 3d18469a2bb3988e669d67e097eff42dd40663d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 16:47:30 -0700 Subject: rcu: Regularize resetting of rcu_data wrap indicator The rcu_data structure's ->gpwrap indicator is currently reset only when the CPU in question detects a new grace period. This is in theory sufficient because any CPU that has been out of action for long enough that its ->gpwrap indicator is set is guaranteed to see both the end of an old grace period and the start of a new one. However, the current code leaves a short window during which the ->gpwrap indicator has been reset but the corresponding ->gp_seq counter has not yet been brought up to date. This is harmless because interrupts are disabled, but it is likely to (at the very least) cause confusion. This commit therefore moves the resetting of ->gpwrap to follow the updating of ->gp_seq. While in the area, it also resets ->gp_seq_needed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fa219eea0ae7..161e8fb8b83f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1807,10 +1807,12 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); - WRITE_ONCE(rdp->gpwrap, false); - rcu_gpnum_ovf(rnp, rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ + if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); return ret; } -- cgit From b73de91d6a4c97ed586b2a5a6ce7c6fe395d9a3b Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 20 May 2018 21:42:18 -0700 Subject: rcu: Rename the grace-period-request variables and parameters The name 'c' is used for variables and parameters holding the requested grace-period sequence number. However it is no longer very meaningful given the conversions from ->gpnum and (especially) ->completed to ->gp_seq. This commit therefore renames 'c' to 'gp_seq_req'. Previous patch discussion is at: https://patchwork.kernel.org/patch/10396579/ Signed-off-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 15 ++++++++------- kernel/rcu/tree.c | 46 +++++++++++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5ab1df0a2801..759e7e83733d 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -103,15 +103,16 @@ TRACE_EVENT(rcu_grace_period, */ TRACE_EVENT(rcu_future_grace_period, - TP_PROTO(const char *rcuname, unsigned long gp_seq, unsigned long c, - u8 level, int grplo, int grphi, const char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gp_seq, + unsigned long gp_seq_req, u8 level, int grplo, int grphi, + const char *gpevent), - TP_ARGS(rcuname, gp_seq, c, level, grplo, grphi, gpevent), + TP_ARGS(rcuname, gp_seq, gp_seq_req, level, grplo, grphi, gpevent), TP_STRUCT__entry( __field(const char *, rcuname) __field(unsigned long, gp_seq) - __field(unsigned long, c) + __field(unsigned long, gp_seq_req) __field(u8, level) __field(int, grplo) __field(int, grphi) @@ -121,7 +122,7 @@ TRACE_EVENT(rcu_future_grace_period, TP_fast_assign( __entry->rcuname = rcuname; __entry->gp_seq = gp_seq; - __entry->c = c; + __entry->gp_seq_req = gp_seq_req; __entry->level = level; __entry->grplo = grplo; __entry->grphi = grphi; @@ -129,7 +130,7 @@ TRACE_EVENT(rcu_future_grace_period, ), TP_printk("%s %lu %lu %u %d %d %s", - __entry->rcuname, __entry->gp_seq, __entry->c, __entry->level, + __entry->rcuname, __entry->gp_seq, __entry->gp_seq_req, __entry->level, __entry->grplo, __entry->grphi, __entry->gpevent) ); @@ -751,7 +752,7 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gp_seq, gpevent) do { } while (0) -#define trace_rcu_future_grace_period(rcuname, gp_seq, c, \ +#define trace_rcu_future_grace_period(rcuname, gp_seq, gp_seq_req, \ level, grplo, grphi, event) \ do { } while (0) #define trace_rcu_grace_period_init(rcuname, gp_seq, level, grplo, grphi, \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 161e8fb8b83f..8c31b1740afc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1546,13 +1546,18 @@ void rcu_cpu_stall_reset(void) /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) + unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c, + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, rnp->level, rnp->grplo, rnp->grphi, s); } /* + * rcu_start_this_gp - Request the start of a particular grace period + * @rnp: The leaf node of the CPU from which to start. + * @rdp: The rcu_data corresponding to the CPU from which to start. + * @gp_seq_req: The gp_seq of the grace period to start. + * * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each * rcu_node structure's ->gp_seq_needed field. Returns true if there @@ -1560,9 +1565,11 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. + * + * Returns true if the GP thread needs to be awakened else false. */ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c) + unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; @@ -1578,25 +1585,27 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || - rcu_seq_started(&rnp_root->gp_seq, c) || + if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, + TPS("Prestarted")); goto unlock_out; } - rnp_root->gp_seq_needed = c; + rnp_root->gp_seq_needed = gp_seq_req; if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf, and a grace period * is in progress, which means that rcu_gp_cleanup() * will see the marking. Bail to reduce contention. */ - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, + TPS("Startedleaf")); goto unlock_out; } if (rnp_root != rnp && rnp_root->parent != NULL) @@ -1607,21 +1616,21 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* If GP already in progress, just leave, otherwise start one. */ if (rcu_gp_in_progress(rsp)) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ - if (ULONG_CMP_LT(c, rnp_root->gp_seq_needed)) { + if (ULONG_CMP_LT(gp_seq_req, rnp_root->gp_seq_needed)) { rnp->gp_seq_needed = rnp_root->gp_seq_needed; rdp->gp_seq_needed = rnp_root->gp_seq_needed; } @@ -1636,14 +1645,13 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->gp_seq; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); if (!needmore) rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ - trace_rcu_this_gp(rnp, rdp, c, + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1679,7 +1687,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; + unsigned long gp_seq_req; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1698,9 +1706,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - c = rcu_seq_snap(&rsp->gp_seq); - if (rcu_segcblist_accelerate(&rdp->cblist, c)) - ret = rcu_start_this_gp(rnp, rdp, c); + gp_seq_req = rcu_seq_snap(&rsp->gp_seq); + if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) + ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) -- cgit From df2bf8f7f776cef57e6b27690c7b78c86f259515 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:14 -0700 Subject: rcu: Use better variable names in funnel locking loop The funnel locking loop in rcu_start_this_gp uses rcu_root as a temporary variable while walking the combining tree. This causes a tiresome exercise of a code reader reminding themselves that rcu_root may not be root. Lets just call it rnp, and rename other variables as well to be more appropriate. Original patch: https://patchwork.kernel.org/patch/10396577/ Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney [ paulmck: Fix name in comment as well. ] --- kernel/rcu/tree.c | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c31b1740afc..1af58f4b8a25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1554,7 +1554,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * rcu_start_this_gp - Request the start of a particular grace period - * @rnp: The leaf node of the CPU from which to start. + * @rnp_start: The leaf node of the CPU from which to start. * @rdp: The rcu_data corresponding to the CPU from which to start. * @gp_seq_req: The gp_seq of the grace period to start. * @@ -1568,74 +1568,74 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * * Returns true if the GP thread needs to be awakened else false. */ -static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, +static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; - struct rcu_node *rnp_root; + struct rcu_node *rnp; /* * Use funnel locking to either acquire the root rcu_node * structure's lock or bail out if the need for this grace period - * has already been recorded -- or has already started. If there - * is already a grace period in progress in a non-leaf node, no - * recording is needed because the end of the grace period will - * scan the leaf rcu_node structures. Note that rnp->lock must - * not be released. + * has already been recorded -- or if that grace period has in + * fact already started. If there is already a grace period in + * progress in a non-leaf node, no recording is needed because the + * end of the grace period will scan the leaf rcu_node structures. + * Note that rnp_start->lock must not be released. */ - raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf")); - for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); - if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) || - rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) || - (rnp != rnp_root && - rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, + raw_lockdep_assert_held_rcu_node(rnp_start); + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf")); + for (rnp = rnp_start; 1; rnp = rnp->parent) { + if (rnp != rnp_start) + raw_spin_lock_rcu_node(rnp); + if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp->gp_seq, gp_seq_req) || + (rnp != rnp_start && + rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) { + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Prestarted")); goto unlock_out; } - rnp_root->gp_seq_needed = gp_seq_req; - if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + rnp->gp_seq_needed = gp_seq_req; + if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) { /* * We just marked the leaf, and a grace period * is in progress, which means that rcu_gp_cleanup() * will see the marking. Bail to reduce contention. */ - trace_rcu_this_gp(rnp, rdp, gp_seq_req, + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startedleaf")); goto unlock_out; } - if (rnp_root != rnp && rnp_root->parent != NULL) - raw_spin_unlock_rcu_node(rnp_root); - if (!rnp_root->parent) + if (rnp != rnp_start && rnp->parent != NULL) + raw_spin_unlock_rcu_node(rnp); + if (!rnp->parent) break; /* At root, and perhaps also leaf. */ } /* If GP already in progress, just leave, otherwise start one. */ if (rcu_gp_in_progress(rsp)) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ - if (ULONG_CMP_LT(gp_seq_req, rnp_root->gp_seq_needed)) { - rnp->gp_seq_needed = rnp_root->gp_seq_needed; - rdp->gp_seq_needed = rnp_root->gp_seq_needed; + if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { + rnp_start->gp_seq_needed = rnp->gp_seq_needed; + rdp->gp_seq_needed = rnp->gp_seq_needed; } - if (rnp != rnp_root) - raw_spin_unlock_rcu_node(rnp_root); + if (rnp != rnp_start) + raw_spin_unlock_rcu_node(rnp); return ret; } -- cgit From 226ca5e76692e2c82c17e8e8eedab22043f6ffee Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:15 -0700 Subject: rcu: Identify grace period is in progress as we advance up the tree There's no need to keep checking the same starting node for whether a grace period is in progress as we advance up the funnel lock loop. Its sufficient if we just checked it in the start, and then subsequently checked the internal nodes as we advanced up the combining tree. This also makes sense because the grace-period updates propogate from the root to the leaf, so there's a chance we may find a grace period has started as we advance up, lets check for the same. Reported-by: Paul McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1af58f4b8a25..a6863b813f0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1598,11 +1598,12 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, goto unlock_out; } rnp->gp_seq_needed = gp_seq_req; - if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) { + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* - * We just marked the leaf, and a grace period - * is in progress, which means that rcu_gp_cleanup() - * will see the marking. Bail to reduce contention. + * We just marked the leaf or internal node, and a + * grace period is in progress, which means that + * rcu_gp_cleanup() will see the marking. Bail to + * reduce contention. */ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startedleaf")); -- cgit From 962aff03c315b508d980422db5b49b49e4382119 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 12:49:21 -0700 Subject: rcu: Clean up handling of tasks blocked across full-rcu_node offline Commit 0aa04b055e71 ("rcu: Process offlining and onlining only at grace-period start") deferred handling of CPU-hotplug events until the start of the next grace period, but consider the following sequence of events: 1. A task is preempted within an RCU-preempt read-side critical section. 2. The CPU that this task was running on goes offline, along with all other CPUs sharing the corresponding leaf rcu_node structure. 3. The task resumes execution. 4. One of those CPUs comes back online before a new grace period starts. In step 2, the code in the next rcu_gp_init() invocation will (correctly) defer removing the leaf rcu_node structure from the upper-level bitmasks, and will (correctly) set that structure's ->wait_blkd_tasks field. During the ensuing interval, RCU will (correctly) track the tasks preempted on that structure because they must block any subsequent grace period. In step 3, the code in rcu_read_unlock_special() will (correctly) remove the task from the leaf rcu_node structure. From this point forward, RCU need not pay attention to this structure, at least not until one of the corresponding CPUs comes back online. In step 4, the code in the next rcu_gp_init() invocation will (incorrectly) invoke rcu_init_new_rnp(). This is incorrect because the corresponding rcu_cleanup_dead_rnp() was never invoked. This is nevertheless harmless because the upper-level bits are still set. So, no harm, no foul, right? At least, all is well until a little further into rcu_gp_init() invocation, which will notice that there are no longer any tasks blocked on the leaf rcu_node structure, conclude that there is no longer anything left over from step 2's offline operation, and will therefore invoke rcu_cleanup_dead_rnp(). But this invocation of rcu_cleanup_dead_rnp() is for the beginning of the earlier offline interval, and the previous invocation of rcu_init_new_rnp() is for the end of that same interval. That is right, they are invoked out of order. That cannot be good, can it? It turns out that this is not a (correctness!) problem because rcu_cleanup_dead_rnp() checks to see if any of the corresponding CPUs are online, and refuses to do anything if so. In other words, in the case where rcu_init_new_rnp() and rcu_cleanup_dead_rnp() execute out of order, they both have no effect. But this is at best an accident waiting to happen. This commit therefore adds logic to rcu_gp_init() so that rcu_init_new_rnp() and rcu_cleanup_dead_rnp() are always invoked in order, and so that neither are invoked at all in cases where RCU had to pay attention to the leaf rcu_node structure during the entire time that all corresponding CPUs were offline. And, while in the area, this commit reduces confusion by using formal parameters rather than local variables that just happen to have the same value at that particular point in the code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6863b813f0c..9a5ba6db7b60 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1909,12 +1909,14 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ if (!oldmask != !rnp->qsmaskinit) { - if (!oldmask) /* First online CPU for this rcu_node. */ - rcu_init_new_rnp(rnp); - else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ - rnp->wait_blkd_tasks = true; - else /* Last offline CPU and can propagate. */ + if (!oldmask) { /* First online CPU for rcu_node. */ + if (!rnp->wait_blkd_tasks) /* Ever offline? */ + rcu_init_new_rnp(rnp); + } else if (rcu_preempt_has_tasks(rnp)) { + rnp->wait_blkd_tasks = true; /* blocked tasks */ + } else { /* Last offline CPU and can propagate. */ rcu_cleanup_dead_rnp(rnp); + } } /* @@ -1923,14 +1925,13 @@ static bool rcu_gp_init(struct rcu_state *rsp) * still offline, propagate up the rcu_node tree and * clear ->wait_blkd_tasks. Otherwise, if one of this * rcu_node structure's CPUs has since come back online, - * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() - * checks for this, so just call it unconditionally). + * simply clear ->wait_blkd_tasks. */ if (rnp->wait_blkd_tasks && - (!rcu_preempt_has_tasks(rnp) || - rnp->qsmaskinit)) { + (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) { rnp->wait_blkd_tasks = false; - rcu_cleanup_dead_rnp(rnp); + if (!rnp->qsmaskinit) + rcu_cleanup_dead_rnp(rnp); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2450,9 +2451,10 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) return; for (;;) { mask = rnp->grpmask; @@ -2461,7 +2463,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) break; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; - rnp->qsmask &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); if (rnp->qsmaskinit) { raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ @@ -3479,6 +3482,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) struct rcu_node *rnp = rnp_leaf; raw_lockdep_assert_held_rcu_node(rnp); + WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; -- cgit From c50cbe535c972150c2caf923239ef77e85c5ad60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 13:51:57 -0700 Subject: rcu: Fix an obsolete ->qsmaskinit comment Back in the old days, when grace-period initialization blocked CPU hotplug, the ->qsmaskinit mask was indeed updated at the time that a given CPU went offline. However, with the deferral of these updates until the beginning of the next grace period in commit 0aa04b055e71 ("rcu: Process offlining and onlining only at grace-period start"), it is instead ->qsmaskinitnext that gets updated at that time. This commit therefore updates the obsolete comment. It also fixes punctuation while on the topic of comments mentioning ->qsmaskinit. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a5ba6db7b60..05f69b787a57 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2438,7 +2438,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * This function therefore goes up the tree of rcu_node structures, * clearing the corresponding bits in the ->qsmaskinit fields. Note that * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated + * updated. * * This function does check that the specified rcu_node structure has * all CPUs offline and no blocked tasks, so it is OK to invoke it @@ -3709,7 +3709,7 @@ void rcu_cpu_starting(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit + * function. We now remove it from the rcu_node tree's ->qsmaskinitnext * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -- cgit From 8d672fa6bf68ffc36a0c5e4868499f86bbea2308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 14:46:43 -0700 Subject: rcu: Make rcu_init_new_rnp() stop upon already-set bit Currently, rcu_init_new_rnp() walks up the rcu_node combining tree, setting bits in the ->qsmaskinit fields on the way up. It walks up unconditionally, regardless of the initial state of these bits. This is OK because only the corresponding RCU grace-period kthread ever tests or sets these bits during runtime. However, it is also pointless, and it increases both memory and lock contention (albeit only slightly), so this commit stops the walk as soon as an already-set bit is encountered. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 05f69b787a57..3fe854a15d82 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3479,9 +3479,10 @@ EXPORT_SYMBOL_GPL(rcu_barrier_sched); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) { long mask; + long oldmask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; @@ -3489,8 +3490,11 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) if (rnp == NULL) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ + oldmask = rnp->qsmaskinit; rnp->qsmaskinit |= mask; raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ + if (oldmask) + return; } } -- cgit From c74859d1eb2d8578bdf6d78ba893e394085aba1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 14:05:27 -0700 Subject: rcu: Make rcu_report_unblock_qs_rnp() warn on violated preconditions If rcu_report_unblock_qs_rnp() is invoked on something other than preemptible RCU or if there are still preempted tasks blocking the current grace period, something went badly wrong in the caller. This commit therefore adds WARN_ON_ONCE() to these conditions, but leaving the legitimate reason for early exit (rnp->qsmask != 0) unwarned. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3fe854a15d82..85417584ffd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2308,8 +2308,10 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || - rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || + WARN_ON_ONCE(rsp != rcu_state_p) || + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || + rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } -- cgit From 77cfc7bf24ba0ba37be54e224007847d485a860f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 15:00:10 -0700 Subject: rcu: Fix typo and add additional debug This commit fixes a typo and adds some additional debugging to the message emitted when a task blocking the current grace period is listed as blocking it when either that grace period ends or the next grace period begins. This commit also reformats the console message for readability. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85417584ffd0..a4277c1087d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2316,6 +2316,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; /* Still need more quiescent states! */ } + rnp->completedqs = rnp->gp_seq; rnp_p = rnp->parent; if (rnp_p == NULL) { /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3a6e04103de1..677b0c9f548d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -856,8 +856,14 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) struct list_head *lhp; raw_lockdep_assert_held_rcu_node(rnp); - pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); - pr_cont("\t->blkd_tasks"); + pr_info("%s: grp: %d-%d level: %d ->gp_seq %#lx ->completedqs %#lx\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + rnp->gp_seq, rnp->completedqs); + pr_info("%s: ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext); + pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", + __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); -- cgit From 91f63ced7dc4e80acd13386204327d5de00a672d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 15:05:45 -0700 Subject: rcu: Replace smp_wmb() with smp_store_release() for stall check This commit gets rid of the smp_wmb() in record_gp_stall_check_time() in favor of an smp_store_release(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4277c1087d9..439228b79811 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1246,9 +1246,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) unsigned long j1; rsp->gp_start = j; - smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - WRITE_ONCE(rsp->jiffies_stall, j + j1); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ rsp->jiffies_resched = j + j1 / 2; rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } -- cgit From 928164351e700f91ab588f20fe470cac9db477a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 11:07:02 -0700 Subject: rcu: Prevent useless FQS scan after all CPUs have checked in The force_qs_rnp() function checks for ->qsmask being all zero, that is, all CPUs for the current rcu_node structure having already passed through quiescent states. But with RCU-preempt, this is not sufficient to report quiescent states further up the tree, so there are further checks that can initiate RCU priority boosting and also for races with CPU-hotplug operations. However, if neither of these further checks apply, the code proceeds to carry out a useless scan of an all-zero ->qsmask. This commit therefore adds code to release the current rcu_node structure's lock and continue on to the next rcu_node structure, thereby avoiding this useless scan. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 439228b79811..3efd591fcd15 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2672,6 +2672,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_report_unblock_qs_rnp() rlses ->lock */ continue; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; } for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); -- cgit From 5554788e1d4253a92b794a9006b7ae2c10be52af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 10:14:34 -0700 Subject: rcu: Suppress false-positive offline-CPU lockdep-RCU splat The rcu_lockdep_current_cpu_online() function currently checks only the RCU-sched data structures to determine whether or not RCU believes that a given CPU is offline. Unfortunately, there are multiple flavors of RCU, which means that there is a short window of time during which the various flavors disagree as to whether or not a given CPU is offline. This can result in false-positive lockdep-RCU splats in which some other flavor of RCU tries to do something based on its view that the CPU is online, only to get hit with a lockdep-RCU splat because RCU-sched instead believes that the CPU is offline. This commit therefore changes rcu_lockdep_current_cpu_online() to scan all RCU flavors and to consider a given CPU to be online if any of the RCU flavors believe it to be online, thus preventing these false-positive splats. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3efd591fcd15..a7773fc72b0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1030,41 +1030,41 @@ void rcu_request_urgent_qs_task(struct task_struct *t) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* - * Is the current CPU online? Disable preemption to avoid false positives - * that could otherwise happen due to the current CPU number being sampled, - * this task being preempted, its old CPU being taken offline, resuming - * on some other CPU, then determining that its old CPU is now offline. - * It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. Note also that it is OK - * for a CPU coming online to use RCU for one jiffy prior to marking itself - * online in the cpu_online_mask. Similarly, it is OK for a CPU going - * offline to continue to use RCU for one jiffy after marking itself - * offline in the cpu_online_mask. This leniency is necessary given the - * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the teardown - * of the CPU. + * Is the current CPU online as far as RCU is concerned? * - * This is also why RCU internally marks CPUs online during in the - * preparation phase and offline after the CPU has been taken down. + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. Because there are + * multiple flavors of RCU, and because this function can be called in the + * midst of updating the flavors while a given CPU coming online or going + * offline, it is necessary to check all flavors. If any of the flavors + * believe that given CPU is online, it is considered to be online. * - * Disable checking if in an NMI handler because we cannot safely report - * errors from NMI handlers anyway. + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. */ bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; struct rcu_node *rnp; - bool ret; + struct rcu_state *rsp; - if (in_nmi()) + if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable(); - rdp = this_cpu_ptr(&rcu_sched_data); - rnp = rdp->mynode; - ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || - !rcu_scheduler_fully_active; + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { + preempt_enable(); + return true; + } + } preempt_enable(); - return ret; + return false; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -- cgit From fece27760ff57a0ca16c37e144f6a58a2f1851e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 20:04:12 -0700 Subject: rcu: Suppress false-positive preempted-task splats Consider the following sequence of events in a PREEMPT=y kernel: 1. All CPUs corresponding to a given rcu_node structure go offline. A new grace period starts just after the CPU-hotplug code path does its synchronize_rcu() for the last CPU, so at least this CPU is present in that structure's ->qsmask. 2. Before the grace period ends, a CPU comes back online, and not just any CPU, but the one corresponding to a non-zero bit in the leaf rcu_node structure's ->qsmask. 3. A task running on the newly onlined CPU is preempted while in an RCU read-side critical section. Because this CPU's ->qsmask bit is net, not only does this task queue itself on the leaf rcu_node structure's ->blkd_tasks list, it also sets that structure's ->gp_tasks pointer to reference it. 4. The grace period started in #1 above comes to an end. This results in rcu_gp_cleanup() being invoked, which, among other things, checks to make sure that there are no tasks blocking the just-ended grace period, that is, that all ->gp_tasks pointers are NULL. The ->gp_tasks pointer corresponding to the task preempted in #3 above is non-NULL, which results in a splat. This splat is a false positive. The task's RCU read-side critical section cannot have begun before the just-ended grace period because this would mean either: (1) The CPU came online before the grace period started, which cannot have happened because the grace period started before that CPU was all the way offline, or (2) The task started its RCU read-side critical section on some other CPU, but then it would have had to have been preempted before migrating to this CPU, which would mean that it would have instead queued itself on that other CPU's rcu_node structure. This commit eliminates this false positive by adding code to the end of rcu_cleanup_dying_idle_cpu() that reports a quiescent state to RCU, which has the side-effect of clearing that CPU's ->qsmask bit, preventing the above scenario. This approach has the added benefit of more promptly reporting quiescent states corresponding to offline CPUs. Note well that the call to rcu_report_qs_rnp() reporting the quiescent state must come -before- the clearing of this CPU's bit in the leaf rcu_node structure's ->qsmaskinitnext field. Otherwise, lockdep-RCU will complain bitterly about quiescent states coming from an offline CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a7773fc72b0c..72adf97458e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3731,6 +3731,11 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ + /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit From 99990da1b3c00f6c05a330e06a76b9dbc8416d7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 09:45:00 -0700 Subject: rcu: Suppress more involved false-positive preempted-task splats Consider the following sequence of events in a PREEMPT=y kernel: 1. All but one of the CPUs corresponding to a given leaf rcu_node structure go offline. Each of these CPUs clears its bit in that structure's ->qsmaskinitnext field. 2. A new grace period starts, and rcu_gp_init() scans the leaf rcu_node structures, applying CPU-hotplug changes since the start of the previous grace period, including those changes in #1 above. This copies each leaf structure's ->qsmaskinitnext to its ->qsmask field, which represents the CPUs that this new grace period will wait on. Each copy operation is done holding the corresponding leaf rcu_node structure's ->lock, and at the end of this scan, rcu_gp_init() holds no locks. 3. The last CPU corresponding to #1's leaf rcu_node structure goes offline, clearing its bit in that structure's ->qsmaskinitnext field, but not touching the ->qsmaskinit field. Note that rcu_gp_init() is not currently holding any locks! This CPU does -not- report a quiescent state because the grace period has not yet initialized itself sufficiently to have set any bits in any of the leaf rcu_node structures' ->qsmask fields. 4. The rcu_gp_init() function continues initializing the new grace period, copying each leaf rcu_node structure's ->qsmaskinit field to its ->qsmask field while holding the corresponding ->lock. This sets the ->qsmask bit corresponding to #3's CPU. 5. Before the grace period ends, #3's CPU comes back online. Because te grace period has not yet done any force-quiescent-state scans (which would report a quiescent state on behalf of any offline CPUs), this CPU's ->qsmask bit is still set. 6. A task running on the newly onlined CPU is preempted while in an RCU read-side critical section. Because this CPU's ->qsmask bit is net, not only does this task queue itself on the leaf rcu_node structure's ->blkd_tasks list, it also sets that structure's ->gp_tasks pointer to reference it. 7. The grace period started in #1 above comes to an end. This results in rcu_gp_cleanup() being invoked, which, among other things, checks to make sure that there are no tasks blocking the just-ended grace period, that is, that all ->gp_tasks pointers are NULL. The ->gp_tasks pointer corresponding to the task preempted in #3 above is non-NULL, which results in a splat. This splat is a false positive. The task's RCU read-side critical section cannot have begun before the just-ended grace period because this would mean either: (1) The CPU came online before the grace period started, which cannot have happened because the grace period started before that CPU went offline, or (2) The task started its RCU read-side critical section on some other CPU, but then it would have had to have been preempted before migrating to this CPU, which would mean that it would have instead queued itself on that other CPU's rcu_node structure. RCU's grace periods thus are working correctly. Or, more accurately, that remaining bugs in RCU's grace periods are elsewhere. This commit eliminates this false positive by adding code to the end of rcu_cpu_starting() that reports a quiescent state to RCU, which has the side-effect of clearing that CPU's ->qsmask bit, preventing the above scenario. This approach has the added benefit of more promptly reporting quiescent states corresponding to offline CPUs. Nevertheless, this commit does -not- remove the need for the force-quiescent-state scans to check for offline CPUs, given that a CPU might remain offline indefinitely. And without the checks in the force-quiescent-state scans, the grace period would also persist indefinitely, which could result in hangs or memory exhaustion. Note well that the call to rcu_report_qs_rnp() reporting the quiescent state must come -after- the setting of this CPU's bit in the leaf rcu_node structure's ->qsmaskinitnext field. Otherwise, lockdep-RCU will complain bitterly about quiescent states coming from an offline CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 72adf97458e3..6275ed3925e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3710,7 +3710,12 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + /* Report QS -after- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + } else { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } } smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit From 0b107d24d9361132758374a7b007c7c74efa007f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 16:18:28 -0700 Subject: rcu: Suppress false-positive splats from mid-init task resume Consider the following sequence of events in a PREEMPT=y kernel: 1. All CPUs corresponding to a given leaf rcu_node structure are offline. 2. The first phase of the rcu_gp_init() function's grace-period initialization runs, and sets that rcu_node structure's ->qsmaskinit to zero, as it should. 3. One of the CPUs corresponding to that rcu_node structure comes back online. Note that because this CPU came online after the grace period started, this grace period can safely ignore this newly onlined CPU. 4. A task running on the newly onlined CPU enters an RCU-preempt read-side critical section, and is then preempted. Because the corresponding rcu_node structure's ->qsmask is zero, rcu_preempt_ctxt_queue() leaves the rcu_node structure's ->gp_tasks field NULL, as it should. 5. The rcu_gp_init() function continues running the second phase of grace-period initialization. The ->qsmask field of the parent of the aforementioned leaf rcu_node structure is set to not expect a quiescent state from the leaf, as is only right and proper. However, when rcu_gp_init() reaches the leaf, it invokes rcu_preempt_check_blocked_tasks(), which sees that the leaf's ->blkd_tasks list is non-empty, and therefore sets the leaf's ->gp_tasks field to reference the first task on that list. 6. The grace period ends before the preempted task resumes, which is perfectly fine, given that this grace period was under no obligation to wait for that task to exit its late-starting RCU-preempt read-side critical section. Unfortunately, the leaf's ->gp_tasks field is non-NULL, so rcu_gp_cleanup() splats. After all, it appears to rcu_gp_cleanup() that the grace period failed to wait for a task that was supposed to be blocking that grace period. This commit avoids this false-positive splat by adding a check of both ->qsmaskinit and ->wait_blkd_tasks to rcu_preempt_check_blocked_tasks(). If both ->qsmaskinit and ->wait_blkd_tasks are zero, then the task must have entered its RCU-preempt read-side critical section late (after all, the CPU that it is running on was not online at that time), which means that the upper-level rcu_node structure won't be waiting for anything on the leaf anyway. If ->wait_blkd_tasks is non-zero, then there is at least one task on ths rcu_node structure's ->blkd_tasks list whose RCU read-side critical section predates the current grace period. If ->qsmaskinit is non-zero, there is at least one CPU that was online at the start of the current grace period. Thus, if both are zero, there is nothing to wait for. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 677b0c9f548d..1c9a836af5b6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -703,7 +703,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); - if (rcu_preempt_has_tasks(rnp)) { + if (rcu_preempt_has_tasks(rnp) && + (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); -- cgit From ec2c29765a4ab12c236ac5a89b89660222ff6b01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2018 09:34:17 -0700 Subject: rcu: Fix grace-period hangs from mid-init task resume Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario: 1. A task running on a given CPU is preempted in its RCU read-side critical section. 2. That CPU goes offline, and there are now no online CPUs corresponding to that CPU's leaf rcu_node structure. 3. The rcu_gp_init() function does the first phase of grace-period initialization, and sets the aforementioned leaf rcu_node structure's ->qsmaskinit field to all zeroes. Because there is a blocked task, it does not propagate the zeroing of either ->qsmaskinit or ->qsmaskinitnext up the rcu_node tree. 4. The task resumes on some other CPU and exits its critical section. There is no grace period in progress, so the resulting quiescent state is not reported up the tree. 5. The rcu_gp_init() function does the second phase of grace-period initialization, which results in the leaf rcu_node structure being initialized to expect no further quiescent states, but with that structure's parent expecting a quiescent-state report. The parent will never receive a quiescent state from this leaf rcu_node structure, so the grace period will hang, resulting in RCU CPU stall warnings. It would be good to get rid of the special fail-safe quiescent-state propagation checks. This commit therefore checks the leaf rcu_node structure's ->wait_blkd_tasks field during grace-period initialization. If this flag is set, the rcu_report_qs_rnp() is invoked to immediately report the possible quiescent state. While in the neighborhood, this commit also report quiescent states for any CPUs that went offline between the two phases of grace-period initialization, thus reducing grace-period delays and hopefully eventually allowing removal of offline-CPU checks from the force-quiescent-state code path. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6275ed3925e9..7f872721c54e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -154,6 +154,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); @@ -1858,7 +1861,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) */ static bool rcu_gp_init(struct rcu_state *rsp) { + unsigned long flags; unsigned long oldmask; + unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1951,7 +1956,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq_rcu_node(rnp); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1962,7 +1967,12 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq_rcu_node(rnp); + /* Quiescent states for tasks on any now-offline CPUs. */ + mask = rnp->qsmask & ~rnp->qsmaskinitnext; + if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + else + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2233,6 +2243,10 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * is the grace-period snapshot, which means that the quiescent states * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. + * + * As a special case, if mask is zero, the bit-already-cleared check is + * disabled. This allows propagating quiescent state due to resumed tasks + * during grace-period initialization. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, @@ -2246,7 +2260,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { + if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the -- cgit From 1e64b15a4b102e1cd059d4d798b7a78f93341333 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2018 19:23:09 -0700 Subject: rcu: Fix grace-period hangs due to race with CPU offline Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario: 1. CPU 1 goes offline. 2. Because CPU 1 is the only CPU in the system blocking the current grace period, the grace period ends as soon as rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp() returns. 3. At this point, the leaf rcu_node structure's ->lock is no longer held: rcu_report_qs_rnp() has released it, as it must in order to awaken the RCU grace-period kthread. 4. At this point, that same leaf rcu_node structure's ->qsmaskinitnext field still records CPU 1 as being online. This is absolutely necessary because the scheduler uses RCU (in this case on the wake-up path while awakening RCU's grace-period kthread), and ->qsmaskinitnext contains RCU's idea as to which CPUs are online. Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's bit from ->qsmaskinitnext would result in a lockdep-RCU splat due to RCU being used from an offline CPU. 5. RCU's grace-period kthread awakens, sees that the old grace period has completed and that a new one is needed. It therefore starts a new grace period, but because CPU 1's leaf rcu_node structure's ->qsmaskinitnext field still shows CPU 1 as being online, this new grace period is initialized to wait for a quiescent state from the now-offline CPU 1. 6. Without the fail-safe force-quiescent-state checks, there would be no quiescent state from the now-offline CPU 1, which would eventually result in RCU CPU stall warnings and memory exhaustion. It would be good to get rid of the special fail-safe quiescent-state propagation checks, and thus it would be good to fix things so that the above scenario cannot happen. This commit therefore adds a new ->ofl_lock to the rcu_state structure. This lock is held by rcu_gp_init() across the applying of buffered online and offline operations to the rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu() when buffering a new offline operation. This prevents rcu_gp_init() from acquiring the leaf rcu_node structure's lock during the interval between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(), which releases ->lock and the re-acquisition of that same lock. This in turn prevents the failure scenario outlined above, and will hopefully eventually allow removal of the offline-CPU checks from the force-quiescent-state code path. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7f872721c54e..50e4f7ebf8cf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,6 +101,7 @@ struct rcu_state sname##_state = { \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ + .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -1900,11 +1901,13 @@ static bool rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); + spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); continue; } @@ -1940,6 +1943,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); } /* @@ -3749,6 +3753,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ @@ -3757,6 +3762,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + spin_unlock(&rsp->ofl_lock); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3def94fc9c74..6683da6e4ecc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -363,6 +363,10 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + + spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + /* Synchronize offline with */ + /* GP pre-initialization. */ }; /* Values for rcu_state structure's gp_flags field. */ -- cgit From 1f3e5f51b933cbc25e3da0cdbdac40716df04ddb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 10:35:33 -0700 Subject: rcu: Add RCU-preempt check for waiting on newly onlined CPU RCU should only be waiting on CPUs that were online at the time that the current grace period started. Failure to abide by this rule can result in confusing splats during grace-period cleanup and initialization. This commit therefore adds a check to RCU-preempt's preempted-task queuing that checks for waiting on newly onlined CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c9a836af5b6..6b85ce936ad4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -183,6 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); + /* RCU better not be waiting on newly onlined CPUs! */ + WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask & + rdp->grpmask); /* * Decide where to queue the newly blocked task. In theory, -- cgit From f34f2f5852e556ee1c3b3b294571086b1791008a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 13:40:25 -0700 Subject: rcu: Move grace-period pre-init delay after pre-init The main race with the early part of grace-period initialization appears to be with CPU hotplug. To more fully open this race window, this commit moves the rcu_gp_slow() from the beginning of the early initialization loop to follow that loop, thus widening the race window, especially for the rcu_node structures that are initialized last. This commit also expands rcutree.gp_preinit_delay from 3 to 12, giving the same overall delay in the grace period, but concentrated in the spot where it will do the most good. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50e4f7ebf8cf..c577cadcc4f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1900,7 +1900,6 @@ static bool rcu_gp_init(struct rcu_state *rsp) * will handle subsequent offline CPUs. */ rcu_for_each_leaf_node(rsp, rnp) { - rcu_gp_slow(rsp, gp_preinit_delay); spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1945,6 +1944,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); spin_unlock(&rsp->ofl_lock); } + rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ /* * Set the quiescent-state-needed bits in all the rcu_node diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 5d2cc0bd50a0..b79ddb9eb9e8 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -1,5 +1,5 @@ rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 -rcutree.gp_preinit_delay=3 +rcutree.gp_preinit_delay=12 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 -- cgit From 17a8212b8de21ce1b9a91fb75c8a6fb337685b9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 14:30:02 -0700 Subject: rcu: Remove failsafe check for lost quiescent state Now that quiescent-state reporting is fully event-driven, this commit removes the check for a lost quiescent state from force_qs_rnp(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c577cadcc4f8..770f0df54015 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2317,8 +2317,9 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) +static void __maybe_unused +rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { unsigned long gps; @@ -2679,17 +2680,6 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_initiate_boost() releases rnp->lock */ continue; } - if (rnp->parent && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period - * initialization and task exiting RCU - * read-side critical section: Report. - */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); - /* rcu_report_unblock_qs_rnp() rlses ->lock */ - continue; - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } -- cgit From e05121ba5b81e2f85349f038642410578457f6db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2018 12:07:48 -0700 Subject: rcu: Remove CPU-hotplug failsafe from force-quiescent-state code path Now that quiescent states for newly offlined CPUs are reported either when that CPU goes offline or at the end of grace-period initialization, the CPU-hotplug failsafe in the force-quiescent-state code path is no longer needed. This commit therefore removes this failsafe. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 10 +++++----- kernel/rcu/tree.c | 9 +-------- kernel/rcu/tree.h | 1 - 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 759e7e83733d..a8d07feff6a0 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -391,11 +391,11 @@ TRACE_EVENT(rcu_quiescent_state_report, /* * Tracepoint for quiescent states detected by force_quiescent_state(). - * These trace events include the type of RCU, the grace-period number that - * was blocked by the CPU, the CPU itself, and the type of quiescent state, - * which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, "kick" - * when kicking a CPU that has been in dyntick-idle mode for too long, or - * "rqc" if the CPU got a quiescent state via its rcu_qs_ctr. + * These trace events include the type of RCU, the grace-period number + * that was blocked by the CPU, the CPU itself, and the type of quiescent + * state, which can be "dti" for dyntick-idle mode, "kick" when kicking + * a CPU that has been in dyntick-idle mode for too long, or "rqc" if the + * CPU got a quiescent state via its rcu_qs_ctr. */ TRACE_EVENT(rcu_fqs, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 770f0df54015..5f1a11f1f7bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1188,14 +1188,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } - /* Check for the CPU being offline. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("ofl")); - rdp->offline_fqs++; - rcu_gpnum_ovf(rnp, rdp); - return 1; - } - /* * A CPU running for an extended time within the kernel can * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, @@ -3718,6 +3710,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ + rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6683da6e4ecc..795d469c6f67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -217,7 +217,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long cond_resched_completed; /* Grace period that needs help */ /* from cond_resched(). */ -- cgit From ff3cee39088b1931a432587059d66cd505f785dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 12:50:14 -0700 Subject: rcu: Add up-tree information to dump_blkd_tasks() diagnostics This commit updates dump_blkd_tasks() to print out quiescent-state bitmasks for the rcu_node structures further up the tree. This information helps debugging of interactions between CPU-hotplug operations and RCU grace-period initialization. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6b85ce936ad4..f45ff97b0d51 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -858,13 +858,15 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { int i; struct list_head *lhp; + struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); - pr_info("%s: grp: %d-%d level: %d ->gp_seq %#lx ->completedqs %#lx\n", + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - rnp->gp_seq, rnp->completedqs); - pr_info("%s: ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", - __func__, rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext); + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); -- cgit From 577389423187d8b51dfe6199297e579a3419b72b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 14:18:57 -0700 Subject: rcu: Add CPU online/offline state to dump_blkd_tasks() Interactions between CPU-hotplug operations and grace-period initialization can result in dump_blkd_tasks(). One of the first debugging actions in this case is to search back in dmesg to work out which of the affected rcu_node structure's CPUs are online and to determine the last CPU-hotplug operation affecting any of those CPUs. This can be laborious and error-prone, especially when console output is lost. This commit therefore causes dump_blkd_tasks() to dump the state of the affected rcu_node structure's CPUs and the last grace period during which the last offline and online operation affected each of these CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++++++-- kernel/rcu/tree.h | 12 +++++++++--- kernel/rcu/tree_plugin.h | 25 ++++++++++++++++++++----- 3 files changed, 39 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f1a11f1f7bc..a2503ef1bbe2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1954,7 +1954,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rnp); + rcu_preempt_check_blocked_tasks(rsp, rnp); rnp->qsmask = rnp->qsmaskinit; WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) @@ -2063,7 +2063,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rnp, 10); + dump_blkd_tasks(rsp, rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); @@ -3516,6 +3516,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); + rdp->rcu_ofl_gp_seq = rsp->gp_seq; + rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_seq = rsp->gp_seq; + rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -3711,6 +3715,8 @@ void rcu_cpu_starting(unsigned int cpu) /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ + rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); @@ -3738,6 +3744,8 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 795d469c6f67..f52bc059bfec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -255,12 +255,16 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 7) RCU CPU stall data. */ + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ + unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ + short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ + short rcu_onl_gp_flags; /* ->gp_flags at last online. */ int cpu; struct rcu_state *rsp; @@ -431,11 +435,13 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, + struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, + int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f45ff97b0d51..613372246a07 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -699,13 +699,14 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rnp, 10); + dump_blkd_tasks(rsp, rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; @@ -854,10 +855,14 @@ void exit_rcu(void) * Dump the blocked-tasks state, but limit the list dump to the * specified number of elements. */ -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) { + int cpu; int i; struct list_head *lhp; + bool onl; + struct rcu_data *rdp; struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); @@ -877,6 +882,14 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) break; } pr_cont("\n"); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + rdp = per_cpu_ptr(rsp->rda, cpu); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", + cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + } } #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -949,7 +962,8 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { WARN_ON_ONCE(rnp->qsmask); } @@ -990,7 +1004,8 @@ void exit_rcu(void) /* * Dump the guaranteed-empty blocked-tasks state. Trust but verify. */ -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) { WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } -- cgit From fea3f222d3523dfdd0e86b11227d3cda20765102 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 15:47:30 -0700 Subject: rcu: Record ->gp_state for both phases of grace-period initialization Grace-period initialization first processes any recent CPU-hotplug operations, and then initializes state for the new grace period. These two phases of initialization are currently not distinguished in debug prints, but the distinction is valuable in a number of debug situations. This commit therefore introduces two new values for ->gp_state, RCU_GP_ONOFF and RCU_GP_INIT, in order to make this distinction. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2503ef1bbe2..ee218d743226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1891,6 +1891,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * for subsequent online CPUs, and that quiescent-state forcing * will handle subsequent offline CPUs. */ + rsp->gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rsp, rnp) { spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); @@ -1950,6 +1951,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ + rsp->gp_state = RCU_GP_INIT; rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f52bc059bfec..8077aff7ab40 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -380,16 +380,20 @@ struct rcu_state { #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ -#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ -#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ -#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ -#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#define RCU_GP_ONOFF 3 /* Grace-period initialization hotplug. */ +#define RCU_GP_INIT 4 /* Grace-period initialization. */ +#define RCU_GP_WAIT_FQS 5 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 6 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ #ifndef RCU_TREE_NONCORE static const char * const gp_state_names[] = { "RCU_GP_IDLE", "RCU_GP_WAIT_GPS", "RCU_GP_DONE_GPS", + "RCU_GP_ONOFF", + "RCU_GP_INIT", "RCU_GP_WAIT_FQS", "RCU_GP_DOING_FQS", "RCU_GP_CLEANUP", -- cgit From f2e2df59786d7bd52e6e7e2d10c1c6ba433a0ee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 16:23:23 -0700 Subject: rcu: Add diagnostics for offline CPUs failing to report QS CPUs are expected to report quiescent states when coming online and when going offline, and grace-period initialization is supposed to handle any race conditions where a CPU's ->qsmask bit is set just after it goes offline. This commit adds diagnostics for the case where an offline CPU nevertheless has a grace period waiting on it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 22 ++++++++++++++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ee218d743226..d3333ee2c6f5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1188,6 +1188,27 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } + /* If waiting too long on an offline CPU, complain. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && + time_after(jiffies, rdp->rsp->gp_start + HZ)) { + bool onl; + struct rcu_node *rnp1; + + WARN_ON(1); /* Offline CPUs are supposed to report QS! */ + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", + __func__, rdp->cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + return 1; /* Break things loose after complaining. */ + } + /* * A CPU running for an extended time within the kernel can * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, @@ -1967,6 +1988,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); /* Quiescent states for tasks on any now-offline CPUs. */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; + rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); else diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8077aff7ab40..d51e6edc8e83 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -90,6 +90,7 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ + unsigned long rcu_gp_init_mask; /* Mask of offline CPUs at GP init. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ -- cgit From 3949fa9bac090ad217534c30bc3b6572289abf21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 15:29:10 -0700 Subject: rcu: Make rcu_read_unlock_special() static Because rcu_read_unlock_special() is no longer used outside of kernel/rcu/tree_plugin.h, this commit makes it static. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 - kernel/rcu/tree_plugin.h | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 65163aa0bb04..67ec077c7ee5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -64,7 +64,6 @@ void rcu_barrier_tasks(void); void __rcu_read_lock(void); void __rcu_read_unlock(void); -void rcu_read_unlock_special(struct task_struct *t); void synchronize_rcu(void); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 613372246a07..54a251640f53 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -127,6 +127,7 @@ static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); +static void rcu_read_unlock_special(struct task_struct *t); /* * Tell them what RCU they are running. @@ -461,7 +462,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -void rcu_read_unlock_special(struct task_struct *t) +static void rcu_read_unlock_special(struct task_struct *t) { bool empty_exp; bool empty_norm; -- cgit From 07f27570dcd148a5f4de7dc3513c1d1cd069b362 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 11 May 2018 17:30:34 +0900 Subject: rcu: Improve rcu_note_voluntary_context_switch() reporting We expect a quiescent state of TASKS_RCU when cond_resched_tasks_rcu_qs() is called, no matter whether it actually be scheduled or not. However, it currently doesn't report the quiescent state when the task enters into __schedule() as it's called with preempt = true. So make it report the quiescent state unconditionally when cond_resched_tasks_rcu_qs() is called. And in TINY_RCU, even though the quiescent state of rcu_bh also should be reported when the tick interrupt comes from user, it doesn't. So make it reported. Lastly in TREE_RCU, rcu_note_voluntary_context_switch() should be reported when the tick interrupt comes from not only user but also idle, as an extended quiescent state. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney [ paulmck: Simplify rcutiny portion given no RCU-tasks for !PREEMPT. ] --- include/linux/rcupdate.h | 4 ++-- kernel/rcu/tiny.c | 4 +--- kernel/rcu/tree.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 67ec077c7ee5..5cab15e7ec83 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -194,8 +194,8 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - if (!cond_resched()) \ - rcu_note_voluntary_context_switch_lite(current); \ + rcu_note_voluntary_context_switch_lite(current); \ + cond_resched(); \ } while (0) /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index a64eee0db39e..befc9321a89c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -122,10 +122,8 @@ void rcu_check_callbacks(int user) { if (user) rcu_sched_qs(); - else if (!in_softirq()) + if (user || !in_softirq()) rcu_bh_qs(); - if (user) - rcu_note_voluntary_context_switch(current); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3333ee2c6f5..19beabe73629 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2645,6 +2645,7 @@ void rcu_check_callbacks(int user) rcu_sched_qs(); rcu_bh_qs(); + rcu_note_voluntary_context_switch(current); } else if (!in_softirq()) { @@ -2660,8 +2661,7 @@ void rcu_check_callbacks(int user) rcu_preempt_check_callbacks(); if (rcu_pending()) invoke_rcu_core(); - if (user) - rcu_note_voluntary_context_switch(current); + trace_rcu_utilization(TPS("End scheduler-tick")); } -- cgit From a7538352da722fae5cc95ae6656ea2013f5b8b21 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 14 May 2018 13:27:33 -0700 Subject: rcu: Use pr_fmt to prefix "rcu: " to logging output This commit also adjusts some whitespace while in the area. Signed-off-by: Joe Perches Signed-off-by: Paul E. McKenney [ paulmck: Revert string-breaking %s as requested by Andy Shevchenko. ] --- kernel/rcu/rcuperf.c | 7 +++---- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/srcutree.c | 5 ++++- kernel/rcu/tree.c | 8 +++++--- kernel/rcu/tree_plugin.h | 10 ++++++---- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b080bc4a4f45..00e395c0d7d0 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -680,12 +680,11 @@ rcu_perf_init(void) break; } if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", - perf_type); + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); pr_alert("rcu-perf types:"); for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_alert(" %s", perf_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0481c7286875..90a94fecdd73 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1755,8 +1755,8 @@ rcu_torture_init(void) torture_type); pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", torture_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d6d6ea9738c0..e526b56998af 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "rcu: " fmt + #include #include #include @@ -390,7 +392,8 @@ void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", + __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 19beabe73629..6f2922168216 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -27,6 +27,9 @@ * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ + +#define pr_fmt(fmt) "rcu: " fmt + #include #include #include @@ -1374,8 +1377,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", - rsp->name); + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4048,7 +4050,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", + pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 54a251640f53..dbfe90191e19 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -74,8 +74,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - RCU_FANOUT); + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n", + RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) @@ -88,11 +88,13 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); if (rcu_fanout_leaf != RCU_FANOUT_LEAF) - pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", + rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST - pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", + kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif if (blimit != DEFAULT_RCU_BLIMIT) pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); -- cgit From 6f56f714db067056c80f5d71510118f82872e34c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 May 2018 13:52:27 -0700 Subject: rcu: Improve RCU-tasks naming and comments The naming and comments associated with some RCU-tasks code make the faulty assumption that context switches due to cond_resched() are voluntary. As several people pointed out, this is not the case. This commit therefore updates function names and comments to better reflect current reality. Reported-by: Byungchul Park Reported-by: Joel Fernandes Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 12 ++++++------ include/linux/rcutiny.h | 2 +- kernel/rcu/tree.c | 2 +- kernel/rcu/update.c | 27 ++++++++++++++------------- 4 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dacc90358b33..75e5b393cf44 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -158,11 +158,11 @@ static inline void rcu_init_nohz(void) { } } while (0) /* - * Note a voluntary context switch for RCU-tasks benefit. This is a - * macro rather than an inline function to avoid #include hell. + * Note a quasi-voluntary context switch for RCU-tasks's benefit. + * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU -#define rcu_note_voluntary_context_switch_lite(t) \ +#define rcu_tasks_qs(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ @@ -170,14 +170,14 @@ static inline void rcu_init_nohz(void) { } #define rcu_note_voluntary_context_switch(t) \ do { \ rcu_all_qs(); \ - rcu_note_voluntary_context_switch_lite(t); \ + rcu_tasks_qs(t); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU */ -#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_tasks_qs(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() #define call_rcu_tasks call_rcu_sched #define synchronize_rcu_tasks synchronize_sched @@ -194,7 +194,7 @@ static inline void exit_tasks_rcu_finish(void) { } */ #define cond_resched_tasks_rcu_qs() \ do { \ - rcu_note_voluntary_context_switch_lite(current); \ + rcu_tasks_qs(current); \ cond_resched(); \ } while (0) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 7b3c82e8a625..8d9a0ea8f0b5 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -93,7 +93,7 @@ static inline void kfree_call_rcu(struct rcu_head *head, #define rcu_note_context_switch(preempt) \ do { \ rcu_sched_qs(); \ - rcu_note_voluntary_context_switch_lite(current); \ + rcu_tasks_qs(current); \ } while (0) static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6f2922168216..ccc061acf887 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -457,7 +457,7 @@ void rcu_note_context_switch(bool preempt) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_dynticks.rcu_qs_ctr); if (!preempt) - rcu_note_voluntary_context_switch_lite(current); + rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c230a60ece4..5783bdf86e5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -507,14 +507,15 @@ early_initcall(check_cpu_stall_init); #ifdef CONFIG_TASKS_RCU /* - * Simple variant of RCU whose quiescent states are voluntary context switch, - * user-space execution, and idle. As such, grace periods can take one good - * long time. There are no read-side primitives similar to rcu_read_lock() - * and rcu_read_unlock() because this implementation is intended to get - * the system into a safe state for some of the manipulations involved in - * tracing and the like. Finally, this implementation does not support - * high call_rcu_tasks() rates from multiple CPUs. If this is required, - * per-CPU callback lists will be needed. + * Simple variant of RCU whose quiescent states are voluntary context + * switch, cond_resched_rcu_qs(), user-space execution, and idle. + * As such, grace periods can take one good long time. There are no + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() + * because this implementation is intended to get the system into a safe + * state for some of the manipulations involved in tracing and the like. + * Finally, this implementation does not support high call_rcu_tasks() + * rates from multiple CPUs. If this is required, per-CPU callback lists + * will be needed. */ /* Global list of callbacks and associated lock. */ @@ -542,11 +543,11 @@ static struct task_struct *rcu_tasks_kthread_ptr; * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_tasks() assumes * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), entry into idle, or transition to usermode - * execution. As such, there are no read-side primitives analogous to - * rcu_read_lock() and rcu_read_unlock() because this primitive is intended - * to determine that all tasks have passed through a safe state, not so - * much for data-strcuture synchronization. + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. -- cgit From 15651201fa055ec81d3669b36ab7c2fb12c3ce36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 14:41:41 -0700 Subject: rcu: Mark task as .need_qs less aggressively If any scheduling-clock interrupt interrupts an RCU-preempt read-side critical section, the interrupted task's ->rcu_read_unlock_special.b.need_qs field is set. This causes the outermost rcu_read_unlock() to incur the extra overhead of calling into rcu_read_unlock_special(). This commit reduces that overhead by setting ->rcu_read_unlock_special.b.need_qs only if the grace period has been in effect for more than one second. Why one second? Because this is comfortably smaller than the minimum RCU CPU stall-warning timeout of three seconds, but long enough that the .need_qs marking should happen quite rarely. And if your RCU read-side critical section has run on-CPU for a full second, it is not unreasonable to invest some CPU time in ending the grace period quickly. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dbfe90191e19..0239cf8a4be6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -730,6 +730,7 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) */ static void rcu_preempt_check_callbacks(void) { + struct rcu_state *rsp = &rcu_preempt_state; struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { @@ -738,7 +739,9 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) && + !t->rcu_read_unlock_special.b.need_qs && + time_after(jiffies, rsp->gp_start + HZ)) t->rcu_read_unlock_special.b.need_qs = true; } -- cgit From 3b57a3994f330a102b91bd70c84c9530c0ae50f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 16:01:56 -0700 Subject: rcu: Inline rcu_dynticks_momentary_idle() into its sole caller The rcu_dynticks_momentary_idle() function is invoked only from rcu_momentary_dyntick_idle(), and neither function is particularly large. This commit therefore saves a few lines by inlining rcu_dynticks_momentary_idle() into rcu_momentary_dyntick_idle(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ccc061acf887..ebdbb5f96e5c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -385,20 +385,6 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) return snap != rcu_dynticks_snap(rdtp); } -/* - * Do a double-increment of the ->dynticks counter to emulate a - * momentary idle-CPU quiescent state. - */ -static void rcu_dynticks_momentary_idle(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &rdtp->dynticks); - - /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); -} - /* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the @@ -430,12 +416,17 @@ bool rcu_eqs_special_set(int cpu) * * We inform the RCU core by emulating a zero-duration dyntick-idle period. * - * The caller must have disabled interrupts. + * The caller must have disabled interrupts and must not be idle. */ static void rcu_momentary_dyntick_idle(void) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); - rcu_dynticks_momentary_idle(); + special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } /* -- cgit From c7037ff5249cee237b8c8a6f99998ae4f3916b60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 18:00:17 -0700 Subject: rcu: Clarify and correct the rcu_preempt_qs() header comment The rcu_preempt_qs() function only applies to the CPU, not the task. A task really is allowed to invoke this function while in an RCU-preempt read-side critical section, but only if it has first added itself to some leaf rcu_node structure's ->blkd_tasks list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0239cf8a4be6..07d1ad175994 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -294,13 +294,17 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) } /* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * not in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. + * Record a preemptible-RCU quiescent state for the specified CPU. + * Note that this does not necessarily mean that the task currently running + * on the CPU is in a quiescent state: Instead, it means that the current + * grace period need not wait on any RCU read-side critical section that + * starts later on this CPU. It also means that if the current task is + * in an RCU read-side critical section, it has already added itself to + * some leaf rcu_node structure's ->blkd_tasks list. In addition to the + * current task, there might be any number of other tasks blocked while + * in an RCU read-side critical section. * - * As with the other rcu_*_qs() functions, callers to this function - * must disable preemption. + * Callers to this function must disable preemption. */ static void rcu_preempt_qs(void) { -- cgit From 164ba3fc4864346dbc365f8b89d8888e1b6cd38c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 20:41:36 -0700 Subject: rcu: Remove unused rcu_kick_nohz_cpu() function The rcu_kick_nohz_cpu() function is no longer used, and the functionality it used to provide is now provided by a call to resched_cpu() in the force-quiescent-state function rcu_implicit_dynticks_qs(). This commit therefore removes rcu_kick_nohz_cpu(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d51e6edc8e83..4e74df768c57 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -483,7 +483,6 @@ static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 07d1ad175994..75a91d58b8f7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2645,23 +2645,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -/* - * An adaptive-ticks CPU can potentially execute in kernel mode for an - * arbitrarily long period of time with the scheduling-clock tick turned - * off. RCU will be paying attention to this CPU because it is in the - * kernel, but the CPU cannot be guaranteed to be executing the RCU state - * machine because the scheduling-clock tick has been disabled. Therefore, - * if an adaptive-ticks CPU is failing to respond to the current grace - * period and has not be idle from an RCU perspective, kick it. - */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(cpu)) - smp_send_reschedule(cpu); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ -} - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit From ab6b82147f471e31d9397c95d523631b6c8953f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:21:20 -0700 Subject: rcu: Remove unused local variable "cpu" One danger of using __maybe_unused is that the compiler doesn't yell at you when you remove the last reference, witness rcu_bind_gp_kthread() and its local variable "cpu". This commit removes this local variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75a91d58b8f7..2cc9bf0d363a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2670,8 +2670,6 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { - int __maybe_unused cpu; - if (!tick_nohz_full_enabled()) return; housekeeping_affine(current, HK_FLAG_RCU); -- cgit From 95394e69c42f0da83a176936fdb28f6cac57ea69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from panic_on_rcu_stall() and rcu_blocking_is_gp() These functions are in kernel/rcu/tree.c, which is not an include file, so there is no problem dropping the "inline", especially given that these functions are nowhere near a fastpath. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ebdbb5f96e5c..3504ee35e226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1342,7 +1342,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } -static inline void panic_on_rcu_stall(void) +static void panic_on_rcu_stall(void) { if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); @@ -3080,7 +3080,7 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. */ -static inline int rcu_blocking_is_gp(void) +static int rcu_blocking_is_gp(void) { int ret; -- cgit From eac45e586cd38a1b56aa716560002e68741b78a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from rcu_torture_print_module_parms() This function is in rcutorture.c, which is not an include file, so there is no problem dropping the "inline", especially given that this function is invoked only twice per rcutorture run. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 90a94fecdd73..57a4277ccc63 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1359,7 +1359,7 @@ rcu_torture_stats(void *arg) return 0; } -static inline void +static void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG -- cgit From 9622179519c52ead944c3b6a07aed9c6db3659e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from rcu_perf_print_module_parms() This function is in rcuperf.c, which is not an include file, so there is no problem dropping the "inline", especially given that this function is invoked only twice per rcuperf run. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 00e395c0d7d0..3e86940245d9 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -536,7 +536,7 @@ retry: return 0; } -static inline void +static void rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG -- cgit From 51fbb910f52c8559a78665d203e55ab2b95e7126 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 13:15:40 -0700 Subject: rcu: Remove __maybe_unused from rcu_cpu_has_callbacks() The rcu_cpu_has_callbacks() function is now used in all configurations, so this commit removes the __maybe_unused. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3504ee35e226..cdc4fca0c4cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3315,7 +3315,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; -- cgit From b06ae25a1e2b54b2b5bc589a4a118b7bb39159fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 13:32:51 -0700 Subject: rcu: Use RCU CPU stall timeout for rcu_check_gp_start_stall() Currently, rcu_check_gp_start_stall() waits for one second after the first request before complaining that a grace period has not yet started. This was desirable while testing the conversion from ->future_gp_needed[] to ->gp_seq_needed, but it is a bit on the hair-trigger side for production use under heavy load. This commit therefore makes this wait time be exactly that of the RCU CPU stall warning, allowing easy adjustment of both timeouts to suit the distribution or installation at hand. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cdc4fca0c4cb..7746fe1ee3fc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2753,6 +2753,7 @@ static void rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -2762,8 +2763,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || - time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || atomic_read(&warned)) return; @@ -2771,8 +2772,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, j = jiffies; if (rcu_gp_in_progress(rsp) || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || - time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || atomic_read(&warned)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -2784,18 +2785,18 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, j = jiffies; if (rcu_gp_in_progress(rsp) || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rsp->gp_req_activity + HZ) || - time_before(j, rsp->gp_activity + HZ) || + time_before(j, rsp->gp_req_activity + gpssdelay) || + time_before(j, rsp->gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", __func__, (long)READ_ONCE(rsp->gp_seq), (long)READ_ONCE(rnp_root->gp_seq_needed), j - rsp->gp_req_activity, j - rsp->gp_activity, - rsp->gp_flags, rsp->name, + rsp->gp_flags, rsp->gp_state, rsp->name, rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) -- cgit From 0d805a70a652a6eef8d0283e5183879e7acb85ad Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:13 -0700 Subject: rcu: Add comment documenting how rcu_seq_snap works rcu_seq_snap may be tricky to decipher. Lets document how it works with an example to make it easier. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney [ paulmck: Shrink comment as suggested by Peter Zijlstra. ] --- kernel/rcu/rcu.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index aa215d6355f8..89f13fffac73 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -91,7 +91,17 @@ static inline void rcu_seq_end(unsigned long *sp) WRITE_ONCE(*sp, rcu_seq_endval(sp)); } -/* Take a snapshot of the update side's sequence number. */ +/* + * rcu_seq_snap - Take a snapshot of the update side's sequence number. + * + * This function returns the earliest value of the grace-period sequence number + * that will indicate that a full grace period has elapsed since the current + * time. Once the grace-period sequence number has reached this value, it will + * be safe to invoke all callbacks that have been registered prior to the + * current time. This value is the current grace-period number plus two to the + * power of the number of low-order bits reserved for state, then rounded up to + * the next value in which the state bits are all zero. + */ static inline unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; -- cgit From c03be752d39dc64dcfda0ac8ce87fb10b1ee5621 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:49:46 -0400 Subject: rcu: Speed up calling of RCU tasks callbacks Joel Fernandes found that the synchronize_rcu_tasks() was taking a significant amount of time. He demonstrated it with the following test: # cd /sys/kernel/tracing # while [ 1 ]; do x=1; done & # echo '__schedule_bug:traceon' > set_ftrace_filter # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m1.064s user 0m0.000s sys 0m0.004s Where it takes a little over a second to perform the synchronize, because there's a loop that waits 1 second at a time for tasks to get through their quiescent points when there's a task that must be waited for. After discussion we came up with a simple way to wait for holdouts but increase the time for each iteration of the loop but no more than a full second. With the new patch we have: # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m0.131s user 0m0.000s sys 0m0.004s Which drops it down to 13% of what the original wait time was. Link: http://lkml.kernel.org/r/20180523063815.198302-2-joel@joelfernandes.org Reported-by: Joel Fernandes (Google) Suggested-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5783bdf86e5a..4c7c49c106ee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -668,6 +668,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -749,13 +750,25 @@ static int __noreturn rcu_tasks_kthread(void *arg) * holdouts. When the list is empty, we are done. */ lastreport = jiffies; - while (!list_empty(&rcu_tasks_holdouts)) { + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { bool firstreport; bool needreport; int rtst; struct task_struct *t1; - schedule_timeout_interruptible(HZ); + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); -- cgit From cd23ac8ddb7be993f88bee893b89a8b4971c3651 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:58:16 -0400 Subject: rcu: Add comment to the last sleep in the rcu tasks loop At the end of rcu_tasks_kthread() there's a lonely schedule_timeout_uninterruptible() call with no apparent rationale for its existence. But there is. It is to keep the thread from going into a tight loop if there's some anomaly. That really needs a comment. Link: http://lkml.kernel.org/r/20180524223839.GU3803@linux.vnet.ibm.com Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c7c49c106ee..39cb23d22109 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -814,6 +814,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) list = next; cond_resched(); } + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_uninterruptible(HZ/10); } } -- cgit From 47199a0812535217c29933cecf468568bb37f933 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2018 10:33:08 -0700 Subject: rcu: Add diagnostics for rcutorture writer stall warning This commit adds any in-the-future ->gp_seq_needed fields to the diagnostics for an rcutorture writer stall warning message. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7746fe1ee3fc..4915525559ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -606,11 +606,32 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); */ void show_rcu_gp_kthreads(void) { + int cpu; + struct rcu_data *rdp; + struct rcu_node *rnp; struct rcu_state *rsp; for_each_rcu_flavor(rsp) { pr_info("%s: wait state: %d ->state: %#lx\n", rsp->name, rsp->gp_state, rsp->gp_kthread->state); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_GE(rsp->gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", + rnp->grplo, rnp->grphi, rnp->gp_seq, + rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rsp->gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %lu\n", + cpu, rdp->gp_seq_needed); + } + } /* sched_show_task(rsp->gp_kthread); */ } } -- cgit From 67abb96cbf307e16e3c6d1a0328ece085b5ce94c Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 1 Jun 2018 11:03:09 +0900 Subject: rcu: Check the range of jiffies_till_{first,next}_fqs when setting them Currently, the range of jiffies_till_{first,next}_fqs are checked and adjusted on and on in the loop of rcu_gp_kthread on runtime. However, it's enough to check them only when setting them, not every time in the loop. So make them handled on a setting time via sysfs. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4915525559ac..7498a416f63b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -510,8 +510,38 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; -module_param(jiffies_till_first_fqs, ulong, 0644); -module_param(jiffies_till_next_fqs, ulong, 0644); +static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); + return ret; +} + +static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + return ret; +} + +static struct kernel_param_ops first_fqs_jiffies_ops = { + .set = param_set_first_fqs_jiffies, + .get = param_get_ulong, +}; + +static struct kernel_param_ops next_fqs_jiffies_ops = { + .set = param_set_next_fqs_jiffies, + .get = param_get_ulong, +}; + +module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644); +module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); /* @@ -2180,10 +2210,6 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle quiescent-state forcing. */ first_gp_fqs = true; j = jiffies_till_first_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_first_fqs = HZ; - } ret = 0; for (;;) { if (!ret) { @@ -2218,13 +2244,6 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; - } } else { /* Deal with stray signal. */ cond_resched_tasks_rcu_qs(); -- cgit From 2ee5aca54622aacc196106c623fea4116f1043a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 9 Jun 2018 01:22:20 -0700 Subject: rcu: Make rcu_seq_diff() more exact The current implementatation of rcu_seq_diff() follows tradition in providing a rough-and-ready approximation of the number of elapsed grace periods between the two rcu_seq values. However, this difference is used to flag RCU-failure "near misses", which can be a valuable debugging aid, so more exactitude would be an improvement. This commit therefore improves the accuracy of rcu_seq_diff(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 89f13fffac73..d5e0294b8580 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -158,7 +158,20 @@ static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) */ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) { - return (new - old) >> RCU_SEQ_CTR_SHIFT; + unsigned long rnd_diff; + + if (old == new) + return 0; + /* + * Compute the number of grace periods (still shifted up), plus + * one if either of new and old is not an exact grace period. + */ + rnd_diff = (new & ~RCU_SEQ_STATE_MASK) - + ((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK) + + ((new & RCU_SEQ_STATE_MASK) || (old & RCU_SEQ_STATE_MASK)); + if (ULONG_CMP_GE(RCU_SEQ_STATE_MASK, rnd_diff)) + return 1; /* Definitely no grace period has elapsed. */ + return ((rnd_diff - RCU_SEQ_STATE_MASK - 1) >> RCU_SEQ_CTR_SHIFT) + 2; } /* -- cgit From 89b4cd4b9ebf15b01f70b85105ea5f5c6b2a3788 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jun 2018 12:29:16 -0700 Subject: rcu: Print stall-warning NMI dyntick state in hexadecimal The ->dynticks_nmi_nesting field records the nesting depth of both interrupt and NMI handlers. Because the kernel can enter interrupts and never leave them (and vice versa) and because NMIs can interrupt manipulation of the ->dynticks_nmi_nesting field, the values in this field must be both chosen and maniupated very carefully. As a result, although the value is zero when the corresponding CPU is executing neither an interrupt nor an NMI handler, it is 4,611,686,018,427,387,906 on 64-bit systems when there is a single level of interrupt/NMI handling in progress. This number is difficult to remember and interpret, so this commit switches the output to hexadecimal, resulting in the much nicer 0x4000000000000002. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2cc9bf0d363a..c1b17f5b9361 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1801,7 +1801,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], -- cgit From 52e17ba1d063ab6adb367f288babd380e30bad46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2018 08:54:37 -0700 Subject: srcu: Add grace-period number to rcutorture statistics printout This commit adds the SRCU grace-period number to the rcutorture statistics printout, which allows it to be compared to the rcutorture "Writer stall state" message. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e526b56998af..6c9866a854b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1268,7 +1268,8 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long s0 = 0, s1 = 0; idx = sp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", + tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; -- cgit From c7cd161ecb2188c07ba9560ca82aee756575359f Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:17 -0700 Subject: rcu: Assign higher prio to RCU threads if rcutorture is built-in The rcutorture RCU priority boosting tests fail even with CONFIG_RCU_BOOST set because rcutorture's threads run at the same priority as the default RCU kthreads (RT class with priority of 1). This patch checks if RCU torture is built into the kernel and if so, assigns RT priority 1 to the RCU threads, allowing the rcutorture boost tests to pass. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7498a416f63b..8143b8d40a6c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3923,12 +3923,16 @@ static int __init rcu_spawn_gp_kthread(void) struct task_struct *t; /* Force priority into range. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) kthread_prio = 1; else if (kthread_prio < 0) kthread_prio = 0; else if (kthread_prio > 99) kthread_prio = 99; + if (kthread_prio != kthread_prio_in) pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", kthread_prio, kthread_prio_in); -- cgit From 028be12b294e3a059e6fc06852d458fdc82717ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 09:20:34 -0700 Subject: rcutorture: Change units of onoff_interval to jiffies Some RCU bugs have been sensitive to the frequency of CPU-hotplug operations, which have been gradually increased over time. But this frequency is now at the one-second lower limit that can be specified using the rcutorture.onoff_interval kernel parameter. This commit therefore changes the units of rcutorture.onoff_interval from seconds to jiffies, and also sets the value specified for this kernel parameter in the TREE03 rcutorture scenario to 200, which is 200 milliseconds for HZ=1000. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- kernel/rcu/rcutorture.c | 4 ++-- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 2 +- tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index efc7aa7a0670..77bd3e635313 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3632,8 +3632,8 @@ Set time (s) after boot for CPU-hotplug testing. rcutorture.onoff_interval= [KNL] - Set time (s) between CPU-hotplug operations, or - zero to disable CPU-hotplug testing. + Set time (jiffies) between CPU-hotplug operations, + or zero to disable CPU-hotplug testing. rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0481c7286875..eb6d4915b4e6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -87,7 +87,7 @@ torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); + "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -1889,7 +1889,7 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index b79ddb9eb9e8..5c3213cc3ad7 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -1,4 +1,4 @@ -rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 +rcutorture.onoff_interval=200 rcutorture.onoff_holdoff=30 rcutree.gp_preinit_delay=12 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh index 24ec91041957..7bab8246392b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh @@ -39,7 +39,7 @@ rcutorture_param_onoff () { if ! bootparam_hotplug_cpu "$1" && configfrag_hotplug_cpu "$2" then echo CPU-hotplug kernel, adding rcutorture onoff. 1>&2 - echo rcutorture.onoff_interval=3 rcutorture.onoff_holdoff=30 + echo rcutorture.onoff_interval=1000 rcutorture.onoff_holdoff=30 fi } -- cgit From 6bea2cc5a97b7e9677088b1a93e27edb74ae0e55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 15:30:36 -0700 Subject: rcu: Remove rcutorture test version and sequence number Back when RCU had a debugfs interface, there was a test version and sequence number that allowed associating debugfs data with a particular test run, where the test run started with modprobe and ended with rmmod, which was how tests were run back on the old ABAT system within IBM. But rcutorture testing no longer runs on ABAT, and there is no longer an RCU debugfs interface, so there is no longer any need for test versions and sequence numbers. This commit therefore removes the rcutorture_record_test_transition() and rcutorture_record_progress() functions, and along with them the rcutorture_testseq and rcutorture_vernum variables that they update. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ---- kernel/rcu/rcutorture.c | 4 +--- kernel/rcu/tree.c | 37 ------------------------------------- 3 files changed, 1 insertion(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index aa215d6355f8..0453a7d12b3f 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -444,7 +444,6 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); -void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -458,7 +457,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *gp_seq = 0; } -static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, @@ -505,8 +503,6 @@ static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ -extern unsigned long rcutorture_testseq; -extern unsigned long rcutorture_vernum; unsigned long rcu_get_gp_seq(void); unsigned long rcu_bh_get_gp_seq(void); unsigned long rcu_sched_get_gp_seq(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eb6d4915b4e6..335387fabac2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1016,7 +1016,7 @@ rcu_torture_writer(void *arg) break; } } - rcutorture_record_progress(++rcu_torture_current_version); + rcu_torture_current_version++; /* Cycle through nesting levels of rcu_expedite_gp() calls. */ if (can_expedite && !(torture_random(&rand) & 0xff & (!!expediting - 1))) { @@ -1613,7 +1613,6 @@ rcu_torture_cleanup(void) unsigned long gp_seq = 0; int i; - rcutorture_record_test_transition(); if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); @@ -1918,7 +1917,6 @@ rcu_torture_init(void) goto unwind; } } - rcutorture_record_test_transition(); torture_init_end(); return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3333ee2c6f5..65abb399b08d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -191,18 +191,6 @@ module_param(gp_cleanup_delay, int, 0444); */ #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ -/* - * Track the rcutorture test sequence number and the update version - * number within a given test. The rcutorture_testseq is incremented - * on every rcutorture module load and unload, so has an odd value - * when a test is running. The rcutorture_vernum is set to zero - * when rcutorture starts and is incremented on each rcutorture update. - * These variables enable correlating rcutorture output with the - * RCU tracing information. - */ -unsigned long rcutorture_testseq; -unsigned long rcutorture_vernum; - /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is @@ -622,20 +610,6 @@ void show_rcu_gp_kthreads(void) } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -/* - * Record the number of times rcutorture tests have been initiated and - * terminated. This information allows the debugfs tracing stats to be - * correlated to the rcutorture messages, even when the rcutorture module - * is being repeatedly loaded and unloaded. In other words, we cannot - * store this state in rcutorture itself. - */ -void rcutorture_record_test_transition(void) -{ - rcutorture_testseq++; - rcutorture_vernum = 0; -} -EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); - /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -664,17 +638,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -/* - * Record the number of writer passes through the current rcutorture test. - * This is also used to correlate debugfs tracing stats with the rcutorture - * messages. - */ -void rcutorture_record_progress(unsigned long vernum) -{ - rcutorture_vernum++; -} -EXPORT_SYMBOL_GPL(rcutorture_record_progress); - /* * Return the root node of the specified rcu_state structure. */ -- cgit From 2d3625841dcee549653b6f50ffa4e6431305035a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:09:47 -0700 Subject: rcuperf: Remove unused torturing_tasks() function The torturing_tasks() function in rcuperf.c is not used, so this commit removes it. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b080bc4a4f45..06eb5e42726d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,11 +369,6 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) return cur_ops->gp_diff(new, old); } -static bool __maybe_unused torturing_tasks(void) -{ - return cur_ops == &tasks_ops; -} - /* * If performance tests complete, wait for shutdown to commence. */ -- cgit From 6b06aa723ed705102f3c63a494ac45352ccc0e7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 10:56:05 -0700 Subject: rcutorture: Extract common code from rcu_torture_reader() This commit extracts the code executed on each pass through the loop in rcu_torture_reader() into a new rcu_torture_one_read() function. This new function will also be used by rcu_torture_timer(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 98 +++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 335387fabac2..971e31ae9bcf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1089,6 +1089,60 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp) +{ + int idx; + unsigned long started; + unsigned long completed; + struct rcu_torture *p; + int pipe_count; + unsigned long long ts; + + idx = cur_ops->readlock(); + started = cur_ops->get_gp_seq(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + return false; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(trsp); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed = cur_ops->get_gp_seq(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, started, completed); + rcu_ftrace_dump(DUMP_ALL); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = rcutorture_seq_diff(completed, started); + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + return true; +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1165,14 +1219,8 @@ static void rcu_torture_timer(struct timer_list *unused) static int rcu_torture_reader(void *arg) { - unsigned long started; - unsigned long completed; - int idx; DEFINE_TORTURE_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; struct timer_list t; - unsigned long long ts; VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -1184,44 +1232,8 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - idx = cur_ops->readlock(); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + if (!rcu_torture_one_read(&rand)) schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->get_gp_seq(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { -- cgit From 8da9a59523b6608f4b21f3e489578d0993c0779f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:17:51 -0700 Subject: rcutorture: Use atomic increment for n_rcu_torture_timers Currently, rcu_torture_timer() relies on a lock to guard updates to n_rcu_torture_timers. Unfortunately, consolidating code with rcu_torture_reader() will dispense with this lock. This commit therefore makes n_rcu_torture_timers be an atomic_long_t and uses atomic_long_inc() to carry out the update. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e31ae9bcf..2452e4a29923 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -151,7 +151,7 @@ static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; +static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; static atomic_long_t n_cbfloods; @@ -1160,6 +1160,7 @@ static void rcu_torture_timer(struct timer_list *unused) int pipe_count; unsigned long long ts; + atomic_long_inc(&n_rcu_torture_timers); idx = cur_ops->readlock(); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -1177,7 +1178,6 @@ static void rcu_torture_timer(struct timer_list *unused) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); cur_ops->read_delay(&rand); - n_rcu_torture_timers++; spin_unlock(&rand_lock); preempt_disable(); pipe_count = p->rtort_pipe_count; @@ -1290,7 +1290,7 @@ rcu_torture_stats_print(void) pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers); + atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", n_barrier_successes, -- cgit From 3025520ec424df8b0fd5cdc319ad6b83406d9954 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:38:47 -0700 Subject: rcutorture: Use per-CPU random state for rcu_torture_timer() Currently, the rcu_torture_timer() function uses a single global torture_random_state structure protected by a single global lock. This conflicts to some extent with performance and scalability, but even more with the goal of consolidating read-side testing with rcu_torture_reader(). This commit therefore creates a per-CPU torture_random_state structure for use by rcu_torture_timer() and eliminates the lock. Signed-off-by: Paul E. McKenney [ paulmck: Make rcu_torture_timer_rand static, per 0day Test Robot report. ] --- include/linux/torture.h | 2 ++ kernel/rcu/rcutorture.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index a55e80817dae..61dfd93b6ee4 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -64,6 +64,8 @@ struct torture_random_state { long trs_count; }; #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } +#define DEFINE_TORTURE_RANDOM_PERCPU(name) \ + DEFINE_PER_CPU(struct torture_random_state, name) unsigned long torture_random(struct torture_random_state *trsp); /* Task shuffler, which causes CPUs to occasionally go idle. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2452e4a29923..d5a5465d2507 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1143,6 +1143,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) return true; } +static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1154,12 +1156,12 @@ static void rcu_torture_timer(struct timer_list *unused) int idx; unsigned long started; unsigned long completed; - static DEFINE_TORTURE_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + struct torture_random_state *trsp; unsigned long long ts; + trsp = this_cpu_ptr(&rcu_torture_timer_rand); atomic_long_inc(&n_rcu_torture_timers); idx = cur_ops->readlock(); started = cur_ops->get_gp_seq(); @@ -1176,9 +1178,7 @@ static void rcu_torture_timer(struct timer_list *unused) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - spin_unlock(&rand_lock); + cur_ops->read_delay(trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { -- cgit From 241b42522abb36c78cdc84d0cade358c4449306f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:59:31 -0700 Subject: rcutorture: Make rcu_torture_timer() use rcu_torture_one_read() This commit saves a few lines of code by making rcu_torture_timer() invoke rcu_torture_one_read(), thus completing the consolidation of code between rcu_torture_timer() and rcu_torture_reader(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d5a5465d2507..ac700aa6dcaf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1153,53 +1153,8 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); */ static void rcu_torture_timer(struct timer_list *unused) { - int idx; - unsigned long started; - unsigned long completed; - struct rcu_torture *p; - int pipe_count; - struct torture_random_state *trsp; - unsigned long long ts; - - trsp = this_cpu_ptr(&rcu_torture_timer_rand); atomic_long_inc(&n_rcu_torture_timers); - idx = cur_ops->readlock(); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(trsp); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->get_gp_seq(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { -- cgit From 2397d072f76b552fc21cda19686d24a8066ced22 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2018 07:29:25 -0700 Subject: rcutorture: Handle extended read-side critical sections This commit enables rcutorture to test whether RCU properly aggregates different types of read-side critical sections into a larger section covering the set. It does this by extending an initial read-side critical section randomly for a random number of extensions. There is a new rcu_torture_ops field ->extendable that specifies what extensions are permitted for a given flavor of RCU (for example, SRCU does not permit any extensions, while RCU-sched permits all types). Note that if a given operation (for example, local_bh_disable()) extends an RCU read-side critical section, then rcutorture feels free to also start and end the critical section with that operation's type of disabling. Disabling operations include local_bh_disable(), local_irq_disable(), and preempt_disable(). This commit also adds a new "busted_srcud" torture type, which verifies rcutorture's ability to detect extensions of RCU read-side critical sections that are not handled. Gotta test the test, after all! Note that it is not legal to invoke local_bh_disable() with interrupts disabled, and this transition is avoided by overriding the random-number generator when it wants to call local_bh_disable() while interrupts are disabled. The code instead leaves both interrupts and bh/softirq disabled in this case. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ac700aa6dcaf..f97757755207 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -62,6 +62,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +/* Bits for ->extendables field, extendables param, and related definitions. */ +#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_BH 0x1 /* Extend readers by disabling bh. */ +#define RCUTORTURE_RDR_IRQ 0x2 /* ... disabling interrupts. */ +#define RCUTORTURE_RDR_PREEMPT 0x4 /* ... disabling preemption. */ +#define RCUTORTURE_RDR_RCU 0x8 /* ... entering another RCU reader. */ +#define RCUTORTURE_MAX_EXTEND (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | \ + RCUTORTURE_RDR_PREEMPT) +#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ + /* Must be power of two minus one. */ + torture_param(int, cbflood_inter_holdoff, HZ, "Holdoff between floods (jiffies)"); torture_param(int, cbflood_intra_holdoff, 1, @@ -69,6 +81,8 @@ torture_param(int, cbflood_intra_holdoff, 1, torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); torture_param(int, cbflood_n_per_burst, 20000, "# callbacks per burst in flood"); +torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, + "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -277,6 +291,8 @@ struct rcu_torture_ops { void (*stats)(void); int irq_capable; int can_boost; + int extendables; + int ext_irq_conflict; const char *name; }; @@ -452,6 +468,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ), + .ext_irq_conflict = RCUTORTURE_RDR_RCU, .name = "rcu_bh" }; @@ -622,6 +640,26 @@ static struct rcu_torture_ops srcud_ops = { .name = "srcud" }; +/* As above, but broken due to inappropriate reader extension. */ +static struct rcu_torture_ops busted_srcud_ops = { + .ttype = SRCU_FLAVOR, + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .get_gp_seq = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, + .name = "busted_srcud" +}; + /* * Definitions for sched torture testing. */ @@ -660,6 +698,7 @@ static struct rcu_torture_ops sched_ops = { .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "sched" }; @@ -1089,6 +1128,110 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +/* + * Do one extension of an RCU read-side critical section using the + * current reader state in readstate (set to zero for initial entry + * to extended critical section), set the new state as specified by + * newstate (set to zero for final exit from extended critical section), + * and random-number-generator state in trsp. If this is neither the + * beginning or end of the critical section and if there was actually a + * change, do a ->read_delay(). + */ +static void rcutorture_one_extend(int *readstate, int newstate, + struct torture_random_state *trsp) +{ + int idxnew = -1; + int idxold = *readstate; + int statesnew = ~*readstate & newstate; + int statesold = *readstate & ~newstate; + + WARN_ON_ONCE(idxold < 0); + WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + + /* First, put new protection in place to avoid critical-section gap. */ + if (statesnew & RCUTORTURE_RDR_BH) + local_bh_disable(); + if (statesnew & RCUTORTURE_RDR_IRQ) + local_irq_disable(); + if (statesnew & RCUTORTURE_RDR_PREEMPT) + preempt_disable(); + if (statesnew & RCUTORTURE_RDR_RCU) + idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + + /* Next, remove old protection, irq first due to bh conflict. */ + if (statesold & RCUTORTURE_RDR_IRQ) + local_irq_enable(); + if (statesold & RCUTORTURE_RDR_BH) + local_bh_enable(); + if (statesold & RCUTORTURE_RDR_PREEMPT) + preempt_enable(); + if (statesold & RCUTORTURE_RDR_RCU) + cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + + /* Delay if neither beginning nor end and there was a change. */ + if ((statesnew || statesold) && *readstate && newstate) + cur_ops->read_delay(trsp); + + /* Update the reader state. */ + if (idxnew == -1) + idxnew = idxold & ~RCUTORTURE_RDR_MASK; + WARN_ON_ONCE(idxnew < 0); + WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); + *readstate = idxnew | newstate; + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); +} + +/* Return the biggest extendables mask given current RCU and boot parameters. */ +static int rcutorture_extend_mask_max(void) +{ + int mask; + + WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); + mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; + mask = mask | RCUTORTURE_RDR_RCU; + return mask; +} + +/* Return a random protection state mask, but with at least one bit set. */ +static int +rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) +{ + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + mask = mask & (torture_random(trsp) >> RCUTORTURE_RDR_SHIFT); + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & RCUTORTURE_RDR_BH) && + (oldmask & RCUTORTURE_RDR_BH)) + mask |= RCUTORTURE_RDR_BH; /* Can't enable bh w/irq disabled. */ + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & cur_ops->ext_irq_conflict) && + (oldmask & cur_ops->ext_irq_conflict)) + mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ + return mask ?: RCUTORTURE_RDR_RCU; +} + +/* + * Do a randomly selected number of extensions of an existing RCU read-side + * critical section. + */ +static void rcutorture_loop_extend(int *readstate, + struct torture_random_state *trsp) +{ + int i; + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */ + if (!((mask - 1) & mask)) + return; /* Current RCU flavor not extendable. */ + i = (torture_random(trsp) >> 3) & RCUTORTURE_RDR_MAX_LOOPS; + while (i--) { + mask = rcutorture_extend_mask(*readstate, trsp); + rcutorture_one_extend(readstate, mask, trsp); + } +} + /* * Do one read-side critical section, returning false if there was * no data to read. Can be invoked both from process context and @@ -1096,14 +1239,16 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { - int idx; unsigned long started; unsigned long completed; + int newstate; struct rcu_torture *p; int pipe_count; + int readstate = 0; unsigned long long ts; - idx = cur_ops->readlock(); + newstate = rcutorture_extend_mask(readstate, trsp); + rcutorture_one_extend(&readstate, newstate, trsp); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1113,12 +1258,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(trsp); + rcutorture_loop_extend(&readstate, trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -1139,7 +1284,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); + WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); return true; } @@ -1704,7 +1850,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, &tasks_ops, + &busted_srcud_ops, &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit From bf1bef50bee13b2292929f4b86118302a3827a32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Jun 2018 08:50:09 -0700 Subject: rcutorture: Emphasize testing of single reader protection type For RCU implementations supporting multiple types of reader protection, rcutorture currently randomly selects the combinations of types of protection for each phase of each reader. The problem with this, for example, given the four kinds of protection for RCU-sched (local_irq_disable(), local_bh_disable(), preempt_disable(), and rcu_read_lock_sched()), the reader will be protected by a single mechanism only 25% of the time. We really heavier testing of single read-side mechanisms. This commit therefore uses only a single mechanism about 60% of the time, half of the time explicitly and one-eighth of the time by chance. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f97757755207..aa0be7ec2a26 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -69,6 +69,7 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett > 8; + unsigned long randmask2 = randmask1 >> 1; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - mask = mask & (torture_random(trsp) >> RCUTORTURE_RDR_SHIFT); + /* Half the time lots of bits, half the time only one bit. */ + if (randmask1 & 0x1) + mask = mask & randmask2; + else + mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); if ((mask & RCUTORTURE_RDR_IRQ) && !(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) -- cgit From 450efca7182a516a12dfcc0311abfd242bde42b2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 10 Jun 2018 16:45:43 -0700 Subject: rcutorture: Disable RT throttling for boost tests Currently rcutorture is not able to torture RCU boosting properly. This is because the rcutorture's boost threads which are doing the torturing may be throttled due to RT throttling. This patch makes rcutorture use the right torture technique (unthrottled rcutorture boost tasks) for torturing RCU so that the test fails correctly when no boost is available. Currently this requires accessing sysctl_sched_rt_runtime directly, but that should be Ok since rcutorture is test code. Such direct access is also only possible if rcutorture is used as a built-in so make it conditional on that. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aa0be7ec2a26..74e47d0a618c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,6 +55,7 @@ #include #include #include +#include #include "rcu.h" @@ -772,6 +773,32 @@ static void rcu_torture_boost_cb(struct rcu_head *head) smp_store_release(&rbip->inflight, 0); } +static int old_rt_runtime = -1; + +static void rcu_torture_disable_rt_throttle(void) +{ + /* + * Disable RT throttling so that rcutorture's boost threads don't get + * throttled. Only possible if rcutorture is built-in otherwise the + * user should manually do this by setting the sched_rt_period_us and + * sched_rt_runtime sysctls. + */ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime != -1) + return; + + old_rt_runtime = sysctl_sched_rt_runtime; + sysctl_sched_rt_runtime = -1; +} + +static void rcu_torture_enable_rt_throttle(void) +{ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime == -1) + return; + + sysctl_sched_rt_runtime = old_rt_runtime; + old_rt_runtime = -1; +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -1511,6 +1538,7 @@ static int rcutorture_booster_cleanup(unsigned int cpu) mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; + rcu_torture_enable_rt_throttle(); mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ @@ -1527,6 +1555,7 @@ static int rcutorture_booster_init(unsigned int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); + rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), -- cgit From 3b745c8969c752601cb68c82a06735363563ab42 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 10 Jun 2018 16:45:44 -0700 Subject: rcutorture: Make boost test more robust Currently, with RCU_BOOST disabled, I get no failures when forcing rcutorture to test RCU boost priority inversion. The reason seems to be that we don't check for failures if the callback never ran at all for the duration of the boost-test loop. Further, the 'rtb' and 'rtbf' counters seem to be used inconsistently. 'rtb' is incremented at the start of each test and 'rtbf' is incremented per-cpu on each failure of call_rcu. So its possible 'rtbf' > 'rtb'. To test the boost with rcutorture, I did following on a 4-CPU x86 machine: modprobe rcutorture test_boost=2 sleep 20 rmmod rcutorture With patch: rtbf: 8 rtb: 12 Without patch: rtbf: 0 rtb: 2 In summary this patch: - Increments failed and total test counters once per boost-test. - Checks for failure cases correctly. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74e47d0a618c..36b9b8266213 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -799,6 +799,18 @@ static void rcu_torture_enable_rt_throttle(void) old_rt_runtime = -1; } +static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) +{ + if (end - start > test_boost_duration * HZ - HZ / 2) { + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + + return true; /* failed */ + } + + return false; /* passed */ +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -819,6 +831,21 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { + /* Track if the test failed already in this test interval? */ + bool failed = false; + + /* Increment n_rcu_torture_boosts once per boost-test */ + while (!kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + if (kthread_should_stop()) + goto checkwait; + /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { @@ -837,11 +864,10 @@ static int rcu_torture_boost(void *arg) /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } + /* Check if the boost test failed */ + failed = failed || + rcu_torture_boost_failed(call_rcu_time, + jiffies); call_rcu_time = jiffies; } stutter_wait("rcu_torture_boost"); @@ -849,6 +875,14 @@ static int rcu_torture_boost(void *arg) goto checkwait; } + /* + * If boost never happened, then inflight will always be 1, in + * this case the boost check would never happen in the above + * loop so do another one here. + */ + if (!failed && smp_load_acquire(&rbi.inflight)) + rcu_torture_boost_failed(call_rcu_time, jiffies); + /* * Set the start time of the next test interval. * Yes, this is vulnerable to long delays, but such @@ -861,7 +895,6 @@ static int rcu_torture_boost(void *arg) if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; - n_rcu_torture_boosts++; mutex_unlock(&boost_mutex); break; } -- cgit From 622be33fcbc93e9b672b99ed338369eb5e843ac3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:47:34 +0200 Subject: rcutorture: Use monotonic timestamp for stall detection The get_seconds() call is deprecated because it overflows on 32-bit architectures. The algorithm in rcu_torture_stall() can deal with the overflow, but another problem here is that using a CLOCK_REALTIME stamp can lead to a false-positive stall warning when a settimeofday() happens concurrently. Using ktime_get_seconds() instead avoids those issues and will never overflow. The added cast to 'unsigned long' however is necessary to make ULONG_CMP_LT() work correctly. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 36b9b8266213..049b3735dba8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1622,7 +1622,7 @@ static int rcu_torture_stall(void *args) VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; + stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ rcu_read_lock(); if (stall_cpu_irqsoff) @@ -1631,7 +1631,8 @@ static int rcu_torture_stall(void *args) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", smp_processor_id()); - while (ULONG_CMP_LT(get_seconds(), stop_at)) + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), + stop_at)) continue; /* Induce RCU CPU stall warning. */ if (stall_cpu_irqsoff) local_irq_enable(); -- cgit From 4babd855fd6137f9792117eb73b096c221a49d3c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:18 -0700 Subject: rcutorture: Add support to detect if boost kthread prio is too low When rcutorture is built in to the kernel, an earlier patch detects that and raises the priority of RCU's kthreads to allow rcutorture's RCU priority boosting tests to succeed. However, if rcutorture is built as a module, those priorities must be raised manually via the rcutree.kthread_prio kernel boot parameter. If this manual step is not taken, rcutorture's RCU priority boosting tests will fail due to kthread starvation. One approach would be to raise the default priority, but that risks breaking existing users. Another approach would be to allow runtime adjustment of RCU's kthread priorities, but that introduces numerous "interesting" race conditions. This patch therefore instead detects too-low priorities, and prints a message and disables the RCU priority boosting tests in that case. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 32 ++++++++++++++++++++++++++++---- kernel/rcu/tree.c | 7 +++++++ 3 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0453a7d12b3f..bee070979970 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -502,6 +502,7 @@ static inline void rcu_force_quiescent_state(void) { } static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } +static inline int rcu_get_gp_kthreads_prio(void) { return 0; } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); unsigned long rcu_bh_get_gp_seq(void); @@ -510,6 +511,7 @@ unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); +int rcu_get_gp_kthreads_prio(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 049b3735dba8..e3d2d4f1d928 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1787,6 +1787,32 @@ static void rcu_torture_barrier_cleanup(void) } } +static bool rcu_torture_can_boost(void) +{ + static int boost_warn_once; + int prio; + + if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) + return false; + + prio = rcu_get_gp_kthreads_prio(); + if (!prio) + return false; + + if (prio < 2) { + if (boost_warn_once == 1) + return false; + + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. " + "Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 " + "on the kernel command line.\n", KBUILD_MODNAME); + boost_warn_once = 1; + return false; + } + + return true; +} + static enum cpuhp_state rcutor_hp; static void @@ -1831,8 +1857,7 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) + if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); /* @@ -2056,8 +2081,7 @@ rcu_torture_init(void) test_boost_interval = 1; if (test_boost_duration < 2) test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { + if (rcu_torture_can_boost()) { boost_starttime = jiffies + test_boost_interval * HZ; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65abb399b08d..b4bcb5e21ca6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -180,6 +180,13 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* Retreive RCU kthreads priority for rcutorture */ +int rcu_get_gp_kthreads_prio(void) +{ + return kthread_prio; +} +EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); + /* * Number of grace periods between delays, normalized by the duration of * the delay. The longer the delay, the more the grace periods between -- cgit From bf5b64355a3ce41752856b66c4efad4d7a88e84b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:19 -0700 Subject: rcutorture: Fix rcu_barrier successes counter The rcutorture test module currently increments both successes and error for the barrier test upon error, which results in misleading statistics being printed. This commit therefore changes the code to increment the success counter only when the test actually passes. This change was tested by by returning from the barrier callback without incrementing the callback counter, thus introducing what appeared to rcutorture to be rcu_barrier() failures. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e3d2d4f1d928..bdc86cdf3b8b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -169,7 +169,7 @@ static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; -static long n_barrier_successes; +static long n_barrier_successes; /* did rcu_barrier test succeed? */ static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; @@ -1723,8 +1723,9 @@ static int rcu_torture_barrier(void *arg) atomic_read(&barrier_cbs_invoked), n_barrier_cbs); WARN_ON_ONCE(1); + } else { + n_barrier_successes++; } - n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_barrier"); @@ -1803,9 +1804,7 @@ static bool rcu_torture_can_boost(void) if (boost_warn_once == 1) return false; - pr_alert("%s: WARN: RCU kthread priority too low to test boosting. " - "Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 " - "on the kernel command line.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME); boost_warn_once = 1; return false; } -- cgit From f8494fa3dd10b52eab47a9666a8bc34719a129aa Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 Jun 2018 17:08:22 -0700 Subject: tracing: Reorder display of TGID to be after PID Currently ftrace displays data in trace output like so: _-----=> irqs-off / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / delay TASK-PID CPU TGID |||| TIMESTAMP FUNCTION | | | | |||| | | bash-1091 [000] ( 1091) d..2 28.313544: sched_switch: However Android's trace visualization tools expect a slightly different format due to an out-of-tree patch patch that was been carried for a decade, notice that the TGID and CPU fields are reversed: _-----=> irqs-off / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / delay TASK-PID TGID CPU |||| TIMESTAMP FUNCTION | | | | |||| | | bash-1091 ( 1091) [002] d..2 64.965177: sched_switch: From kernel v4.13 onwards, during which TGID was introduced, tracing with systrace on all Android kernels will break (most Android kernels have been on 4.9 with Android patches, so this issues hasn't been seen yet). From v4.13 onwards things will break. The chrome browser's tracing tools also embed the systrace viewer which uses the legacy TGID format and updates to that are known to be difficult to make. Considering this, I suggest we make this change to the upstream kernel and backport it to all Android kernels. I believe this feature is merged recently enough into the upstream kernel that it shouldn't be a problem. Also logically, IMO it makes more sense to group the TGID with the TASK-PID and the CPU after these. Link: http://lkml.kernel.org/r/20180626000822.113931-1-joel@joelfernandes.org Cc: jreck@google.com Cc: tkjos@google.com Cc: stable@vger.kernel.org Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_output.c | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f054bd6a1c66..87cf25171fb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3365,8 +3365,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, @@ -3386,9 +3386,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file tgid ? tgid_space : space); seq_printf(m, "# %s||| / delay\n", tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : space); - seq_printf(m, "# | | | %s|||| | |\n", + seq_printf(m, "# | | %s | |||| | |\n", tgid ? " | " : space); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 90db994ac900..1c8e30fda46a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); @@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "(%5d) ", tgid); } + trace_seq_printf(s, "[%03d] ", iter->cpu); + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); -- cgit From 8e1b706b6e819bed215c0db16345568864660393 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 13 Jul 2018 16:23:23 +0200 Subject: cpu/hotplug: Expose SMT control init function The L1TF mitigation will gain a commend line parameter which allows to set a combination of hypervisor mitigation and SMT control. Expose cpu_smt_disable() so the command line parser can tweak SMT settings. [ tglx: Split out of larger patch and made it preserve an already existing force off state ] Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina Reviewed-by: Greg Kroah-Hartman Reviewed-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20180713142323.039715135@linutronix.de --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7532cbf27b1d..c9b23ad27b38 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -177,8 +177,10 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; +extern void cpu_smt_disable(bool force); #else # define cpu_smt_control (CPU_SMT_ENABLED) +static inline void cpu_smt_disable(bool force) { } #endif #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index d79e24df2420..8453e31f2d1a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -347,13 +347,23 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable); enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); -static int __init smt_cmdline_disable(char *str) +void __init cpu_smt_disable(bool force) { - cpu_smt_control = CPU_SMT_DISABLED; - if (str && !strcmp(str, "force")) { + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return; + + if (force) { pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } else { + cpu_smt_control = CPU_SMT_DISABLED; } +} + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_disable(str && !strcmp(str, "force")); return 0; } early_param("nosmt", smt_cmdline_disable); -- cgit From fee0aede6f4739c87179eca76136f83210953b86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Jul 2018 16:23:24 +0200 Subject: cpu/hotplug: Set CPU_SMT_NOT_SUPPORTED early The CPU_SMT_NOT_SUPPORTED state is set (if the processor does not support SMT) when the sysfs SMT control file is initialized. That was fine so far as this was only required to make the output of the control file correct and to prevent writes in that case. With the upcoming l1tf command line parameter, this needs to be set up before the L1TF mitigation selection and command line parsing happens. Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina Reviewed-by: Greg Kroah-Hartman Reviewed-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20180713142323.121795971@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 6 ++++++ include/linux/cpu.h | 2 ++ kernel/cpu.c | 13 ++++++++++--- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7df7df9a2753..4be69a7bef0b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -58,6 +58,12 @@ void __init check_bugs(void) { identify_boot_cpu(); + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology(); + if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); print_cpu_info(&boot_cpu_data); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c9b23ad27b38..ccdf3a67ce56 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -178,9 +178,11 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); +extern void cpu_smt_check_topology(void); #else # define cpu_smt_control (CPU_SMT_ENABLED) static inline void cpu_smt_disable(bool force) { } +static inline void cpu_smt_check_topology(void) { } #endif #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 8453e31f2d1a..37eec872042b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -361,6 +361,16 @@ void __init cpu_smt_disable(bool force) } } +/* + * The decision whether SMT is supported can only be done after the full + * CPU identification. Called from architecture code. + */ +void __init cpu_smt_check_topology(void) +{ + if (!topology_smt_supported()) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; +} + static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); @@ -2115,9 +2125,6 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_state_init(void) { - if (!topology_smt_supported()) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; - return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -- cgit From 9fb8d5dc4b649dd190e1af4ead670753e71bf907 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Tue, 3 Jul 2018 15:02:14 -0700 Subject: stop_machine: Disable preemption when waking two stopper threads When cpu_stop_queue_two_works() begins to wake the stopper threads, it does so without preemption disabled, which leads to the following race condition: The source CPU calls cpu_stop_queue_two_works(), with cpu1 as the source CPU, and cpu2 as the destination CPU. When adding the stopper threads to the wake queue used in this function, the source CPU stopper thread is added first, and the destination CPU stopper thread is added last. When wake_up_q() is invoked to wake the stopper threads, the threads are woken up in the order that they are queued in, so the source CPU's stopper thread is woken up first, and it preempts the thread running on the source CPU. The stopper thread will then execute on the source CPU, disable preemption, and begin executing multi_cpu_stop(), and wait for an ack from the destination CPU's stopper thread, with preemption still disabled. Since the worker thread that woke up the stopper thread on the source CPU is affine to the source CPU, and preemption is disabled on the source CPU, that thread will never run to dequeue the destination CPU's stopper thread from the wake queue, and thus, the destination CPU's stopper thread will never run, causing the source CPU's stopper thread to wait forever, and stall. Disable preemption when waking the stopper threads in cpu_stop_queue_two_works(). Fixes: 0b26351b910f ("stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock") Co-Developed-by: Prasad Sodagudi Signed-off-by: Prasad Sodagudi Co-Developed-by: Pavankumar Kondeti Signed-off-by: Pavankumar Kondeti Signed-off-by: Isaac J. Manjarres Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: matt@codeblueprint.co.uk Cc: bigeasy@linutronix.de Cc: gregkh@linuxfoundation.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1530655334-4601-1-git-send-email-isaacm@codeaurora.org --- kernel/stop_machine.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index f89014a2c238..1ff523dae6e2 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -270,7 +270,11 @@ unlock: goto retry; } - wake_up_q(&wakeq); + if (!err) { + preempt_disable(); + wake_up_q(&wakeq); + preempt_enable(); + } return err; } -- cgit From e117cb52bdb4d376b711bee34af6434c9e314b3b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 11 Jul 2018 09:29:48 +0200 Subject: sched/deadline: Fix switched_from_dl() warning Mark noticed that syzkaller is able to reliably trigger the following warning: dl_rq->running_bw > dl_rq->this_bw WARNING: CPU: 1 PID: 153 at kernel/sched/deadline.c:124 switched_from_dl+0x454/0x608 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 153 Comm: syz-executor253 Not tainted 4.18.0-rc3+ #29 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x458 show_stack+0x20/0x30 dump_stack+0x180/0x250 panic+0x2dc/0x4ec __warn_printk+0x0/0x150 report_bug+0x228/0x2d8 bug_handler+0xa0/0x1a0 brk_handler+0x2f0/0x568 do_debug_exception+0x1bc/0x5d0 el1_dbg+0x18/0x78 switched_from_dl+0x454/0x608 __sched_setscheduler+0x8cc/0x2018 sys_sched_setattr+0x340/0x758 el0_svc_naked+0x30/0x34 syzkaller reproducer runs a bunch of threads that constantly switch between DEADLINE and NORMAL classes while interacting through futexes. The splat above is caused by the fact that if a DEADLINE task is setattr back to NORMAL while in non_contending state (blocked on a futex - inactive timer armed), its contribution to running_bw is not removed before sub_rq_bw() gets called (!task_on_rq_queued() branch) and the latter sees running_bw > this_bw. Fix it by removing a task contribution from running_bw if the task is not queued and in non_contending state while switched to a different class. Reported-by: Mark Rutland Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Reviewed-by: Luca Abeni Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180711072948.27061-1-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fbfc3f1d368a..10c7b51c0d1f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed. However, p is leaving DEADLINE and + * might migrate away from this rq while continuing to run on + * some other class. We need to remove its contribution from + * this rq running_bw now, or sub_rq_bw (below) will complain. + */ + if (p->dl.dl_non_contending) + sub_running_bw(&p->dl, &rq->dl); sub_rq_bw(&p->dl, &rq->dl); + } /* * We cannot use inactive_task_timer() to invoke sub_running_bw() -- cgit From be45bf5395e0886a93fc816bbe41a008ec2e42e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Jul 2018 12:42:08 +0200 Subject: watchdog/softlockup: Fix cpu_stop_queue_work() double-queue bug When scheduling is delayed for longer than the softlockup interrupt period it is possible to double-queue the cpu_stop_work, causing list corruption. Cure this by adding a completion to track the cpu_stop_work's progress. Reported-by: kernel test robot Tested-by: Rong Chen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work") Link: http://lkml.kernel.org/r/20180713104208.GW2494@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b81f777838d5..5470dce212c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -330,6 +330,9 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } +static DEFINE_PER_CPU(struct completion, softlockup_completion); +static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); + /* * The watchdog thread function - touches the timestamp. * @@ -343,12 +346,11 @@ static int softlockup_fn(void *data) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); + complete(this_cpu_ptr(&softlockup_completion)); return 0; } -static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -364,9 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - stop_one_cpu_nowait(smp_processor_id(), - softlockup_fn, NULL, - this_cpu_ptr(&softlockup_stop_work)); + if (completion_done(this_cpu_ptr(&softlockup_completion))) { + reinit_completion(this_cpu_ptr(&softlockup_completion)); + stop_one_cpu_nowait(smp_processor_id(), + softlockup_fn, NULL, + this_cpu_ptr(&softlockup_stop_work)); + } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -467,9 +472,13 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + struct completion *done = this_cpu_ptr(&softlockup_completion); WARN_ON_ONCE(cpu != smp_processor_id()); + init_completion(done); + complete(done); + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -499,6 +508,7 @@ static void watchdog_disable(unsigned int cpu) */ watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); + wait_for_completion(this_cpu_ptr(&softlockup_completion)); } static int softlockup_stop_fn(void *data) -- cgit From 8fe5c5a937d0f4e84221631833a2718afde52285 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 12 Jun 2018 12:22:15 +0100 Subject: sched/fair: Fix util_avg of new tasks for asymmetric systems When a new task wakes-up for the first time, its initial utilization is set to half of the spare capacity of its CPU. The current implementation of post_init_entity_util_avg() uses SCHED_CAPACITY_SCALE directly as a capacity reference. As a result, on a big.LITTLE system, a new task waking up on an idle little CPU will be given ~512 of util_avg, even if the CPU's capacity is significantly less than that. Fix this by computing the spare capacity with arch_scale_cpu_capacity(). Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Link: http://lkml.kernel.org/r/20180612112215.25448-1-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 321cd5dcf2e8..08b89ae34233 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,11 +735,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * - * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * - * where n denotes the nth task. + * where n denotes the nth task and cpu_scale the CPU capacity. * - * For example, a simplest series from the beginning would be like: + * For example, for a CPU with 1024 of capacity, a simplest series from + * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... @@ -751,7 +752,8 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { -- cgit From c079629862b20c101e8336362a8b042ec7d942fe Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:04 +0200 Subject: sched/pelt: Move PELT related code in a dedicated file We want to track rt_rq's utilization as a part of the estimation of the whole rq's utilization. This is necessary because rt tasks can steal utilization to cfs tasks and make them lighter than they are. As we want to use the same load tracking mecanism for both and prevent useless dependency between cfs and rt code, PELT code is moved in a dedicated file. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/fair.c | 333 +------------------------------------------------- kernel/sched/pelt.c | 311 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 43 +++++++ kernel/sched/sched.h | 19 +++ 5 files changed, 375 insertions(+), 333 deletions(-) create mode 100644 kernel/sched/pelt.c create mode 100644 kernel/sched/pelt.h (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index d9a02b318108..7fe183404c38 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08b89ae34233..39ab46cea6c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return container_of(cfs_rq, struct rq, cfs); } -#define entity_is_task(se) 1 #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP - +#include "pelt.h" #include "sched-pelt.h" static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -2751,19 +2747,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) #ifdef CONFIG_SMP -/* - * XXX we want to get rid of these helpers and use the full load resolution. - */ -static inline long se_weight(struct sched_entity *se) -{ - return scale_load_down(se->load.weight); -} - -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} - static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3064,314 +3047,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } #ifdef CONFIG_SMP -/* - * Approximate: - * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) - */ -static u64 decay_load(u64 val, u64 n) -{ - unsigned int local_n; - - if (unlikely(n > LOAD_AVG_PERIOD * 63)) - return 0; - - /* after bounds checking we can collapse to 32-bit */ - local_n = n; - - /* - * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) - * With a look-up table which covers y^n (n= LOAD_AVG_PERIOD)) { - val >>= local_n / LOAD_AVG_PERIOD; - local_n %= LOAD_AVG_PERIOD; - } - - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); - return val; -} - -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) -{ - u32 c1, c2, c3 = d3; /* y^0 == 1 */ - - /* - * c1 = d1 y^p - */ - c1 = decay_load((u64)d1, periods); - - /* - * p-1 - * c2 = 1024 \Sum y^n - * n=1 - * - * inf inf - * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) - * n=0 n=p - */ - c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; - - return c1 + c2 + c3; -} - -/* - * Accumulate the three separate parts of the sum; d1 the remainder - * of the last (incomplete) period, d2 the span of full periods and d3 - * the remainder of the (incomplete) current period. - * - * d1 d2 d3 - * ^ ^ ^ - * | | | - * |<->|<----------------->|<--->| - * ... |---x---|------| ... |------|-----x (now) - * - * p-1 - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 - * n=1 - * - * = u y^p + (Step 1) - * - * p-1 - * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) - * n=1 - */ -static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - unsigned long scale_freq, scale_cpu; - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ - u64 periods; - - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - delta += sa->period_contrib; - periods = delta / 1024; /* A period is 1024us (~1ms) */ - - /* - * Step 1: decay old *_sum if we crossed period boundaries. - */ - if (periods) { - sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); - sa->util_sum = decay_load((u64)(sa->util_sum), periods); - - /* - * Step 2 - */ - delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); - } - sa->period_contrib = delta; - - contrib = cap_scale(contrib, scale_freq); - if (load) - sa->load_sum += load * contrib; - if (runnable) - sa->runnable_load_sum += runnable * contrib; - if (running) - sa->util_sum += contrib * scale_cpu; - - return periods; -} - -/* - * We can represent the historical contribution to runnable average as the - * coefficients of a geometric series. To do this we sub-divide our runnable - * history into segments of approximately 1ms (1024us); label the segment that - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. - * - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... - * p0 p1 p2 - * (now) (~1ms ago) (~2ms ago) - * - * Let u_i denote the fraction of p_i that the entity was runnable. - * - * We then designate the fractions u_i as our co-efficients, yielding the - * following representation of historical load: - * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... - * - * We choose y based on the with of a reasonably scheduling period, fixing: - * y^32 = 0.5 - * - * This means that the contribution to load ~32ms ago (u_32) will be weighted - * approximately half as much as the contribution to load within the last ms - * (u_0). - * - * When a period "rolls over" and we have new u_0`, multiplying the previous - * sum again by y is sufficient to update: - * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) - * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] - */ -static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - u64 delta; - - delta = now - sa->last_update_time; - /* - * This should only happen when time goes backwards, which it - * unfortunately does during sched clock init when we swap over to TSC. - */ - if ((s64)delta < 0) { - sa->last_update_time = now; - return 0; - } - - /* - * Use 1024ns as the unit of measurement since it's a reasonable - * approximation of 1us and fast to compute. - */ - delta >>= 10; - if (!delta) - return 0; - - sa->last_update_time += delta << 10; - - /* - * running is a subset of runnable (weight) so running can't be set if - * runnable is clear. But there are some corner cases where the current - * se has been already dequeued but cfs_rq->curr still points to it. - * This means that weight will be 0 but not running for a sched_entity - * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages() - */ - if (!load) - runnable = running = 0; - - /* - * Now we know we crossed measurement unit boundaries. The *_avg - * accrues by two steps: - * - * Step 1: accumulate *_sum since last_update_time. If we haven't - * crossed period boundaries, finish. - */ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) - return 0; - - return 1; -} - -static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) -{ - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; - - /* - * Step 2: update *_avg. - */ - sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); - sa->util_avg = sa->util_sum / divider; -} - -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 - -static inline void cfs_se_util_change(struct sched_avg *avg) -{ - unsigned int enqueued; - - if (!sched_feat(UTIL_EST)) - return; - - /* Avoid store if the flag has been already set */ - enqueued = avg->util_est.enqueued; - if (!(enqueued & UTIL_AVG_UNCHANGED)) - return; - - /* Reset flag to report util_avg has been updated */ - enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); -} - -/* - * sched_entity: - * - * task: - * se_runnable() == se_weight() - * - * group: [ see update_cfs_group() ] - * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg - * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg - * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum - * - * cfq_rs: - * - * load_sum = \Sum se_weight(se) * se->avg.load_sum - * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg - */ - -static int -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - return 1; - } - - return 0; -} - -static int -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, - cfs_rq->curr == se)) { - - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - cfs_se_util_change(&se->avg); - return 1; - } - - return 0; -} - -static int -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) -{ - if (___update_load_sum(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), - cfs_rq->curr != NULL)) { - - ___update_load_avg(&cfs_rq->avg, 1, 1); - return 1; - } - - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -4039,12 +3714,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) #else /* CONFIG_SMP */ -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c new file mode 100644 index 000000000000..e6ecbb2b8698 --- /dev/null +++ b/kernel/sched/pelt.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Per Entity Load Tracking + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Move PELT related code from fair.c into this pelt.c file + * Author: Vincent Guittot + */ + +#include +#include "sched.h" +#include "sched-pelt.h" +#include "pelt.h" + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static u64 decay_load(u64 val, u64 n) +{ + unsigned int local_n; + + if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; +} + +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; + + scale_freq = arch_scale_freq_capacity(cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + u64 delta; + + delta = now - sa->last_update_time; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_update_time = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + + sa->last_update_time += delta << 10; + + /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!load) + runnable = running = 0; + + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + return 0; + + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + sa->util_avg = sa->util_sum / divider; +} + +/* + * sched_entity: + * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * + * cfq_rq: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg + */ + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + return 1; + } + + return 0; +} + +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { + + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); + return 1; + } + + return 0; +} + +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); + return 1; + } + + return 0; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h new file mode 100644 index 000000000000..9cac73efd64a --- /dev/null +++ b/kernel/sched/pelt.h @@ -0,0 +1,43 @@ +#ifdef CONFIG_SMP + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); + +/* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). + */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + +#else + +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif + + diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7742dcc136c..00d6f2594c4e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -673,7 +673,26 @@ struct dl_rq { u64 bw_ratio; }; +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + #ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} static inline bool sched_asym_prefer(int a, int b) { -- cgit From 371bf42732694d142b0de026e152266c039b97d3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:05 +0200 Subject: sched/rt: Add rt_rq utilization tracking schedutil governor relies on cfs_rq's util_avg to choose the OPP when CFS tasks are running. When the CPU is overloaded by CFS and RT tasks, CFS tasks are preempted by RT tasks and in this case util_avg reflects the remaining capacity but not what CFS want to use. In such case, schedutil can select a lower OPP whereas the CPU is overloaded. In order to have a more accurate view of the utilization of the CPU, we track the utilization of RT tasks. Only util_avg is correctly tracked but not load_avg and runnable_load_avg which are useless for rt_rq. rt_rq uses rq_clock_task and cfs_rq uses cfs_rq_clock_task but they are the same at the root group level, so the PELT windows of the util_sum are aligned. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++++++- kernel/sched/pelt.c | 25 +++++++++++++++++++++++++ kernel/sched/pelt.h | 7 +++++++ kernel/sched/rt.c | 13 +++++++++++++ kernel/sched/sched.h | 7 +++++++ 5 files changed, 66 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 39ab46cea6c5..5b453213cd18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7290,6 +7290,14 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } +static inline bool rt_rq_has_blocked(struct rq *rq) +{ + if (READ_ONCE(rq->avg_rt.util_avg)) + return true; + + return false; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7349,6 +7357,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + /* Don't need periodic decay once load/util_avg are null */ + if (rt_rq_has_blocked(rq)) + done = false; #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7414,9 +7426,10 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !rt_rq_has_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index e6ecbb2b8698..a00b1ba3dd5b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -309,3 +309,28 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) return 0; } + +/* + * rt_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + * load_avg and runnable_load_avg are not supported and meaningless. + * + */ + +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_rt, 1, 1); + return 1; + } + + return 0; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9cac73efd64a..b2983b741d57 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -3,6 +3,7 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); /* * When a task is dequeued, its estimated utilization should not be update if @@ -38,6 +39,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) return 0; } +static inline int +update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} + #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 572567078b60..0dc8ad1915e6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,6 +5,8 @@ */ #include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -1576,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) rt_queue_push_tasks(rq); + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1583,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + /* * The previous task needs to be made eligible for pushing * if it is still active @@ -2312,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 00d6f2594c4e..405dd9ba6b39 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,7 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; + #endif /* CONFIG_SMP */ int rt_queued; @@ -854,6 +855,7 @@ struct rq { u64 rt_avg; u64 age_stamp; + struct sched_avg avg_rt; u64 idle_stamp; u64 avg_idle; @@ -2212,4 +2214,9 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) return util; } + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return rq->avg_rt.util_avg; +} #endif -- cgit From 3ae117c6cd7c4783819a0766aa97b9493a8a0f62 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:06 +0200 Subject: cpufreq/schedutil: Use RT utilization tracking Add both CFS and RT utilization when selecting an OPP for CFS tasks as RT can preempt and steal CFS's running time. RT util_avg is used to take into account the utilization of RT tasks on the CPU when selecting OPP. If a RT task migrate, the RT utilization will not migrate but will decay over time. On an overloaded CPU, CFS utilization reflects the remaining utilization avialable on CPU. When RT task migrates, the CFS utilization will increase when tasks will start to use the newly available capacity. At the same pace, RT utilization will decay and both variations will compensate each other to keep unchanged overall utilization and will prevent any OPP drop. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c907fde01eaa..da29b5a33adb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -56,6 +56,7 @@ struct sugov_cpu { /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; unsigned long util_dl; + unsigned long util_rt; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -186,15 +187,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); sg_cpu->util_cfs = cpu_util_cfs(rq); sg_cpu->util_dl = cpu_util_dl(rq); + sg_cpu->util_rt = cpu_util_rt(rq); } static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util; if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; + util = sg_cpu->util_dl; + util += sg_cpu->util_cfs; + util += sg_cpu->util_rt; + /* * Utilization required by DEADLINE must always be granted while, for * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to @@ -205,7 +212,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); + return min(sg_cpu->max, util); } /** -- cgit From 3727e0e16340cbdf83818f5bf0113505c6876057 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:07 +0200 Subject: sched/dl: Add dl_rq utilization tracking Similarly to what happens with RT tasks, CFS tasks can be preempted by DL tasks and the CFS's utilization might no longer describes the real utilization level. Current DL bandwidth reflects the requirements to meet deadline when tasks are enqueued but not the current utilization of the DL sched class. We track DL class utilization to estimate the system utilization. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 6 ++++++ kernel/sched/fair.c | 11 ++++++++--- kernel/sched/pelt.c | 23 +++++++++++++++++++++++ kernel/sched/pelt.h | 6 ++++++ kernel/sched/sched.h | 1 + 5 files changed, 44 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fbfc3f1d368a..f4de26982d80 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ * Fabio Checconi */ #include "sched.h" +#include "pelt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1761,6 +1762,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1768,6 +1772,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1784,6 +1789,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. In that case NEED_RESCHED will diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b453213cd18..f096275c7df2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7290,11 +7290,14 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } -static inline bool rt_rq_has_blocked(struct rq *rq) +static inline bool others_rqs_have_blocked(struct rq *rq) { if (READ_ONCE(rq->avg_rt.util_avg)) return true; + if (READ_ONCE(rq->avg_dl.util_avg)) + return true; + return false; } @@ -7358,8 +7361,9 @@ static void update_blocked_averages(int cpu) done = false; } update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); /* Don't need periodic decay once load/util_avg are null */ - if (rt_rq_has_blocked(rq)) + if (others_rqs_have_blocked(rq)) done = false; #ifdef CONFIG_NO_HZ_COMMON @@ -7427,9 +7431,10 @@ static inline void update_blocked_averages(int cpu) update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !rt_rq_has_blocked(rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !others_rqs_have_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a00b1ba3dd5b..8b78b6320cda 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -334,3 +334,26 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } + +/* + * dl_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_dl, 1, 1); + return 1; + } + + return 0; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index b2983b741d57..0e4f912461ad 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -4,6 +4,7 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); /* * When a task is dequeued, its estimated utilization should not be update if @@ -45,6 +46,11 @@ update_rt_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +static inline int +update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 405dd9ba6b39..ab8b5296b5f6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -856,6 +856,7 @@ struct rq { u64 rt_avg; u64 age_stamp; struct sched_avg avg_rt; + struct sched_avg avg_dl; u64 idle_stamp; u64 avg_idle; -- cgit From 8cc90515a4fa419ccfc4703ff127699cdcb96839 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:08 +0200 Subject: cpufreq/schedutil: Use DL utilization tracking Now that we have both the DL class bandwidth requirement and the DL class utilization, we can detect when CPU is fully used so we should run at max. Otherwise, we keep using the DL bandwidth requirement to define the utilization of the CPU. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 23 +++++++++++++++++------ kernel/sched/sched.h | 7 ++++++- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index da29b5a33adb..07760bc7f69a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -56,6 +56,7 @@ struct sugov_cpu { /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; unsigned long util_dl; + unsigned long bw_dl; unsigned long util_rt; unsigned long max; @@ -187,6 +188,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); sg_cpu->util_cfs = cpu_util_cfs(rq); sg_cpu->util_dl = cpu_util_dl(rq); + sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util_rt = cpu_util_rt(rq); } @@ -198,20 +200,29 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; - util = sg_cpu->util_dl; - util += sg_cpu->util_cfs; + util = sg_cpu->util_cfs; util += sg_cpu->util_rt; + if ((util + sg_cpu->util_dl) >= sg_cpu->max) + return sg_cpu->max; + /* - * Utilization required by DEADLINE must always be granted while, for - * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to - * gracefully reduce the frequency when no tasks show up for longer + * As there is still idle time on the CPU, we need to compute the + * utilization level of the CPU. + * + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ + + /* Add DL bandwidth requirement */ + util += sg_cpu->bw_dl; + return min(sg_cpu->max, util); } @@ -367,7 +378,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) { - if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) sg_policy->need_freq_update = true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab8b5296b5f6..9028f268f867 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2199,11 +2199,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -static inline unsigned long cpu_util_dl(struct rq *rq) +static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; } +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + static inline unsigned long cpu_util_cfs(struct rq *rq) { unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); -- cgit From 91c27493e78df6849baaa21a9d66e26de8b875c0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:09 +0200 Subject: sched/irq: Add IRQ utilization tracking interrupt and steal time are the only remaining activities tracked by rt_avg. Like for sched classes, we can use PELT to track their average utilization of the CPU. But unlike sched class, we don't track when entering/leaving interrupt; Instead, we take into account the time spent under interrupt context when we update rqs' clock (rq_clock_task). This also means that we have to decay the normal context time and account for interrupt time during the update. That's also important to note that because: rq_clock == rq_clock_task + interrupt time and rq_clock_task is used by a sched class to compute its utilization, the util_avg of a sched class only reflects the utilization of the time spent in normal context and not of the whole time of the CPU. The utilization of interrupt gives an more accurate level of utilization of CPU. The CPU utilization is: avg_irq + (1 - avg_irq / max capacity) * /Sum avg_rq Most of the time, avg_irq is small and neglictible so the use of the approximation CPU utilization = /Sum avg_rq was enough. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- kernel/sched/fair.c | 13 ++++++++++--- kernel/sched/pelt.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 16 ++++++++++++++++ kernel/sched/sched.h | 3 +++ 5 files changed, 72 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..38107a95baca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -17,6 +17,8 @@ #include "../workqueue_internal.h" #include "../smpboot.h" +#include "pelt.h" + #define CREATE_TRACE_POINTS #include @@ -185,7 +187,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); + update_irq_load_avg(rq, irq_delta + steal); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f096275c7df2..c2782b29c79f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7290,7 +7290,7 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } -static inline bool others_rqs_have_blocked(struct rq *rq) +static inline bool others_have_blocked(struct rq *rq) { if (READ_ONCE(rq->avg_rt.util_avg)) return true; @@ -7298,6 +7298,11 @@ static inline bool others_rqs_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if (READ_ONCE(rq->avg_irq.util_avg)) + return true; +#endif + return false; } @@ -7362,8 +7367,9 @@ static void update_blocked_averages(int cpu) } update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ - if (others_rqs_have_blocked(rq)) + if (others_have_blocked(rq)) done = false; #ifdef CONFIG_NO_HZ_COMMON @@ -7432,9 +7438,10 @@ static inline void update_blocked_averages(int cpu) update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_rqs_have_blocked(rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 8b78b6320cda..ead6d8b4a8b8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -357,3 +357,43 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +/* + * irq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_irq_load_avg(struct rq *rq, u64 running) +{ + int ret = 0; + /* + * We know the time that has been used by interrupt since last update + * but we don't when. Let be pessimistic and assume that interrupt has + * happened just before the update. This is not so far from reality + * because interrupt will most probably wake up task and trig an update + * of rq clock during which the metric si updated. + * We start to decay with normal context time and then we add the + * interrupt context time. + * We can safely remove running from rq->clock because + * rq->clock += delta with delta >= running + */ + ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + 0, + 0, + 0); + ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + 1, + 1, + 1); + + if (ret) + ___update_load_avg(&rq->avg_irq, 1, 1); + + return ret; +} +#endif diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 0e4f912461ad..d2894db28955 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,6 +6,16 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +int update_irq_load_avg(struct rq *rq, u64 running); +#else +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} +#endif + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. @@ -51,6 +61,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { return 0; } + +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9028f268f867..b26d0c9948dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -857,6 +857,9 @@ struct rq { u64 age_stamp; struct sched_avg avg_rt; struct sched_avg avg_dl; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + struct sched_avg avg_irq; +#endif u64 idle_stamp; u64 avg_idle; -- cgit From 9033ea11889f88f243445495f72441e22256d5e9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:10 +0200 Subject: cpufreq/schedutil: Take time spent in interrupts into account The time spent executing IRQ handlers can be significant but it is not reflected in the utilization of CPU when deciding to choose an OPP. Now that we have access to this metric, schedutil can take it into account when selecting the OPP for a CPU. RQS utilization don't see the time spend under interrupt context and report their value in the normal context time window. We need to compensate this when adding interrupt utilization The CPU utilization is: IRQ util_avg + (1 - IRQ util_avg / max capacity ) * /Sum rq util_avg A test with iperf on hikey (octo arm64) gives the following speedup: iperf -c server_address -r -t 5 w/o patch w/ patch Tx 276 Mbits/sec 304 Mbits/sec +10% Rx 299 Mbits/sec 328 Mbits/sec +9% 8 iterations stdev is lower than 1% Only WFI idle state is enabled (shallowest idle state). Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 25 +++++++++++++++++++++---- kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 07760bc7f69a..7016bde9d194 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -58,6 +58,7 @@ struct sugov_cpu { unsigned long util_dl; unsigned long bw_dl; unsigned long util_rt; + unsigned long util_irq; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -190,21 +191,30 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->util_dl = cpu_util_dl(rq); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util_rt = cpu_util_rt(rq); + sg_cpu->util_irq = cpu_util_irq(rq); } static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util; + unsigned long util, max = sg_cpu->max; if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; + if (unlikely(sg_cpu->util_irq >= max)) + return max; + + /* Sum rq utilization */ util = sg_cpu->util_cfs; util += sg_cpu->util_rt; - if ((util + sg_cpu->util_dl) >= sg_cpu->max) - return sg_cpu->max; + /* + * Interrupt time is not seen by RQS utilization so we can compare + * them with the CPU capacity + */ + if ((util + sg_cpu->util_dl) >= max) + return max; /* * As there is still idle time on the CPU, we need to compute the @@ -220,10 +230,17 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) * ready for such an interface. So, we only do the latter for now. */ + /* Weight RQS utilization to normal context window */ + util *= (max - sg_cpu->util_irq); + util /= max; + + /* Add interrupt utilization */ + util += sg_cpu->util_irq; + /* Add DL bandwidth requirement */ util += sg_cpu->bw_dl; - return min(sg_cpu->max, util); + return min(max, util); } /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b26d0c9948dd..b2833e2b4b6a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2228,4 +2228,17 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return rq->avg_rt.util_avg; } + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +#endif #endif -- cgit From dfa444dc2ff62edbaf1ff95ed22dd2ce8a5715da Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:11 +0200 Subject: sched/cpufreq: Remove sugov_aggregate_util() There is no reason why sugov_get_util() and sugov_aggregate_util() were in fact separate functions. Signed-off-by: Vincent Guittot [ Rebased after adding irq tracking and fixed some compilation errors. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-9-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 44 ++++++++++++++-------------------------- kernel/sched/sched.h | 2 +- 2 files changed, 16 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7016bde9d194..c9622b3f183d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,12 +53,7 @@ struct sugov_cpu { unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy: */ - unsigned long util_cfs; - unsigned long util_dl; unsigned long bw_dl; - unsigned long util_rt; - unsigned long util_irq; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -182,38 +177,31 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util, irq, max; - sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->util_cfs = cpu_util_cfs(rq); - sg_cpu->util_dl = cpu_util_dl(rq); - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util_rt = cpu_util_rt(rq); - sg_cpu->util_irq = cpu_util_irq(rq); -} - -static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) -{ - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util, max = sg_cpu->max; + sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); if (rt_rq_is_runnable(&rq->rt)) - return sg_cpu->max; + return max; + + irq = cpu_util_irq(rq); - if (unlikely(sg_cpu->util_irq >= max)) + if (unlikely(irq >= max)) return max; /* Sum rq utilization */ - util = sg_cpu->util_cfs; - util += sg_cpu->util_rt; + util = cpu_util_cfs(rq); + util += cpu_util_rt(rq); /* * Interrupt time is not seen by RQS utilization so we can compare * them with the CPU capacity */ - if ((util + sg_cpu->util_dl) >= max) + if ((util + cpu_util_dl(rq)) >= max) return max; /* @@ -231,11 +219,11 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) */ /* Weight RQS utilization to normal context window */ - util *= (max - sg_cpu->util_irq); + util *= (max - irq); util /= max; /* Add interrupt utilization */ - util += sg_cpu->util_irq; + util += irq; /* Add DL bandwidth requirement */ util += sg_cpu->bw_dl; @@ -418,9 +406,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - sugov_get_util(sg_cpu); + util = sugov_get_util(sg_cpu); max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -459,9 +446,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - sugov_get_util(j_sg_cpu); + j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - j_util = sugov_aggregate_util(j_sg_cpu); sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); if (j_util * max > j_max * util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2833e2b4b6a..061d51fb5b44 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2226,7 +2226,7 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) static inline unsigned long cpu_util_rt(struct rq *rq) { - return rq->avg_rt.util_avg; + return READ_ONCE(rq->avg_rt.util_avg); } #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -- cgit From 523e979d31648112bad07f427c183525c0258c75 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:12 +0200 Subject: sched/core: Use PELT for scale_rt_capacity() The utilization of the CPU by RT, DL and IRQs are now tracked with PELT so we can use these metrics instead of rt_avg to evaluate the remaining capacity available for CFS class. scale_rt_capacity() behavior has been changed and now returns the remaining capacity available for CFS instead of a scaling factor because RT, DL and IRQ provide now absolute utilization value. The same formula as schedutil is used: IRQ util_avg + (1 - IRQ util_avg / max capacity ) * /Sum rq util_avg but the implementation is different because it doesn't return the same value and doesn't benefit of the same optimization. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-10-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- kernel/sched/fair.c | 44 ++++++++++++++++++++++---------------------- kernel/sched/pelt.c | 2 +- kernel/sched/rt.c | 2 -- 4 files changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f4de26982d80..68b8a9f1c9ca 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1180,8 +1180,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (dl_entity_is_special(dl_se)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c2782b29c79f..d265fa9756a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7551,39 +7551,39 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, used, age_stamp, avg; - s64 delta; - - /* - * Since we're reading these variables without serialization make sure - * we read them once before doing sanity checks on them. - */ - age_stamp = READ_ONCE(rq->age_stamp); - avg = READ_ONCE(rq->rt_avg); - delta = __rq_clock_broken(rq) - age_stamp; + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long used, free; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + unsigned long irq; +#endif - if (unlikely(delta < 0)) - delta = 0; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + irq = READ_ONCE(rq->avg_irq.util_avg); - total = sched_avg_period() + delta; + if (unlikely(irq >= max)) + return 1; +#endif - used = div_u64(avg, total); + used = READ_ONCE(rq->avg_rt.util_avg); + used += READ_ONCE(rq->avg_dl.util_avg); - if (likely(used < SCHED_CAPACITY_SCALE)) - return SCHED_CAPACITY_SCALE - used; + if (unlikely(used >= max)) + return 1; - return 1; + free = max - used; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + free *= (max - irq); + free /= max; +#endif + return free; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = capacity; - - capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index ead6d8b4a8b8..35475c0c5419 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -237,7 +237,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna */ sa->load_avg = div_u64(load * sa->load_sum, divider); sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); - sa->util_avg = sa->util_sum / divider; + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0dc8ad1915e6..2df72abfa24a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -973,8 +973,6 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (!rt_bandwidth_enabled()) return; -- cgit From bbb62c0b024a1c721232667fa1d625cf6b3a555b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:13 +0200 Subject: sched/core: Remove the rt_avg code rt_avg is not used anywhere anymore, so we can remove all related code. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 -------------------------- kernel/sched/fair.c | 2 -- kernel/sched/sched.h | 17 ----------------- 3 files changed, 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 38107a95baca..a691b07390ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -651,23 +651,6 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ - -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); - - while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } -} - #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -5716,13 +5699,6 @@ void set_rq_offline(struct rq *rq) } } -static void set_cpu_rq_start_time(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - rq->age_stamp = sched_clock_cpu(cpu); -} - /* * used to mark begin/end of suspend/resume: */ @@ -5840,7 +5816,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { - set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -6108,7 +6083,6 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d265fa9756a2..d5f7d521e448 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5323,8 +5323,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } - - sched_avg_update(this_rq); } /* Used instead of source_load when we know the type == 0 */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 061d51fb5b44..14aac2d2de80 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -853,8 +853,6 @@ struct rq { struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; struct sched_avg avg_rt; struct sched_avg avg_dl; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) @@ -1719,11 +1717,6 @@ extern const_debug unsigned int sysctl_sched_time_avg; extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; -static inline u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - #ifdef CONFIG_SCHED_HRTICK /* @@ -1760,8 +1753,6 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1772,12 +1763,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif - -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); - sched_avg_update(rq); -} #else #ifndef arch_scale_cpu_capacity static __always_inline @@ -1786,8 +1771,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } -static inline void sched_avg_update(struct rq *rq) { } #endif struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -- cgit From 5fd778915ad29184a5ff8eb82d1118f6916b79e4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:14 +0200 Subject: sched/sysctl: Remove unused sched_time_avg_ms sysctl /proc/sys/kernel/sched_time_avg_ms entry is not used anywhere, remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luis R. Rodriguez Cc: Kees Cook Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 1 - kernel/sched/core.c | 8 -------- kernel/sched/sched.h | 1 - kernel/sysctl.c | 8 -------- 4 files changed, 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1c1a1512ec55..913488d828cb 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; -extern __read_mostly unsigned int sysctl_sched_time_avg; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a691b07390ab..ba6bb805693a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -46,14 +46,6 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; -/* - * period over which we average the RT time consumption, measured - * in ms. - * - * default: 1s - */ -const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; - /* * period over which we measure -rt task CPU usage in us. * default: 1s diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14aac2d2de80..ebb4b3c3ece7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1713,7 +1713,6 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -extern const_debug unsigned int sysctl_sched_time_avg; extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d9837c0aff4..f22f76b7a138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_time_avg_ms", - .data = &sysctl_sched_time_avg, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit From 45f5519ec55e75af3565dd737586d3b041834f71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jul 2018 14:36:17 +0200 Subject: sched/cpufreq: Clarify sugov_get_util() Add a few comments to (hopefully) clarifying some of the magic in sugov_get_util(). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/20180705123617.GM2458@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 75 +++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c9622b3f183d..97dcd4472a0e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -177,6 +177,26 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); @@ -188,47 +208,60 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) if (rt_rq_is_runnable(&rq->rt)) return max; + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) return max; - /* Sum rq utilization */ + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ util = cpu_util_cfs(rq); util += cpu_util_rt(rq); /* - * Interrupt time is not seen by RQS utilization so we can compare - * them with the CPU capacity + * We do not make cpu_util_dl() a permanent part of this sum because we + * want to use cpu_bw_dl() later on, but we need to check if the + * CFS+RT+DL sum is saturated (ie. no idle time) such that we select + * f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. */ if ((util + cpu_util_dl(rq)) >= max) return max; /* - * As there is still idle time on the CPU, we need to compute the - * utilization level of the CPU. + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: * + * 1 - irq + * U' = irq + ------- * U + * max + */ + util *= (max - irq); + util /= max; + util += irq; + + /* * Bandwidth required by DEADLINE must always be granted while, for * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * - * Ideally we would like to set util_dl as min/guaranteed freq and - * util_cfs + util_dl as requested freq. However, cpufreq is not yet - * ready for such an interface. So, we only do the latter for now. + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. */ - - /* Weight RQS utilization to normal context window */ - util *= (max - irq); - util /= max; - - /* Add interrupt utilization */ - util += irq; - - /* Add DL bandwidth requirement */ - util += sg_cpu->bw_dl; - - return min(max, util); + return min(max, util + sg_cpu->bw_dl); } /** -- cgit From af0fffd9300b97d8875aa745bc78e2f6fdb3c1f0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 6 Jul 2018 15:06:15 +0200 Subject: sched/core: Remove get_cpu() from sched_fork() get_cpu() disables preemption for the entire sched_fork() function. This get_cpu() was introduced in commit: dd41f596cda0 ("sched: cfs core code") ... which also invoked sched_balance_self() and this function required preemption do be off. Today, sched_balance_self() seems to be moved to ->task_fork callback which is invoked while the ->pi_lock is held. set_load_weight() could invoke reweight_task() which then via $callchain might end up in smp_processor_id() but since `update_load' is false this won't happen. I didn't find any this_cpu*() or similar usage during the initialisation of the task_struct. The `cpu' value (from get_cpu()) is only used later in __set_task_cpu() while the ->pi_lock lock is held. Based on this it is possible to remove get_cpu() and use smp_processor_id() for the `cpu' variable without breaking anything. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180706130615.g2ex2kmfu5kcvlq6@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ba6bb805693a..c3cf7d992159 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2294,7 +2294,6 @@ static inline void init_schedstats(void) {} int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); __sched_fork(clone_flags, p); /* @@ -2330,14 +2329,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) { - put_cpu(); + if (dl_prio(p->prio)) return -EAGAIN; - } else if (rt_prio(p->prio)) { + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; - } else { + else p->sched_class = &fair_sched_class; - } init_entity_runnable_average(&p->se); @@ -2353,7 +2350,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). */ - __set_task_cpu(p, cpu); + __set_task_cpu(p, smp_processor_id()); if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2370,8 +2367,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif - - put_cpu(); return 0; } -- cgit From 788faab70d5a882693286b8d5022779559c79904 Mon Sep 17 00:00:00 2001 From: Tobias Tefke Date: Mon, 9 Jul 2018 12:57:15 +0200 Subject: perf, tools: Use correct articles in comments Some of the comments in the perf events code use articles incorrectly, using 'a' for words beginning with a vowel sound, where 'an' should be used. Signed-off-by: Tobias Tefke Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/20180709105715.22938-1-tobias.tefke@tutanota.com [ Fix a few more perf related 'a event' typo fixes from all around the kernel and tooling tree. ] Signed-off-by: Ingo Molnar --- arch/powerpc/perf/core-book3s.c | 6 +++--- include/linux/perf_event.h | 2 +- kernel/events/core.c | 16 ++++++++-------- kernel/events/uprobes.c | 6 +++--- tools/perf/Documentation/perf-list.txt | 2 +- tools/perf/Documentation/perf-record.txt | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3f66fcf8ad99..19d8ab49d1bd 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1469,7 +1469,7 @@ static int collect_events(struct perf_event *group, int max_count, } /* - * Add a event to the PMU. + * Add an event to the PMU. * If all events are not already frozen, then we disable and * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. @@ -1548,7 +1548,7 @@ nocheck: } /* - * Remove a event from the PMU. + * Remove an event from the PMU. */ static void power_pmu_del(struct perf_event *event, int ef_flags) { @@ -1742,7 +1742,7 @@ static int power_pmu_commit_txn(struct pmu *pmu) /* * Return 1 if we might be able to put event on a limited PMC, * or 0 if not. - * A event can only go on a limited PMC if it counts something + * An event can only go on a limited PMC if it counts something * that a limited PMC can count, doesn't require interrupts, and * doesn't exclude any processor mode. */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1fa12887ec02..e6dd3a2f8ec4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -490,7 +490,7 @@ struct perf_addr_filters_head { }; /** - * enum perf_event_state - the states of a event + * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, diff --git a/kernel/events/core.c b/kernel/events/core.c index a4d5ac6d74af..86b31e5a5a9a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1656,7 +1656,7 @@ perf_event_groups_next(struct perf_event *event) typeof(*event), group_node)) /* - * Add a event from the lists for its context. + * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -1844,7 +1844,7 @@ static void perf_group_attach(struct perf_event *event) } /* - * Remove a event from the lists for its context. + * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -2148,7 +2148,7 @@ static void __perf_event_disable(struct perf_event *event, } /* - * Disable a event. + * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2677,7 +2677,7 @@ static void __perf_event_enable(struct perf_event *event, } /* - * Enable a event. + * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2755,7 +2755,7 @@ static int __perf_event_stop(void *info) * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * - * Since this is happening on a event-local CPU, no trace is lost + * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) @@ -4827,7 +4827,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count) int ret; /* - * Return end-of-file for a read on a event that is in + * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ @@ -9898,7 +9898,7 @@ enabled: } /* - * Allocate and initialize a event structure + * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, @@ -11229,7 +11229,7 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) } /* - * Inherit a event from parent task to child task. + * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ccc579a7d32e..aed1ba569954 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -918,7 +918,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * EXPORT_SYMBOL_GPL(uprobe_register); /* - * uprobe_apply - unregister a already registered probe. + * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. * @uc: consumer which wants to add more or remove some breakpoints @@ -947,7 +947,7 @@ int uprobe_apply(struct inode *inode, loff_t offset, } /* - * uprobe_unregister - unregister a already registered probe. + * uprobe_unregister - unregister an already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. * @uc: identify which probe if multiple probes are colocated. @@ -1403,7 +1403,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri) /* * Called with no locks held. - * Called in context of a exiting or a exec-ing thread. + * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 11300dbe35c5..14e13512c05f 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -234,7 +234,7 @@ perf also supports group leader sampling using the :S specifier. perf record -e '{cycles,instructions}:S' ... perf report --group -Normally all events in a event group sample, but with :S only +Normally all events in an event group sample, but with :S only the first event (the leader) samples, and it only reads the values of the other events in the group. diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 04168da4268e..246dee081efd 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -94,7 +94,7 @@ OPTIONS "perf report" to view group events together. --filter=:: - Event filter. This option should follow a event selector (-e) which + Event filter. This option should follow an event selector (-e) which selects either tracepoint event(s) or a hardware trace PMU (e.g. Intel PT or CoreSight). @@ -153,7 +153,7 @@ OPTIONS --exclude-perf:: Don't record events issued by perf itself. This option should follow - a event selector (-e) which selects tracepoint event(s). It adds a + an event selector (-e) which selects tracepoint event(s). It adds a filter expression 'common_pid != $PERFPID' to filters. If other '--filter' exists, the new filter expression will be combined with them by '&&'. -- cgit From 1d98a69e5cef3aeb68bcefab0e67e342d6bb4dad Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 12 Jul 2018 13:35:06 +0530 Subject: livepatch: Remove reliable stacktrace check in klp_try_switch_task() Support for immediate flag was removed by commit d0807da78e11 ("livepatch: Remove immediate feature"). We bail out during patch registration for architectures, those don't support reliable stack trace. Remove the check in klp_try_switch_task(), as its not required. Signed-off-by: Kamalesh Babulal Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 7c6631e693bc..5bc349805e03 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -309,13 +309,6 @@ static bool klp_try_switch_task(struct task_struct *task) if (task->patch_state == klp_target_state) return true; - /* - * For arches which don't have reliable stack traces, we have to rely - * on other methods (e.g., switching tasks at kernel exit). - */ - if (!klp_have_reliable_stack()) - return false; - /* * Now try to check the stack for any to-be-patched or to-be-unpatched * functions. If all goes well, switch the task to the target patch -- cgit From a210fd32a46bae6d05b43860fe3b47732501d63b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 13 Jul 2018 14:05:57 -0400 Subject: kexec: add call to LSM hook in original kexec_load syscall In order for LSMs and IMA-appraisal to differentiate between kexec_load and kexec_file_load syscalls, both the original and new syscalls must call an LSM hook. This patch adds a call to security_kernel_load_data() in the original kexec_load syscall. Signed-off-by: Mimi Zohar Cc: Eric Biederman Cc: Kees Cook Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/kexec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index aed8fb2564b3..68559808fdfa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -195,10 +196,17 @@ out: static inline int kexec_load_check(unsigned long nr_segments, unsigned long flags) { + int result; + /* We only trust the superuser with rebooting the system. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; + /* Permit LSMs and IMA to fail the kexec */ + result = security_kernel_load_data(LOADING_KEXEC_IMAGE); + if (result < 0) + return result; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. -- cgit From c77b8cdf745d91eca138e7bfa430dc6640b604a0 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 13 Jul 2018 14:06:02 -0400 Subject: module: replace the existing LSM hook in init_module Both the init_module and finit_module syscalls call either directly or indirectly the security_kernel_read_file LSM hook. This patch replaces the direct call in init_module with a call to the new security_kernel_load_data hook and makes the corresponding changes in SELinux, LoadPin, and IMA. Signed-off-by: Mimi Zohar Cc: Jeff Vander Stoep Cc: Casey Schaufler Cc: Kees Cook Acked-by: Jessica Yu Acked-by: Paul Moore Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/module.c | 2 +- security/integrity/ima/ima_main.c | 23 ++++++++++------------- security/loadpin/loadpin.c | 6 ++++++ security/selinux/hooks.c | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f475f30eed8c..a7615d661910 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2876,7 +2876,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_read_file(NULL, READING_MODULE); + err = security_kernel_load_data(LOADING_MODULE); if (err) return err; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index e467664965e7..ef349a761609 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -429,16 +429,6 @@ void ima_post_path_mknod(struct dentry *dentry) */ int ima_read_file(struct file *file, enum kernel_read_file_id read_id) { - bool sig_enforce = is_module_sig_enforced(); - - if (!file && read_id == READING_MODULE) { - if (!sig_enforce && (ima_appraise & IMA_APPRAISE_MODULES) && - (ima_appraise & IMA_APPRAISE_ENFORCE)) { - pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n"); - return -EACCES; /* INTEGRITY_UNKNOWN */ - } - return 0; /* We rely on module signature checking */ - } return 0; } @@ -479,9 +469,6 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, return 0; } - if (!file && read_id == READING_MODULE) /* MODULE_SIG_FORCE enabled */ - return 0; - /* permit signed certs */ if (!file && read_id == READING_X509_CERTIFICATE) return 0; @@ -510,6 +497,8 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, */ int ima_load_data(enum kernel_load_data_id id) { + bool sig_enforce; + if ((ima_appraise & IMA_APPRAISE_ENFORCE) != IMA_APPRAISE_ENFORCE) return 0; @@ -525,6 +514,14 @@ int ima_load_data(enum kernel_load_data_id id) pr_err("Prevent firmware sysfs fallback loading.\n"); return -EACCES; /* INTEGRITY_UNKNOWN */ } + break; + case LOADING_MODULE: + sig_enforce = is_module_sig_enforced(); + + if (!sig_enforce && (ima_appraise & IMA_APPRAISE_MODULES)) { + pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n"); + return -EACCES; /* INTEGRITY_UNKNOWN */ + } default: break; } diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index 5fa191252c8f..0716af28808a 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -173,9 +173,15 @@ static int loadpin_read_file(struct file *file, enum kernel_read_file_id id) return 0; } +static int loadpin_load_data(enum kernel_load_data_id id) +{ + return loadpin_read_file(NULL, (enum kernel_read_file_id) id); +} + static struct security_hook_list loadpin_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security), LSM_HOOK_INIT(kernel_read_file, loadpin_read_file), + LSM_HOOK_INIT(kernel_load_data, loadpin_load_data), }; void __init loadpin_add_hooks(void) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2b5ee5fbd652..a8bf324130f5 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4073,6 +4073,20 @@ static int selinux_kernel_read_file(struct file *file, return rc; } +static int selinux_kernel_load_data(enum kernel_load_data_id id) +{ + int rc = 0; + + switch (id) { + case LOADING_MODULE: + rc = selinux_kernel_module_from_file(NULL); + default: + break; + } + + return rc; +} + static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return avc_has_perm(&selinux_state, @@ -6972,6 +6986,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), + LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data), LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), -- cgit From 76e079fefc8f62bd9b2cd2950814d1ee806e31a5 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 16 Jul 2018 11:06:01 -0700 Subject: sched/core: Use smp_mb() in wake_woken_function() wake_woken_function() synchronizes with wait_woken() as follows: [wait_woken] [wake_woken_function] entry->flags &= ~wq_flag_woken; condition = true; smp_mb(); smp_wmb(); if (condition) wq_entry->flags |= wq_flag_woken; break; This commit replaces the above smp_wmb() with an smp_mb() in order to guarantee that either wait_woken() sees the wait condition being true or the store to wq_entry->flags in woken_wake_function() follows the store in wait_woken() in the coherence order (so that the former can eventually be observed by wait_woken()). The commit also fixes a comment associated to set_current_state() in wait_woken(): the comment pairs the barrier in set_current_state() to the above smp_wmb(), while the actual pairing involves the barrier in set_current_state() and the barrier executed by the try_to_wake_up() in wake_woken_function(). Signed-off-by: Andrea Parri Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akiyks@gmail.com Cc: boqun.feng@gmail.com Cc: dhowells@redhat.com Cc: j.alglave@ucl.ac.uk Cc: linux-arch@vger.kernel.org Cc: luc.maranget@inria.fr Cc: npiggin@gmail.com Cc: parri.andrea@gmail.com Cc: stern@rowland.harvard.edu Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20180716180605.16115-10-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 928be527477e..a7a2aaa3026a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,35 +392,36 @@ static inline bool is_kthread_should_stop(void) * if (condition) * break; * - * p->state = mode; condition = true; - * smp_mb(); // A smp_wmb(); // C - * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; - * schedule() try_to_wake_up(); - * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; - * smp_mb() // B smp_wmb(); // C - * wq_entry->flags |= WQ_FLAG_WOKEN; - * } - * remove_wait_queue(&wq_head, &wait); + * // in wait_woken() // in woken_wake_function() * + * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN; + * smp_mb(); // A try_to_wake_up(): + * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) + * schedule() if (p->state & mode) + * p->state = TASK_RUNNING; p->state = TASK_RUNNING; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~ + * smp_mb(); // B condition = true; + * } smp_mb(); // C + * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; */ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { - set_current_state(mode); /* A */ /* - * The above implies an smp_mb(), which matches with the smp_wmb() from - * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must - * also observe all state before the wakeup. + * The below executes an smp_mb(), which matches with the full barrier + * executed by the try_to_wake_up() in woken_wake_function() such that + * either we see the store to wq_entry->flags in woken_wake_function() + * or woken_wake_function() sees our store to current->state. */ + set_current_state(mode); /* A */ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); /* - * The below implies an smp_mb(), it too pairs with the smp_wmb() from - * woken_wake_function() such that we must either observe the wait - * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss - * an event. + * The below executes an smp_mb(), which matches with the smp_mb() (C) + * in woken_wake_function() such that either we see the wait condition + * being true or the store to wq_entry->flags in woken_wake_function() + * follows ours in the coherence order. */ smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ @@ -430,14 +431,8 @@ EXPORT_SYMBOL(wait_woken); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - /* - * Although this function is called under waitqueue lock, LOCK - * doesn't imply write barrier and the users expects write - * barrier semantics on wakeup functions. The following - * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with smp_store_mb() in wait_woken(). - */ - smp_wmb(); /* C */ + /* Pairs with the smp_store_mb() in wait_woken(). */ + smp_mb(); /* C */ wq_entry->flags |= WQ_FLAG_WOKEN; return default_wake_function(wq_entry, mode, sync, key); -- cgit From 3d85b2703783636366560c94842affd8608ec9d1 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 16 Jul 2018 11:06:02 -0700 Subject: locking/spinlock, sched/core: Clarify requirements for smp_mb__after_spinlock() There are 11 interpretations of the requirements described in the header comment for smp_mb__after_spinlock(): one for each LKMM maintainer, and one currently encoded in the Cat file. Stick to the latter (until a more satisfactory solution is available). This also reworks some snippets related to the barrier to illustrate the requirements and to link them to the idioms which are relied upon at its call sites. Suggested-by: Boqun Feng Signed-off-by: Andrea Parri Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Will Deacon Cc: akiyks@gmail.com Cc: dhowells@redhat.com Cc: j.alglave@ucl.ac.uk Cc: linux-arch@vger.kernel.org Cc: luc.maranget@inria.fr Cc: npiggin@gmail.com Cc: parri.andrea@gmail.com Cc: stern@rowland.harvard.edu Link: http://lkml.kernel.org/r/20180716180605.16115-11-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 53 ++++++++++++++++++++++++++++++++---------------- kernel/sched/core.c | 41 +++++++++++++++++++------------------ 2 files changed, 57 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index fd57888d4942..3190997df9ca 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -114,29 +114,48 @@ do { \ #endif /*arch_spin_is_contended*/ /* - * This barrier must provide two things: + * smp_mb__after_spinlock() provides the equivalent of a full memory barrier + * between program-order earlier lock acquisitions and program-order later + * memory accesses. * - * - it must guarantee a STORE before the spin_lock() is ordered against a - * LOAD after it, see the comments at its two usage sites. + * This guarantees that the following two properties hold: * - * - it must ensure the critical section is RCsc. + * 1) Given the snippet: * - * The latter is important for cases where we observe values written by other - * CPUs in spin-loops, without barriers, while being subject to scheduling. + * { X = 0; Y = 0; } * - * CPU0 CPU1 CPU2 + * CPU0 CPU1 * - * for (;;) { - * if (READ_ONCE(X)) - * break; - * } - * X=1 - * - * - * r = X; + * WRITE_ONCE(X, 1); WRITE_ONCE(Y, 1); + * spin_lock(S); smp_mb(); + * smp_mb__after_spinlock(); r1 = READ_ONCE(X); + * r0 = READ_ONCE(Y); + * spin_unlock(S); * - * without transitivity it could be that CPU1 observes X!=0 breaks the loop, - * we get migrated and CPU2 sees X==0. + * it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0) + * and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments + * preceding the call to smp_mb__after_spinlock() in __schedule() and in + * try_to_wake_up(). + * + * 2) Given the snippet: + * + * { X = 0; Y = 0; } + * + * CPU0 CPU1 CPU2 + * + * spin_lock(S); spin_lock(S); r1 = READ_ONCE(Y); + * WRITE_ONCE(X, 1); smp_mb__after_spinlock(); smp_rmb(); + * spin_unlock(S); r0 = READ_ONCE(X); r2 = READ_ONCE(X); + * WRITE_ONCE(Y, 1); + * spin_unlock(S); + * + * it is forbidden that CPU0's critical section executes before CPU1's + * critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1) + * and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments + * preceding the calls to smp_rmb() in try_to_wake_up() for similar + * snippets but "projected" onto two CPUs. + * + * Property (2) upgrades the lock to an RCsc lock. * * Since most load-store architectures implement ACQUIRE with an smp_mb() after * the LL/SC loop, they need no further barriers. Similarly all our TSO diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..0c5ec2abdf93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1998,21 +1998,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * be possible to, falsely, observe p->on_rq == 0 and get stuck * in smp_cond_load_acquire() below. * - * sched_ttwu_pending() try_to_wake_up() - * [S] p->on_rq = 1; [L] P->state - * UNLOCK rq->lock -----. - * \ - * +--- RMB - * schedule() / - * LOCK rq->lock -----' - * UNLOCK rq->lock + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock * * [task p] - * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq * - * Pairs with the UNLOCK+LOCK on rq->lock from the - * last wakeup of our task and the schedule that got our task - * current. + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2026,15 +2025,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * One must be running (->on_cpu == 1) in order to remove oneself * from the runqueue. * - * [S] ->on_cpu = 1; [L] ->on_rq - * UNLOCK rq->lock - * RMB - * LOCK rq->lock - * [S] ->on_rq = 0; [L] ->on_cpu + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu * - * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock - * from the consecutive calls to schedule(); the first switching to our - * task, the second putting it to sleep. + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). */ smp_rmb(); -- cgit From 7696f9910a9a40b8a952f57d3428515fabd2d889 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 16 Jul 2018 11:06:03 -0700 Subject: sched/Documentation: Update wake_up() & co. memory-barrier guarantees Both the implementation and the users' expectation [1] for the various wakeup primitives have evolved over time, but the documentation has not kept up with these changes: brings it into 2018. [1] http://lkml.kernel.org/r/20180424091510.GB4064@hirez.programming.kicks-ass.net Also applied feedback from Alan Stern. Suggested-by: Peter Zijlstra Signed-off-by: Andrea Parri Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Cc: Akira Yokosawa Cc: Alan Stern Cc: Boqun Feng Cc: Daniel Lustig Cc: David Howells Cc: Jade Alglave Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Luc Maranget Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Cc: parri.andrea@gmail.com Link: http://lkml.kernel.org/r/20180716180605.16115-12-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 43 ++++++++++++++++++++++++--------------- include/linux/sched.h | 4 ++-- kernel/sched/completion.c | 8 ++++---- kernel/sched/core.c | 30 +++++++++++---------------- kernel/sched/wait.c | 8 ++++---- 5 files changed, 49 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index a02d6bbfc9d0..0d8d7ef131e9 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -2179,32 +2179,41 @@ or: event_indicated = 1; wake_up_process(event_daemon); -A write memory barrier is implied by wake_up() and co. if and only if they -wake something up. The barrier occurs before the task state is cleared, and so -sits between the STORE to indicate the event and the STORE to set TASK_RUNNING: +A general memory barrier is executed by wake_up() if it wakes something up. +If it doesn't wake anything up then a memory barrier may or may not be +executed; you must not rely on it. The barrier occurs before the task state +is accessed, in particular, it sits between the STORE to indicate the event +and the STORE to set TASK_RUNNING: - CPU 1 CPU 2 + CPU 1 (Sleeper) CPU 2 (Waker) =============================== =============================== set_current_state(); STORE event_indicated smp_store_mb(); wake_up(); - STORE current->state - STORE current->state - LOAD event_indicated + STORE current->state ... + + LOAD event_indicated if ((LOAD task->state) & TASK_NORMAL) + STORE task->state -To repeat, this write memory barrier is present if and only if something -is actually awakened. To see this, consider the following sequence of -events, where X and Y are both initially zero: +where "task" is the thread being woken up and it equals CPU 1's "current". + +To repeat, a general memory barrier is guaranteed to be executed by wake_up() +if something is actually awakened, but otherwise there is no such guarantee. +To see this, consider the following sequence of events, where X and Y are both +initially zero: CPU 1 CPU 2 =============================== =============================== - X = 1; STORE event_indicated + X = 1; Y = 1; smp_mb(); wake_up(); - Y = 1; wait_event(wq, Y == 1); - wake_up(); load from Y sees 1, no memory barrier - load from X might see 0 + LOAD Y LOAD X + +If a wakeup does occur, one (at least) of the two loads must see 1. If, on +the other hand, a wakeup does not occur, both loads might see 0. -In contrast, if a wakeup does occur, CPU 2's load from X would be guaranteed -to see 1. +wake_up_process() always executes a general memory barrier. The barrier again +occurs before the task state is accessed. In particular, if the wake_up() in +the previous snippet were replaced by a call to wake_up_process() then one of +the two loads would be guaranteed to see 1. The available waker functions include: @@ -2224,6 +2233,8 @@ The available waker functions include: wake_up_poll(); wake_up_process(); +In terms of memory ordering, these functions all provide the same guarantees of +a wake_up() (or stronger). [!] Note that the memory barriers implied by the sleeper and the waker do _not_ order multiple stores before the wake-up with respect to loads of those stored diff --git a/include/linux/sched.h b/include/linux/sched.h index 43731fe51c97..05cd419f962d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -167,8 +167,8 @@ struct task_group; * need_sleep = false; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * - * Where wake_up_state() (and all other wakeup primitives) imply enough - * barriers to order the store of the variable against wakeup. + * where wake_up_state() executes a full memory barrier before accessing the + * task state. * * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index e426b0cb9ac6..a1ad5b7d5521 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -22,8 +22,8 @@ * * See also complete_all(), wait_for_completion() and related routines. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void complete(struct completion *x) { @@ -44,8 +44,8 @@ EXPORT_SYMBOL(complete); * * This will wake up all threads waiting on this particular completion event. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. * * Since complete_all() sets the completion of @x permanently to done * to allow multiple waiters to finish, a call to reinit_completion() diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c5ec2abdf93..a0065c84e73f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -412,8 +412,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). + * This cmpxchg() executes a full barrier, which pairs with the full + * barrier executed by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -441,8 +441,8 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. + * wake_up_process() executes a full barrier, which pairs with + * the queueing in wake_q_add() so as not to miss wakeups. */ wake_up_process(task); put_task_struct(task); @@ -1879,8 +1879,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * rq(c1)->lock (if not at the same time, then in that order). * C) LOCK of the rq(c1)->lock scheduling in task * - * Transitivity guarantees that B happens after A and C after B. - * Note: we only require RCpc transitivity. + * Release/acquire chaining guarantees that B happens after A and C after B. * Note: the CPU doing B need not be c0 or c1 * * Example: @@ -1942,16 +1941,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * UNLOCK rq(0)->lock * * - * However; for wakeups there is a second guarantee we must provide, namely we - * must observe the state that lead to our wakeup. That is, not only must our - * task observe its own prior state, it must also observe the stores prior to - * its wakeup. - * - * This means that any means of doing remote wakeups must order the CPU doing - * the wakeup against the CPU the task is going to end up running on. This, - * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). - * + * However, for wakeups there is a second guarantee we must provide, namely we + * must ensure that CONDITION=1 done by the caller can not be reordered with + * accesses to the task state; see try_to_wake_up() and set_current_state(). */ /** @@ -1967,6 +1959,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * Atomic against schedule() which would dequeue a task, also see * set_current_state(). * + * This function executes a full memory barrier before accessing the task + * state; see set_current_state(). + * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ @@ -2141,8 +2136,7 @@ out: * * Return: 1 if the process was woken up, 0 if it was already running. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * This function executes a full memory barrier before accessing the task state. */ int wake_up_process(struct task_struct *p) { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index a7a2aaa3026a..870f97b313e3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -134,8 +134,8 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) @@ -180,8 +180,8 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * * On UP it can prevent extra preemption. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) -- cgit From c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 16 Jul 2018 15:03:31 -0400 Subject: mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids The mm_struct always contains a cpumask bitmap, regardless of CONFIG_CPUMASK_OFFSTACK. That means the first step can be to simplify things, and simply have one bitmask at the end of the mm_struct for the mm_cpumask. This does necessitate moving everything else in mm_struct into an anonymous sub-structure, which can be randomized when struct randomization is enabled. The second step is to determine the correct size for the mm_struct slab object from the size of the mm_struct (excluding the CPU bitmap) and the size the cpumask. For init_mm we can simply allocate the maximum size this kernel is compiled for, since we only have one init_mm in the system, anyway. Pointer magic by Mike Galbraith, to evade -Wstringop-overflow getting confused by the dynamically sized array. Tested-by: Song Liu Signed-off-by: Rik van Riel Signed-off-by: Mike Galbraith Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: luto@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-2-riel@surriel.com Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 1 + include/linux/mm_types.h | 241 +++++++++++++++++++++++---------------------- kernel/fork.c | 15 +-- mm/init-mm.c | 11 +++ 4 files changed, 145 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 232f4915223b..7f0b19410a95 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -82,6 +82,7 @@ struct mm_struct efi_mm = { .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), + .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, }; static bool disable_runtime; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 99ce070e7dcb..efdc24dd9e97 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -335,176 +335,183 @@ struct core_state { struct kioctx_table; struct mm_struct { - struct vm_area_struct *mmap; /* list of VMAs */ - struct rb_root mm_rb; - u32 vmacache_seqnum; /* per-thread vmacache */ + struct { + struct vm_area_struct *mmap; /* list of VMAs */ + struct rb_root mm_rb; + u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU - unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif - unsigned long mmap_base; /* base of mmap area */ - unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ + unsigned long mmap_base; /* base of mmap area */ + unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES - /* Base adresses for compatible mmap() */ - unsigned long mmap_compat_base; - unsigned long mmap_compat_legacy_base; + /* Base adresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; #endif - unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ - pgd_t * pgd; - - /** - * @mm_users: The number of users including userspace. - * - * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops - * to 0 (i.e. when the task exits and there are no other temporary - * reference holders), we also release a reference on @mm_count - * (which may then free the &struct mm_struct if @mm_count also - * drops to 0). - */ - atomic_t mm_users; - - /** - * @mm_count: The number of references to &struct mm_struct - * (@mm_users count as 1). - * - * Use mmgrab()/mmdrop() to modify. When this drops to 0, the - * &struct mm_struct is freed. - */ - atomic_t mm_count; + unsigned long task_size; /* size of task vm space */ + unsigned long highest_vm_end; /* highest vma end address */ + pgd_t * pgd; + + /** + * @mm_users: The number of users including userspace. + * + * Use mmget()/mmget_not_zero()/mmput() to modify. When this + * drops to 0 (i.e. when the task exits and there are no other + * temporary reference holders), we also release a reference on + * @mm_count (which may then free the &struct mm_struct if + * @mm_count also drops to 0). + */ + atomic_t mm_users; + + /** + * @mm_count: The number of references to &struct mm_struct + * (@mm_users count as 1). + * + * Use mmgrab()/mmdrop() to modify. When this drops to 0, the + * &struct mm_struct is freed. + */ + atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t pgtables_bytes; /* PTE page table pages */ + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif - int map_count; /* number of VMAs */ + int map_count; /* number of VMAs */ - spinlock_t page_table_lock; /* Protects page tables and some counters */ - struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects page tables and some + * counters + */ + struct rw_semaphore mmap_sem; - struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung - * together off init_mm.mmlist, and are protected - * by mmlist_lock - */ + struct list_head mmlist; /* List of maybe swapped mm's. These + * are globally strung together off + * init_mm.mmlist, and are protected + * by mmlist_lock + */ - unsigned long hiwater_rss; /* High-watermark of RSS usage */ - unsigned long hiwater_vm; /* High-water virtual memory usage */ + unsigned long hiwater_rss; /* High-watermark of RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ - unsigned long total_vm; /* Total pages mapped */ - unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long pinned_vm; /* Refcount permanently increased */ - unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ - unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ - unsigned long stack_vm; /* VM_STACK */ - unsigned long def_flags; + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + unsigned long pinned_vm; /* Refcount permanently increased */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ + unsigned long def_flags; - spinlock_t arg_lock; /* protect the below fields */ - unsigned long start_code, end_code, start_data, end_data; - unsigned long start_brk, brk, start_stack; - unsigned long arg_start, arg_end, env_start, env_end; + spinlock_t arg_lock; /* protect the below fields */ + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; - unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - /* - * Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - struct mm_rss_stat rss_stat; - - struct linux_binfmt *binfmt; + /* + * Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. + */ + struct mm_rss_stat rss_stat; - cpumask_var_t cpu_vm_mask_var; + struct linux_binfmt *binfmt; - /* Architecture-specific MM context */ - mm_context_t context; + /* Architecture-specific MM context */ + mm_context_t context; - unsigned long flags; /* Must use atomic bitops to access the bits */ + unsigned long flags; /* Must use atomic bitops to access */ - struct core_state *core_state; /* coredumping support */ + struct core_state *core_state; /* coredumping support */ #ifdef CONFIG_MEMBARRIER - atomic_t membarrier_state; + atomic_t membarrier_state; #endif #ifdef CONFIG_AIO - spinlock_t ioctx_lock; - struct kioctx_table __rcu *ioctx_table; + spinlock_t ioctx_lock; + struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MEMCG - /* - * "owner" points to a task that is regarded as the canonical - * user/owner of this mm. All of the following must be true in - * order for it to be changed: - * - * current == mm->owner - * current->mm != mm - * new_owner->mm == mm - * new_owner->alloc_lock is held - */ - struct task_struct __rcu *owner; + /* + * "owner" points to a task that is regarded as the canonical + * user/owner of this mm. All of the following must be true in + * order for it to be changed: + * + * current == mm->owner + * current->mm != mm + * new_owner->mm == mm + * new_owner->alloc_lock is held + */ + struct task_struct __rcu *owner; #endif - struct user_namespace *user_ns; + struct user_namespace *user_ns; - /* store ref to file /proc//exe symlink points to */ - struct file __rcu *exe_file; + /* store ref to file /proc//exe symlink points to */ + struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER - struct mmu_notifier_mm *mmu_notifier_mm; + struct mmu_notifier_mm *mmu_notifier_mm; #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - pgtable_t pmd_huge_pte; /* protected by page_table_lock */ -#endif -#ifdef CONFIG_CPUMASK_OFFSTACK - struct cpumask cpumask_allocation; + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING - /* - * numa_next_scan is the next time that the PTEs will be marked - * pte_numa. NUMA hinting faults will gather statistics and migrate - * pages to new nodes if necessary. - */ - unsigned long numa_next_scan; + /* + * numa_next_scan is the next time that the PTEs will be marked + * pte_numa. NUMA hinting faults will gather statistics and + * migrate pages to new nodes if necessary. + */ + unsigned long numa_next_scan; - /* Restart point for scanning and setting pte_numa */ - unsigned long numa_scan_offset; + /* Restart point for scanning and setting pte_numa */ + unsigned long numa_scan_offset; - /* numa_scan_seq prevents two threads setting pte_numa */ - int numa_scan_seq; + /* numa_scan_seq prevents two threads setting pte_numa */ + int numa_scan_seq; #endif - /* - * An operation with batched TLB flushing is going on. Anything that - * can move process memory needs to flush the TLB when moving a - * PROT_NONE or PROT_NUMA mapped page. - */ - atomic_t tlb_flush_pending; + /* + * An operation with batched TLB flushing is going on. Anything + * that can move process memory needs to flush the TLB when + * moving a PROT_NONE or PROT_NUMA mapped page. + */ + atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + /* See flush_tlb_batched_pending() */ + bool tlb_flush_batched; #endif - struct uprobes_state uprobes_state; + struct uprobes_state uprobes_state; #ifdef CONFIG_HUGETLB_PAGE - atomic_long_t hugetlb_usage; + atomic_long_t hugetlb_usage; #endif - struct work_struct async_put_work; + struct work_struct async_put_work; #if IS_ENABLED(CONFIG_HMM) - /* HMM needs to track a few things per mm */ - struct hmm *hmm; + /* HMM needs to track a few things per mm */ + struct hmm *hmm; #endif -} __randomize_layout; + } __randomize_layout; + + /* + * The mm_cpumask needs to be at the end of mm_struct, because it + * is dynamically sized based on nr_cpu_ids. + */ + unsigned long cpu_bitmap[]; +}; extern struct mm_struct init_mm; +/* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { -#ifdef CONFIG_CPUMASK_OFFSTACK - mm->cpu_vm_mask_var = &mm->cpumask_allocation; -#endif - cpumask_clear(mm->cpu_vm_mask_var); + unsigned long cpu_bitmap = (unsigned long)mm; + + cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); + cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { - return mm->cpu_vm_mask_var; + return (struct cpumask *)&mm->cpu_bitmap; } struct mmu_gather; diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..5b64c1b8461e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2253,6 +2253,8 @@ static void sighand_ctor(void *data) void __init proc_caches_init(void) { + unsigned int mm_size; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -2269,15 +2271,16 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* - * FIXME! The "sizeof(struct mm_struct)" currently includes the - * whole struct cpumask for the OFFSTACK case. We could change - * this to *only* allocate as much of it as required by the - * maximum number of CPU's we can ever have. The cpumask_allocation - * is at the end of the structure, exactly for that reason. + * The mm_cpumask is located at the end of mm_struct, and is + * dynamically sized based on the maximum CPU number this system + * can have, taking hotplug into account (nr_cpu_ids). */ + mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_cachep = kmem_cache_create_usercopy("mm_struct", - sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), diff --git a/mm/init-mm.c b/mm/init-mm.c index f0179c9c04c2..a787a319211e 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -15,6 +15,16 @@ #define INIT_MM_CONTEXT(name) #endif +/* + * For dynamically allocated mm_structs, there is a dynamically sized cpumask + * at the end of the structure, the size of which depends on the maximum CPU + * number the system can see. That way we allocate only as much memory for + * mm_cpumask() as needed for the hundreds, or thousands of processes that + * a system typically runs. + * + * Since there is only one init_mm in the entire system, keep it simple + * and size this cpu_bitmask to NR_CPUS. + */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, .pgd = swapper_pg_dir, @@ -25,5 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, + .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, INIT_MM_CONTEXT(init_mm) }; -- cgit From d91cfeb0aa79445fcfa9f523a5b57c5e9f4113ec Mon Sep 17 00:00:00 2001 From: RAGHU Halharvi Date: Tue, 17 Jul 2018 15:50:09 +0530 Subject: genirq: Remove redundant NULL pointer check in __free_irq() The NULL pointer check in __free_irq() triggers a 'dereference before NULL pointer check' warning in static code analysis. It turns out that the check is redundant because all callers have a NULL pointer check already. Remove it. Signed-off-by: RAGHU Halharvi Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180717102009.7708-1-raghuhack78@gmail.com --- kernel/irq/manage.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1f8be33572a7..a66c58f91bff 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1570,9 +1570,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); - if (!desc) - return NULL; - mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); -- cgit From c417fbce98722ad7e384caa8ba6f2e7c5f8672d9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Jun 2018 01:40:23 +0900 Subject: kbuild: move bin2c back to scripts/ from scripts/basic/ Commit 8370edea81e3 ("bin2c: move bin2c in scripts/basic") moved bin2c to the scripts/basic/ directory, incorrectly stating "Kexec wants to use bin2c and it wants to use it really early in the build process. See arch/x86/purgatory/ code in later patches." Commit bdab125c9301 ("Revert "kexec/purgatory: Add clean-up for purgatory directory"") and commit d6605b6bbee8 ("x86/build: Remove unnecessary preparation for purgatory") removed the redundant purgatory build magic entirely. That means that the move of bin2c was unnecessary in the first place. fixdep is the only host program that deserves to sit in the scripts/basic/ directory. Signed-off-by: Masahiro Yamada --- arch/powerpc/purgatory/Makefile | 3 +-- arch/s390/purgatory/Makefile | 3 +-- arch/x86/purgatory/Makefile | 3 +-- kernel/Makefile | 2 +- scripts/.gitignore | 1 + scripts/Makefile | 1 + scripts/basic/.gitignore | 1 - scripts/basic/Makefile | 1 - scripts/basic/bin2c.c | 36 ------------------------------------ scripts/bin2c.c | 36 ++++++++++++++++++++++++++++++++++++ security/tomoyo/Makefile | 2 +- 11 files changed, 43 insertions(+), 46 deletions(-) delete mode 100644 scripts/basic/bin2c.c create mode 100644 scripts/bin2c.c (limited to 'kernel') diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile index 30e05decbb4c..4314ba5baf43 100644 --- a/arch/powerpc/purgatory/Makefile +++ b/arch/powerpc/purgatory/Makefile @@ -6,9 +6,8 @@ LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined $(obj)/purgatory.ro: $(obj)/trampoline.o FORCE $(call if_changed,ld) -CMD_BIN2C = $(objtree)/scripts/basic/bin2c quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@ + cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index 1ace023cbdce..445c4603ce02 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -27,9 +27,8 @@ KBUILD_CFLAGS += $(call cc-option,-fno-PIE) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) -CMD_BIN2C = $(objtree)/scripts/basic/bin2c quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@ + cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 81a8e33115ad..3cf302b26332 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -28,9 +28,8 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE targets += kexec-purgatory.c -CMD_BIN2C = $(objtree)/scripts/basic/bin2c quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@ + cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) diff --git a/kernel/Makefile b/kernel/Makefile index 04bc07c2b42a..7a63d567fdb5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -123,7 +123,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/scripts/.gitignore b/scripts/.gitignore index 0442c06eefcb..12d302d70128 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1,6 +1,7 @@ # # Generated files # +bin2c conmakehash kallsyms pnmtologo diff --git a/scripts/Makefile b/scripts/Makefile index 25ab143cbe14..59c21ec49b84 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -10,6 +10,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include +hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index 9528ec9e5adc..a776371a3502 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1,2 +1 @@ fixdep -bin2c diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 0372b33febe5..af49b446f17d 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,7 +9,6 @@ # fixdep: Used to generate dependency information during build process hostprogs-y := fixdep -hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/basic/bin2c.c b/scripts/basic/bin2c.c deleted file mode 100644 index c3d7eef3ad06..000000000000 --- a/scripts/basic/bin2c.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Unloved program to convert a binary on stdin to a C include on stdout - * - * Jan 1999 Matt Mackall - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - */ - -#include - -int main(int argc, char *argv[]) -{ - int ch, total = 0; - - if (argc > 1) - printf("const char %s[] %s=\n", - argv[1], argc > 2 ? argv[2] : ""); - - do { - printf("\t\""); - while ((ch = getchar()) != EOF) { - total++; - printf("\\x%02x", ch); - if (total % 16 == 0) - break; - } - printf("\"\n"); - } while (ch != EOF); - - if (argc > 1) - printf("\t;\n\n#include \n\nconst size_t %s_size = %d;\n", - argv[1], total); - - return 0; -} diff --git a/scripts/bin2c.c b/scripts/bin2c.c new file mode 100644 index 000000000000..c3d7eef3ad06 --- /dev/null +++ b/scripts/bin2c.c @@ -0,0 +1,36 @@ +/* + * Unloved program to convert a binary on stdin to a C include on stdout + * + * Jan 1999 Matt Mackall + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include + +int main(int argc, char *argv[]) +{ + int ch, total = 0; + + if (argc > 1) + printf("const char %s[] %s=\n", + argv[1], argc > 2 ? argv[2] : ""); + + do { + printf("\t\""); + while ((ch = getchar()) != EOF) { + total++; + printf("\\x%02x", ch); + if (total % 16 == 0) + break; + } + printf("\"\n"); + } while (ch != EOF); + + if (argc > 1) + printf("\t;\n\n#include \n\nconst size_t %s_size = %d;\n", + argv[1], total); + + return 0; +} diff --git a/security/tomoyo/Makefile b/security/tomoyo/Makefile index b7c6a7ffc058..cca5a3012fee 100644 --- a/security/tomoyo/Makefile +++ b/security/tomoyo/Makefile @@ -4,7 +4,7 @@ obj-y = audit.o common.o condition.o domain.o environ.o file.o gc.o group.o load targets += builtin-policy.h define do_policy echo "static char tomoyo_builtin_$(1)[] __initdata ="; \ -$(objtree)/scripts/basic/bin2c <$(firstword $(wildcard $(obj)/policy/$(1).conf $(srctree)/$(src)/policy/$(1).conf.default) /dev/null); \ +$(objtree)/scripts/bin2c <$(firstword $(wildcard $(obj)/policy/$(1).conf $(srctree)/$(src)/policy/$(1).conf.default) /dev/null); \ echo ";" endef quiet_cmd_policy = POLICY $@ -- cgit From 3c53776e29f81719efcf8f7a6e30cdf753bee94d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Jan 2018 11:51:04 -0800 Subject: Mark HI and TASKLET softirq synchronous Way back in 4.9, we committed 4cd13c21b207 ("softirq: Let ksoftirqd do its job"), and ever since we've had small nagging issues with it. For example, we've had: 1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred") 8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run") 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()") all of which worked around some of the effects of that commit. The DVB people have also complained that the commit causes excessive USB URB latencies, which seems to be due to the USB code using tasklets to schedule USB traffic. This seems to be an issue mainly when already living on the edge, but waiting for ksoftirqd to handle it really does seem to cause excessive latencies. Now Hanna Hawa reports that this issue isn't just limited to USB URB and DVB, but also causes timeout problems for the Marvell SoC team: "I'm facing kernel panic issue while running raid 5 on sata disks connected to Macchiatobin (Marvell community board with Armada-8040 SoC with 4 ARMv8 cores of CA72) Raid 5 built with Marvell DMA engine and async_tx mechanism (ASYNC_TX_DMA [=y]); the DMA driver (mv_xor_v2) uses a tasklet to clean the done descriptors from the queue" The latency problem causes a panic: mv_xor_v2 f0400000.xor: dma_sync_wait: timeout! Kernel panic - not syncing: async_tx_quiesce: DMA error waiting for transaction We've discussed simply just reverting the original commit entirely, and also much more involved solutions (with per-softirq threads etc). This patch is intentionally stupid and fairly limited, because the issue still remains, and the other solutions either got sidetracked or had other issues. We should probably also consider the timer softirqs to be synchronous and not be delayed to ksoftirqd (since they were the issue with the earlier watchdog problems), but that should be done as a separate patch. This does only the tasklet cases. Reported-and-tested-by: Hanna Hawa Reported-and-tested-by: Josef Griebichler Reported-by: Mauro Carvalho Chehab Cc: Alan Stern Cc: Greg Kroah-Hartman Cc: Eric Dumazet Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/softirq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 900dcfee542c..75ffc1d1a2e0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -79,12 +79,16 @@ static void wakeup_softirqd(void) /* * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness. + * right now. Let ksoftirqd handle this at its own rate, to get fairness, + * unless we're doing some of the synchronous softirqs. */ -static bool ksoftirqd_running(void) +#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) +static bool ksoftirqd_running(unsigned long pending) { struct task_struct *tsk = __this_cpu_read(ksoftirqd); + if (pending & SOFTIRQ_NOW_MASK) + return false; return tsk && (tsk->state == TASK_RUNNING); } @@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running()) + if (pending && !ksoftirqd_running(pending)) do_softirq_own_stack(); local_irq_restore(flags); @@ -355,7 +359,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running()) + if (ksoftirqd_running(local_softirq_pending())) return; if (!force_irqthreads) { -- cgit From 290e44b7dd116cc61cf37b7ca0be13313bb11e37 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 17 Jul 2018 14:45:08 -0400 Subject: audit: use ktime_get_coarse_real_ts64() for timestamps Commit c72051d5778a ("audit: use ktime_get_coarse_ts64() for time access") converted audit's use of current_kernel_time64() to the new ktime_get_coarse_ts64() function. Unfortunately this resulted in incorrect timestamps, e.g. events stamped with the year 1969 despite it being 2018. This patch corrects this by using ktime_get_coarse_real_ts64() just like the current_kernel_time64() wrapper. Fixes: c72051d5778a ("audit: use ktime_get_coarse_ts64() for time access") Reviewed-by: Arnd Bergmann Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e17bc697d11c..2a8058764aa6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1721,7 +1721,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_coarse_ts64(t); + ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f6a0cb32d76e..fb207466e99b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1543,7 +1543,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->in_syscall = 1; context->current_state = state; context->ppid = 0; - ktime_get_coarse_ts64(&context->ctime); + ktime_get_coarse_real_ts64(&context->ctime); } /** -- cgit From d29ab6e1fa21ebc2a8a771015dd9e0e5d4e28dc1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 13 Jul 2018 12:41:10 -0700 Subject: bpf: bpf_prog_array_alloc() should return a generic non-rcu pointer Currently the return type of the bpf_prog_array_alloc() is struct bpf_prog_array __rcu *, which is not quite correct. Obviously, the returned pointer is a generic pointer, which is valid for an indefinite amount of time and it's not shared with anyone else, so there is no sense in marking it as __rcu. This change eliminate the following sparse warnings: kernel/bpf/core.c:1544:31: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1544:31: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1544:31: got void * kernel/bpf/core.c:1548:17: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1548:17: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1548:17: got struct bpf_prog_array * kernel/bpf/core.c:1681:15: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1681:15: expected struct bpf_prog_array *array kernel/bpf/core.c:1681:15: got struct bpf_prog_array [noderef] * Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8827e797ff97..943fb08d8287 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -352,7 +352,7 @@ struct bpf_prog_array { struct bpf_prog *progs[0]; }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e5625d46414..253aa8e79c7b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1538,7 +1538,7 @@ static struct { .null_prog = NULL, }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + -- cgit From 3960f4fd6585608e8cc285d9665821985494e147 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 13 Jul 2018 12:41:11 -0700 Subject: bpf: fix rcu annotations in compute_effective_progs() The progs local variable in compute_effective_progs() is marked as __rcu, which is not correct. This is a local pointer, which is initialized by bpf_prog_array_alloc(), which also now returns a generic non-rcu pointer. The real rcu-protected pointer is *array (array is a pointer to an RCU-protected pointer), so the assignment should be performed using rcu_assign_pointer(). Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3d83ee7df381..badabb0b435c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -95,7 +95,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { - struct bpf_prog_array __rcu *progs; + struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; int cnt = 0; @@ -120,13 +120,12 @@ static int compute_effective_progs(struct cgroup *cgrp, &p->bpf.progs[type], node) { if (!pl->prog) continue; - rcu_dereference_protected(progs, 1)-> - progs[cnt++] = pl->prog; + progs->progs[cnt++] = pl->prog; } p = cgroup_parent(p); } while (p); - *array = progs; + rcu_assign_pointer(*array, progs); return 0; } -- cgit From c23e014a4ba1a9448cbbd74916377f23a7da2fc2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 09:38:59 +0100 Subject: bpf: sockmap: remove redundant pointer sg Pointer sg is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'sg' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf7b6a6dbd1f..3e7a5f622712 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -719,11 +719,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, { bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; - struct scatterlist *sg; int err = 0; - sg = md->sg_data; - rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) -- cgit From 09728266b6f99ab57cd4f84f3eead65b7b65dbf7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:23 -0700 Subject: bpf: offload: rename bpf_offload_dev_match() to bpf_offload_prog_map_match() A set of new API functions exported for the drivers will soon use 'bpf_offload_dev_' as a prefix. Rename the bpf_offload_dev_match() which is internal to the core (used by the verifier) to avoid any confusion. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/offload.c | 2 +- kernel/bpf/verifier.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 943fb08d8287..9b010d9129f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -648,7 +648,7 @@ int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac747d5cf7c6..6184e48703f4 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -468,7 +468,7 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; struct bpf_prog_offload *offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..15d69b278277 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5054,7 +5054,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { + !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } -- cgit From 9fd7c5559165f4c679b40c5e6ad442955832dfad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:24 -0700 Subject: bpf: offload: aggregate offloads per-device Currently we have two lists of offloaded objects - programs and maps. Netdevice unregister notifier scans those lists to orphan objects associated with device being unregistered. This puts unnecessary (even if negligible) burden on all netdev unregister calls in BPF- -enabled kernel. The lists of objects may potentially get long making the linear scan even more problematic. There haven't been complaints about this mechanisms so far, but it is suboptimal. Instead of relying on notifiers, make the few BPF-capable drivers register explicitly for BPF offloads. The programs and maps will now be collected per-device not on a global list, and only scanned for removal when driver unregisters from BPF offloads. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 13 +++ drivers/net/netdevsim/bpf.c | 7 ++ include/linux/bpf.h | 3 + kernel/bpf/offload.c | 142 +++++++++++++++++--------- 4 files changed, 119 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index b95b94d008cf..dee039ada75c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -404,6 +404,16 @@ err_release_free: return -EINVAL; } +static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev) +{ + return bpf_offload_dev_netdev_register(netdev); +} + +static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev) +{ + bpf_offload_dev_netdev_unregister(netdev); +} + static int nfp_bpf_init(struct nfp_app *app) { struct nfp_app_bpf *bpf; @@ -466,6 +476,9 @@ const struct nfp_app_type app_bpf = { .extra_cap = nfp_bpf_extra_cap, + .ndo_init = nfp_bpf_ndo_init, + .ndo_uninit = nfp_bpf_ndo_uninit, + .vnic_alloc = nfp_bpf_vnic_alloc, .vnic_free = nfp_bpf_vnic_free, diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 357f9e62f306..c4a2829e0e1f 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -582,6 +582,8 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) int nsim_bpf_init(struct netdevsim *ns) { + int err; + if (ns->sdev->refcnt == 1) { INIT_LIST_HEAD(&ns->sdev->bpf_bound_progs); INIT_LIST_HEAD(&ns->sdev->bpf_bound_maps); @@ -592,6 +594,10 @@ int nsim_bpf_init(struct netdevsim *ns) return -ENOMEM; } + err = bpf_offload_dev_netdev_register(ns->netdev); + if (err) + return err; + debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, &ns->bpf_offloaded_id); @@ -625,6 +631,7 @@ void nsim_bpf_uninit(struct netdevsim *ns) WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); + bpf_offload_dev_netdev_unregister(ns->netdev); if (ns->sdev->refcnt == 1) { WARN_ON(!list_empty(&ns->sdev->bpf_bound_progs)); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9b010d9129f3..aa2e834a122b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -650,6 +650,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); +int bpf_offload_dev_netdev_register(struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct net_device *netdev); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6184e48703f4..cd64a26807aa 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -18,19 +18,37 @@ #include #include #include +#include #include #include #include +#include #include #include -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members +/* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); + +struct bpf_offload_netdev { + struct rhash_head l; + struct net_device *netdev; + struct list_head progs; + struct list_head maps; +}; + +static const struct rhashtable_params offdevs_params = { + .nelem_hint = 4, + .key_len = sizeof(struct net_device *), + .key_offset = offsetof(struct bpf_offload_netdev, netdev), + .head_offset = offsetof(struct bpf_offload_netdev, l), + .automatic_shrinking = true, +}; + +static struct rhashtable offdevs; +static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -41,8 +59,19 @@ static int bpf_dev_offload_check(struct net_device *netdev) return 0; } +static struct bpf_offload_netdev * +bpf_offload_find_netdev(struct net_device *netdev) +{ + lockdep_assert_held(&bpf_devs_lock); + + if (!offdevs_inited) + return NULL; + return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { + struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; @@ -66,12 +95,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { + ondev = bpf_offload_find_netdev(offload->netdev); + if (!ondev) { err = -EINVAL; goto err_unlock; } prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); @@ -294,6 +324,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; + struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; @@ -316,11 +347,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (err) goto err_unlock; + ondev = bpf_offload_find_netdev(offmap->netdev); + if (!ondev) { + err = -EINVAL; + goto err_unlock; + } + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; - list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); @@ -489,56 +526,69 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -static void bpf_offload_orphan_all_progs(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct net_device *netdev) { - struct bpf_prog_offload *offload, *tmp; + struct bpf_offload_netdev *ondev; + int err; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); -} + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return err; + offdevs_inited = true; + } + up_write(&bpf_devs_lock); -static void bpf_offload_orphan_all_maps(struct net_device *netdev) -{ - struct bpf_offloaded_map *offmap, *tmp; + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; + + ondev->netdev = netdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + down_write(&bpf_devs_lock); + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_unlock_free; + } - list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + return 0; + +err_unlock_free: + up_write(&bpf_devs_lock); + kfree(ondev); + return err; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) +void bpf_offload_dev_netdev_unregister(struct net_device *netdev) { - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + struct bpf_offload_netdev *ondev; ASSERT_RTNL(); - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; - - down_write(&bpf_devs_lock); - bpf_offload_orphan_all_progs(netdev); - bpf_offload_orphan_all_maps(netdev); - up_write(&bpf_devs_lock); - break; - default: - break; - } - return NOTIFY_OK; -} + down_write(&bpf_devs_lock); + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + goto unlock; -static struct notifier_block bpf_offload_notifier = { - .notifier_call = bpf_offload_notification, -}; + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); -static int __init bpf_offload_init(void) -{ - register_netdevice_notifier(&bpf_offload_notifier); - return 0; -} + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); -subsys_initcall(bpf_offload_init); + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +unlock: + up_write(&bpf_devs_lock); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -- cgit From 602144c224604f1cbff02ee2d1cf46825269ecbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:25 -0700 Subject: bpf: offload: keep the offload state per-ASIC Create a higher-level entity to represent a device/ASIC to allow programs and maps to be shared between device ports. The extra work is required to make sure we don't destroy BPF objects as soon as the netdev for which they were loaded gets destroyed, as other ports may still be using them. When netdev goes away all of its BPF objects will be moved to other netdevs of the device, and only destroyed when last netdev is unregistered. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 14 ++++- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 ++ drivers/net/netdevsim/bpf.c | 17 +++++- drivers/net/netdevsim/netdevsim.h | 3 + include/linux/bpf.h | 9 ++- kernel/bpf/offload.c | 81 +++++++++++++++++++++------ 6 files changed, 104 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index dee039ada75c..458f49235d06 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -406,12 +406,16 @@ err_release_free: static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev) { - return bpf_offload_dev_netdev_register(netdev); + struct nfp_app_bpf *bpf = app->priv; + + return bpf_offload_dev_netdev_register(bpf->bpf_dev, netdev); } static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev) { - bpf_offload_dev_netdev_unregister(netdev); + struct nfp_app_bpf *bpf = app->priv; + + bpf_offload_dev_netdev_unregister(bpf->bpf_dev, netdev); } static int nfp_bpf_init(struct nfp_app *app) @@ -437,6 +441,11 @@ static int nfp_bpf_init(struct nfp_app *app) if (err) goto err_free_neutral_maps; + bpf->bpf_dev = bpf_offload_dev_create(); + err = PTR_ERR_OR_ZERO(bpf->bpf_dev); + if (err) + goto err_free_neutral_maps; + return 0; err_free_neutral_maps: @@ -455,6 +464,7 @@ static void nfp_bpf_clean(struct nfp_app *app) { struct nfp_app_bpf *bpf = app->priv; + bpf_offload_dev_destroy(bpf->bpf_dev); WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); WARN_ON(!list_empty(&bpf->map_list)); WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 9845c1a2d4c2..bec935468f90 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -110,6 +110,8 @@ enum pkt_vec { * struct nfp_app_bpf - bpf app priv structure * @app: backpointer to the app * + * @bpf_dev: BPF offload device handle + * * @tag_allocator: bitmap of control message tags in use * @tag_alloc_next: next tag bit to allocate * @tag_alloc_last: next tag bit to be freed @@ -150,6 +152,8 @@ enum pkt_vec { struct nfp_app_bpf { struct nfp_app *app; + struct bpf_offload_dev *bpf_dev; + DECLARE_BITMAP(tag_allocator, U16_MAX + 1); u16 tag_alloc_next; u16 tag_alloc_last; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index c4a2829e0e1f..9eab29f67a0e 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -592,11 +592,16 @@ int nsim_bpf_init(struct netdevsim *ns) debugfs_create_dir("bpf_bound_progs", ns->sdev->ddir); if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs)) return -ENOMEM; + + ns->sdev->bpf_dev = bpf_offload_dev_create(); + err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev); + if (err) + return err; } - err = bpf_offload_dev_netdev_register(ns->netdev); + err = bpf_offload_dev_netdev_register(ns->sdev->bpf_dev, ns->netdev); if (err) - return err; + goto err_destroy_bdev; debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, &ns->bpf_offloaded_id); @@ -624,6 +629,11 @@ int nsim_bpf_init(struct netdevsim *ns) &ns->bpf_map_accept); return 0; + +err_destroy_bdev: + if (ns->sdev->refcnt == 1) + bpf_offload_dev_destroy(ns->sdev->bpf_dev); + return err; } void nsim_bpf_uninit(struct netdevsim *ns) @@ -631,10 +641,11 @@ void nsim_bpf_uninit(struct netdevsim *ns) WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); - bpf_offload_dev_netdev_unregister(ns->netdev); + bpf_offload_dev_netdev_unregister(ns->sdev->bpf_dev, ns->netdev); if (ns->sdev->refcnt == 1) { WARN_ON(!list_empty(&ns->sdev->bpf_bound_progs)); WARN_ON(!list_empty(&ns->sdev->bpf_bound_maps)); + bpf_offload_dev_destroy(ns->sdev->bpf_dev); } } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 98f26fa1e671..02be199eb005 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -27,6 +27,7 @@ #define NSIM_EA(extack, msg) NL_SET_ERR_MSG_MOD((extack), msg) struct bpf_prog; +struct bpf_offload_dev; struct dentry; struct nsim_vf_config; @@ -36,6 +37,8 @@ struct netdevsim_shared_dev { struct dentry *ddir; + struct bpf_offload_dev *bpf_dev; + struct dentry *ddir_bpf_bound_progs; u32 prog_id_gen; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa2e834a122b..913465e45062 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,6 +85,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { @@ -650,8 +651,12 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); -int bpf_offload_dev_netdev_register(struct net_device *netdev); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev); +struct bpf_offload_dev *bpf_offload_dev_create(void); +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cd64a26807aa..925575f64ff1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,11 +32,17 @@ */ static DECLARE_RWSEM(bpf_devs_lock); +struct bpf_offload_dev { + struct list_head netdevs; +}; + struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; + struct bpf_offload_dev *offdev; struct list_head progs; struct list_head maps; + struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { @@ -526,25 +532,18 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -int bpf_offload_dev_netdev_register(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) - return err; - offdevs_inited = true; - } - up_write(&bpf_devs_lock); - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; + ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); @@ -555,6 +554,7 @@ int bpf_offload_dev_netdev_register(struct net_device *netdev) goto err_unlock_free; } + list_add(&ondev->offdev_netdevs, &offdev->netdevs); up_write(&bpf_devs_lock); return 0; @@ -565,11 +565,12 @@ err_unlock_free: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev) +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) { + struct bpf_offload_netdev *ondev, *altdev; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; - struct bpf_offload_netdev *ondev; ASSERT_RTNL(); @@ -579,11 +580,26 @@ void bpf_offload_dev_netdev_unregister(struct net_device *netdev) goto unlock; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); - - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); + list_del(&ondev->offdev_netdevs); + + /* Try to move the objects to another netdev of the device */ + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); @@ -592,3 +608,34 @@ unlock: up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); + +struct bpf_offload_dev *bpf_offload_dev_create(void) +{ + struct bpf_offload_dev *offdev; + int err; + + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return ERR_PTR(err); + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); + if (!offdev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&offdev->netdevs); + + return offdev; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_create); + +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) +{ + WARN_ON(!list_empty(&offdev->netdevs)); + kfree(offdev); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); -- cgit From fd4f227dea0f24d89f52f7c4eb3207f84ddcbcbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:26 -0700 Subject: bpf: offload: allow program and map sharing per-ASIC Allow programs and maps to be re-used across different netdevs, as long as they belong to the same struct bpf_offload_dev. Update the bpf_offload_prog_map_match() helper for the verifier and export a new helper for the drivers to use when checking programs at attachment time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/offload.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 913465e45062..5b5ad95cf339 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -657,6 +657,7 @@ int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 925575f64ff1..177a52436394 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -511,22 +511,50 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +static bool __bpf_offload_dev_match(struct bpf_prog *prog, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap; + struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; - bool ret; if (!bpf_prog_is_dev_bound(prog->aux)) return false; - if (!bpf_map_is_dev_bound(map)) - return bpf_map_offload_neutral(map); - down_read(&bpf_devs_lock); offload = prog->aux->offload; + if (!offload) + return false; + if (offload->netdev == netdev) + return true; + + ondev1 = bpf_offload_find_netdev(offload->netdev); + ondev2 = bpf_offload_find_netdev(netdev); + + return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) +{ + bool ret; + + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, netdev); + up_read(&bpf_devs_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_match); + +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + bool ret; + + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); - ret = offload && offload->netdev == offmap->netdev; + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; -- cgit From baa2a4fdd525c8c4b0f704d20457195b29437839 Mon Sep 17 00:00:00 2001 From: Ronny Chevalier Date: Wed, 11 Jul 2018 14:39:37 +0200 Subject: audit: fix use-after-free in audit_add_watch audit_add_watch stores locally krule->watch without taking a reference on watch. Then, it calls audit_add_to_parent, and uses the watch stored locally. Unfortunately, it is possible that audit_add_to_parent updates krule->watch. When it happens, it also drops a reference of watch which could free the watch. How to reproduce (with KASAN enabled): auditctl -w /etc/passwd -F success=0 -k test_passwd auditctl -w /etc/passwd -F success=1 -k test_passwd2 The second call to auditctl triggers the use-after-free, because audit_to_parent updates krule->watch to use a previous existing watch and drops the reference to the newly created watch. To fix the issue, we grab a reference of watch and we release it at the end of the function. Signed-off-by: Ronny Chevalier Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_watch.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6f249bdf2d84..787c7afdf829 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -420,6 +420,13 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) struct path parent_path; int h, ret = 0; + /* + * When we will be calling audit_add_to_parent, krule->watch might have + * been updated and watch might have been freed. + * So we need to keep a reference of watch. + */ + audit_get_watch(watch); + mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ @@ -428,8 +435,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) + if (ret) { + audit_put_watch(watch); return ret; + } /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); @@ -447,6 +456,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) *list = &audit_inode_hash[h]; error: path_put(&parent_path); + audit_put_watch(watch); return ret; } -- cgit From 3eca993740b8eb40f514b90b1877a4dbcf0a6710 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:34 -0400 Subject: timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset() If architecture does not support exact boot time, it is challenging to estimate boot time without having a reference to the current persistent clock value. Yet, it cannot read the persistent clock time again, because this may lead to math discrepancies with the caller of read_boot_clock64() who have read the persistent clock at a different time. This is why it is better to provide two values simultaneously: the persistent clock value, and the boot time. Replace read_boot_clock64() with: read_persistent_wall_and_boot_offset(wall_time, boot_offset) Where wall_time is returned by read_persistent_clock() And boot_offset is wall_time - boot time, which defaults to 0. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-16-pasha.tatashin@oracle.com --- include/linux/timekeeping.h | 3 ++- kernel/time/timekeeping.c | 59 +++++++++++++++++++++++---------------------- 2 files changed, 32 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 86bc2026efce..686bc27acef0 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -243,7 +243,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); -extern void read_boot_clock64(struct timespec64 *ts); +void read_persistent_clock_and_boot_offset(struct timespec64 *wall_clock, + struct timespec64 *boot_offset); extern int update_persistent_clock64(struct timespec64 now); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4786df904c22..cb738f825c12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1496,18 +1497,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock64 - Return time of the system start. + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. * * Weak dummy function for arches that do not yet support it. - * Function to read the exact time the system has been started. - * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. + * wall_time - current time as returned by persistent clock + * boot_offset - offset that is defined as wall_time - boot_time + * default to 0. */ -void __weak read_boot_clock64(struct timespec64 *ts) +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { - ts->tv_sec = 0; - ts->tv_nsec = 0; + read_persistent_clock64(wall_time); + *boot_offset = (struct timespec64){0}; } /* Flag for if timekeeping_resume() has injected sleeptime */ @@ -1521,28 +1524,29 @@ static bool persistent_clock_exists; */ void __init timekeeping_init(void) { + struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec64 now, boot, tmp; - - read_persistent_clock64(&now); - if (!timespec64_valid_strict(&now)) { - pr_warn("WARNING: Persistent clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - now.tv_sec = 0; - now.tv_nsec = 0; - } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exists = true; - read_boot_clock64(&boot); - if (!timespec64_valid_strict(&boot)) { - pr_warn("WARNING: Boot clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - boot.tv_sec = 0; - boot.tv_nsec = 0; + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_strict(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; } + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); ntp_init(); @@ -1552,13 +1556,10 @@ void __init timekeeping_init(void) clock->enable(clock); tk_setup_internals(tk, clock); - tk_set_xtime(tk, &now); + tk_set_xtime(tk, &wall_time); tk->raw_sec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(tk); - set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); - tk_set_wall_to_mono(tk, tmp); + tk_set_wall_to_mono(tk, wall_to_mono); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); -- cgit From 4b1b7f8054896cee25669f6cea7cb6dd17f508f7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:35 -0400 Subject: timekeeping: Default boot time offset to local_clock() read_persistent_wall_and_boot_offset() is called during boot to read both the persistent clock and also return the offset between the boot time and the value of persistent clock. Change the default boot_offset from zero to local_clock() so architectures, that do not have a dedicated boot_clock but have early sched_clock(), such as SPARCv9, x86, and possibly more will benefit from this change by getting a better and more consistent estimate of the boot time without need for an arch specific implementation. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-17-pasha.tatashin@oracle.com --- kernel/time/timekeeping.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cb738f825c12..30d7f64ffc87 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1503,14 +1503,17 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) * Weak dummy function for arches that do not yet support it. * wall_time - current time as returned by persistent clock * boot_offset - offset that is defined as wall_time - boot_time - * default to 0. + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); - *boot_offset = (struct timespec64){0}; + *boot_offset = ns_to_timespec64(local_clock()); } /* Flag for if timekeeping_resume() has injected sleeptime */ -- cgit From 5d2a4e91a541cb04d20d11602f0f9340291322ac Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:41 -0400 Subject: sched/clock: Move sched clock initialization and merge with generic clock sched_clock_postinit() initializes a generic clock on systems where no other clock is provided. This function may be called only after timekeeping_init(). Rename sched_clock_postinit to generic_clock_inti() and call it from sched_clock_init(). Move the call for sched_clock_init() until after time_init(). Suggested-by: Peter Zijlstra Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-23-pasha.tatashin@oracle.com --- include/linux/sched_clock.h | 5 ++--- init/main.c | 4 ++-- kernel/sched/clock.c | 27 +++++++++++++++++---------- kernel/sched/core.c | 1 - kernel/time/sched_clock.c | 2 +- 5 files changed, 22 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 411b52e424e1..abe28d5cb3f4 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -9,17 +9,16 @@ #define LINUX_SCHED_CLOCK #ifdef CONFIG_GENERIC_SCHED_CLOCK -extern void sched_clock_postinit(void); +extern void generic_sched_clock_init(void); extern void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate); #else -static inline void sched_clock_postinit(void) { } +static inline void generic_sched_clock_init(void) { } static inline void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { - ; } #endif diff --git a/init/main.c b/init/main.c index 3b4ada11ed52..162d931c9511 100644 --- a/init/main.c +++ b/init/main.c @@ -79,7 +79,7 @@ #include #include #include -#include +#include #include #include #include @@ -642,7 +642,7 @@ asmlinkage __visible void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); - sched_clock_postinit(); + sched_clock_init(); printk_safe_init(); perf_event_init(); profile_init(); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 10c83e73837a..0e9dbb2d9aea 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,6 +53,7 @@ * */ #include "sched.h" +#include /* * Scheduler clock - returns current time in nanosec units. @@ -68,11 +69,6 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * We must start with !__sched_clock_stable because the unstable -> stable @@ -199,6 +195,15 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } +static void __sched_clock_gtod_offset(void) +{ + __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns(); +} + +void __init sched_clock_init(void) +{ + sched_clock_running = 1; +} /* * We run this as late_initcall() such that it runs after all built-in drivers, * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. @@ -385,8 +390,6 @@ void sched_clock_tick(void) void sched_clock_tick_stable(void) { - u64 gtod, clock; - if (!sched_clock_stable()) return; @@ -398,9 +401,7 @@ void sched_clock_tick_stable(void) * TSC to be unstable, any computation will be computing crap. */ local_irq_disable(); - gtod = ktime_get_ns(); - clock = sched_clock(); - __gtod_offset = (clock + __sched_clock_offset) - gtod; + __sched_clock_gtod_offset(); local_irq_enable(); } @@ -434,6 +435,12 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +void __init sched_clock_init(void) +{ + sched_clock_running = 1; + generic_sched_clock_init(); +} + u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..552406e9713b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5954,7 +5954,6 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - sched_clock_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 2d8f05aad442..cbc72c2c1fca 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -237,7 +237,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -void __init sched_clock_postinit(void) +void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, -- cgit From 857baa87b6422bcfb84ed3631d6839920cb5b09d Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:42 -0400 Subject: sched/clock: Enable sched clock early Allow sched_clock() to be used before schec_clock_init() is called. This provides a way to get early boot timestamps on machines with unstable clocks. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-24-pasha.tatashin@oracle.com --- init/main.c | 2 +- kernel/sched/clock.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/main.c b/init/main.c index 162d931c9511..ff0a24170b95 100644 --- a/init/main.c +++ b/init/main.c @@ -642,7 +642,6 @@ asmlinkage __visible void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); - sched_clock_init(); printk_safe_init(); perf_event_init(); profile_init(); @@ -697,6 +696,7 @@ asmlinkage __visible void __init start_kernel(void) acpi_early_init(); if (late_time_init) late_time_init(); + sched_clock_init(); calibrate_delay(); pid_idr_init(); anon_vma_init(); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 0e9dbb2d9aea..422cd63f8f17 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -202,7 +202,25 @@ static void __sched_clock_gtod_offset(void) void __init sched_clock_init(void) { + unsigned long flags; + + /* + * Set __gtod_offset such that once we mark sched_clock_running, + * sched_clock_tick() continues where sched_clock() left off. + * + * Even if TSC is buggered, we're still UP at this point so it + * can't really be out of sync. + */ + local_irq_save(flags); + __sched_clock_gtod_offset(); + local_irq_restore(flags); + sched_clock_running = 1; + + /* Now that sched_clock_running is set adjust scd */ + local_irq_save(flags); + sched_clock_tick(); + local_irq_restore(flags); } /* * We run this as late_initcall() such that it runs after all built-in drivers, @@ -356,7 +374,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock() + __sched_clock_offset; if (unlikely(!sched_clock_running)) - return 0ull; + return sched_clock(); preempt_disable_notrace(); scd = cpu_sdc(cpu); -- cgit From 46457ea464f5341d1f9dad8dd213805d45f7f117 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:43 -0400 Subject: sched/clock: Use static key for sched_clock_running sched_clock_running may be read every time sched_clock_cpu() is called. Yet, this variable is updated only twice during boot, and never changes again, therefore it is better to make it a static key. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-25-pasha.tatashin@oracle.com --- kernel/sched/clock.c | 16 ++++++++-------- kernel/sched/debug.c | 2 -- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 422cd63f8f17..c5c47ad3f386 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -67,7 +67,7 @@ unsigned long long __weak sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -__read_mostly int sched_clock_running; +static DEFINE_STATIC_KEY_FALSE(sched_clock_running); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* @@ -191,7 +191,7 @@ void clear_sched_clock_stable(void) smp_mb(); /* matches sched_clock_init_late() */ - if (sched_clock_running == 2) + if (static_key_count(&sched_clock_running.key) == 2) __clear_sched_clock_stable(); } @@ -215,7 +215,7 @@ void __init sched_clock_init(void) __sched_clock_gtod_offset(); local_irq_restore(flags); - sched_clock_running = 1; + static_branch_inc(&sched_clock_running); /* Now that sched_clock_running is set adjust scd */ local_irq_save(flags); @@ -228,7 +228,7 @@ void __init sched_clock_init(void) */ static int __init sched_clock_init_late(void) { - sched_clock_running = 2; + static_branch_inc(&sched_clock_running); /* * Ensure that it is impossible to not do a static_key update. * @@ -373,7 +373,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -396,7 +396,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -455,13 +455,13 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); void __init sched_clock_init(void) { - sched_clock_running = 1; + static_branch_inc(&sched_clock_running); generic_sched_clock_init(); } u64 sched_clock_cpu(int cpu) { - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e593b4118578..b0212f489a33 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -623,8 +623,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) #undef PU } -extern __read_mostly int sched_clock_running; - static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); -- cgit From 36fc3c8c282c01ad1570bd864de52f128d731b75 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 19 Jul 2018 22:14:31 -0700 Subject: bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h This patch shrinks the BTF_INT_BITS() mask. The current btf_int_check_meta() ensures the nr_bits of an integer cannot exceed 64. Hence, it is mostly an uapi cleanup. The actual btf usage (i.e. seq_show()) is also modified to use u8 instead of u16. The verification (e.g. btf_int_check_meta()) path stays as is to deal with invalid BTF situation. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 2 +- kernel/bpf/btf.c | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 0b5ddbe135a4..972265f32871 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -76,7 +76,7 @@ struct btf_type { */ #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED (1 << 0) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e016ac3afa24..9704934252b3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) */ static bool btf_type_int_is_regular(const struct btf_type *t) { - u16 nr_bits, nr_bytes; + u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); @@ -993,12 +993,16 @@ static void btf_int_bits_seq_show(const struct btf *btf, { u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); - u16 nr_bits = BTF_INT_BITS(int_data); - u16 total_bits_offset; - u16 nr_copy_bytes; - u16 nr_copy_bits; + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + u8 nr_copy_bytes; + u8 nr_copy_bits; u64 print_num; + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); @@ -1028,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; - u32 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bits = BTF_INT_BITS(int_data); if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { -- cgit From 9407f5a7ee77c631d1e100436132437cf6237e45 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Jul 2018 10:09:11 +0200 Subject: sched/clock: Close a hole in sched_clock_init() All data required for the 'unstable' sched_clock must be set-up _before_ enabling it -- setting sched_clock_running. This includes the __gtod_offset but also a recent scd stamp. Make the gtod-offset update also set the csd stamp -- it requires the same two clock reads _anyway_. This doesn't hurt in the sched_clock_tick_stable() case and ensures sched_clock_init() gets everything set-up before use. Also switch to unconditional IRQ-disable/enable because the static key stuff already requires this is not ran with IRQs disabled. Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Pavel Tatashin Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180720080911.GM2494@hirez.programming.kicks-ass.net --- kernel/sched/clock.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c5c47ad3f386..811a39aca1ce 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -197,13 +197,14 @@ void clear_sched_clock_stable(void) static void __sched_clock_gtod_offset(void) { - __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns(); + struct sched_clock_data *scd = this_scd(); + + __scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; } void __init sched_clock_init(void) { - unsigned long flags; - /* * Set __gtod_offset such that once we mark sched_clock_running, * sched_clock_tick() continues where sched_clock() left off. @@ -211,16 +212,11 @@ void __init sched_clock_init(void) * Even if TSC is buggered, we're still UP at this point so it * can't really be out of sync. */ - local_irq_save(flags); + local_irq_disable(); __sched_clock_gtod_offset(); - local_irq_restore(flags); + local_irq_enable(); static_branch_inc(&sched_clock_running); - - /* Now that sched_clock_running is set adjust scd */ - local_irq_save(flags); - sched_clock_tick(); - local_irq_restore(flags); } /* * We run this as late_initcall() such that it runs after all built-in drivers, -- cgit From 488dee96bb62f0b3d9e678cf42574034d5b033a5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:47 +0000 Subject: kernfs: allow creating kernfs objects with arbitrary uid/gid This change allows creating kernfs files and directories with arbitrary uid/gid instead of always using GLOBAL_ROOT_UID/GID by extending kernfs_create_dir_ns() and kernfs_create_file_ns() with uid/gid arguments. The "simple" kernfs_create_file() and kernfs_create_dir() are left alone and always create objects belonging to the global root. When creating symlinks ownership (uid/gid) is taken from the target kernfs object. Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 +++- fs/kernfs/dir.c | 29 ++++++++++++++++++++++++++--- fs/kernfs/file.c | 8 ++++++-- fs/kernfs/inode.c | 2 +- fs/kernfs/kernfs-internal.h | 2 ++ fs/kernfs/symlink.c | 11 ++++++++++- fs/sysfs/dir.c | 4 +++- fs/sysfs/file.c | 5 +++-- include/linux/kernfs.h | 28 +++++++++++++++++++--------- kernel/cgroup/cgroup.c | 4 +++- 10 files changed, 76 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 749856a2e736..9af1a21265d3 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -146,6 +146,7 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) int ret; kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, rft->kf_ops, rft, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -1503,7 +1504,8 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, struct kernfs_node *kn; int ret = 0; - kn = __kernfs_create_file(parent_kn, name, 0444, 0, + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, &kf_mondata_ops, priv, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index d66cc0777303..4ca0b5c18192 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -619,6 +619,7 @@ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; @@ -661,8 +662,22 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->mode = mode; kn->flags = flags; + if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { + struct iattr iattr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = uid, + .ia_gid = gid, + }; + + ret = __kernfs_setattr(kn, &iattr); + if (ret < 0) + goto err_out3; + } + return kn; + err_out3: + idr_remove(&root->ino_idr, kn->id.ino); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -672,11 +687,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; - kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + kn = __kernfs_new_node(kernfs_root(parent), + name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); kn->parent = parent; @@ -946,6 +963,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); @@ -984,6 +1002,8 @@ void kernfs_destroy_root(struct kernfs_root *root) * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory + * @uid: uid of the new directory + * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * @@ -991,13 +1011,15 @@ void kernfs_destroy_root(struct kernfs_root *root) */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ - kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + kn = kernfs_new_node(parent, name, mode | S_IFDIR, + uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -1028,7 +1050,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, int rc; /* allocate */ - kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2015d8c45e4a..dbf5bc250bfd 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -965,6 +965,8 @@ const struct file_operations kernfs_file_fops = { * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file + * @uid: uid of the file + * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file @@ -975,7 +977,8 @@ const struct file_operations kernfs_file_fops = { */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) @@ -986,7 +989,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, flags = KERNFS_FILE; - kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, + uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 3d73fe9d56e2..80cebcd94c90 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -63,7 +63,7 @@ out_unlock: return ret; } -static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; struct iattr *iattrs; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 0f260dcca177..3d83b114bb08 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -90,6 +90,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c @@ -104,6 +105,7 @@ void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags); struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, unsigned int ino); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 08ccabd7047f..5ffed48f3d0e 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -21,6 +21,7 @@ * @target: target node for the symlink to point to * * Returns the created node on success, ERR_PTR() value on error. + * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, @@ -28,8 +29,16 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, { struct kernfs_node *kn; int error; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; - kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (target->iattr) { + uid = target->iattr->ia_iattr.ia_uid; + gid = target->iattr->ia_iattr.ia_gid; + } + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid, + KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 58eba92a0e41..e39b884f0867 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -52,7 +52,9 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) return -ENOENT; kn = kernfs_create_dir_ns(parent, kobject_name(kobj), - S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + S_IRWXU | S_IRUGO | S_IXUGO, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 5c13f29bfcdb..513fa691ecbd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -302,8 +302,9 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, - (void *)attr, ns, key); + kn = __kernfs_create_file(parent, attr->name, + mode & 0777, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ab25c8b6d9e3..814643f7ee52 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -15,6 +15,7 @@ #include #include #include +#include #include struct file; @@ -325,12 +326,14 @@ void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, - const char *name, - umode_t mode, loff_t size, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); @@ -415,12 +418,14 @@ static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, - umode_t mode, void *priv, const void *ns) + umode_t mode, kuid_t uid, kgid_t gid, + void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } @@ -498,12 +503,15 @@ static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { - return kernfs_create_dir_ns(parent, name, mode, priv, NULL); + return kernfs_create_dir_ns(parent, name, mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + priv, NULL); } static inline struct kernfs_node * kernfs_create_file_ns(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns) { struct lock_class_key *key = NULL; @@ -511,15 +519,17 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name, #ifdef CONFIG_DEBUG_LOCK_ALLOC key = (struct lock_class_key *)&ops->lockdep_key; #endif - return __kernfs_create_file(parent, name, mode, size, ops, priv, ns, - key); + return __kernfs_create_file(parent, name, mode, uid, gid, + size, ops, priv, ns, key); } static inline struct kernfs_node * kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv) { - return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL); + return kernfs_create_file_ns(parent, name, mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + size, ops, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 077370bf8964..35cf3d71f8aa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3557,7 +3557,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), - cgroup_file_mode(cft), 0, cft->kf_ops, cft, + cgroup_file_mode(cft), + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); -- cgit From 1fb53567a3633740aac8761eb7023dc5671f0edb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 May 2017 13:45:14 -0500 Subject: pids: Move task_pid_type into sched/signal.h The function is general and inline so there is no need to hide it inside of exit.c Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 8 ++++++++ kernel/exit.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 113d1ad1ced7..d8ef0a3d2e7e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -556,6 +556,14 @@ extern bool current_is_single_threaded(void); typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/kernel/exit.c b/kernel/exit.c index c3c7ac560114..16432428fc6c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1001,14 +1001,6 @@ struct wait_opts { int notask_error; }; -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || -- cgit From 7a36094d61bfe9843de5484ff0140227983ac5d5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2017 12:45:33 -0500 Subject: pids: Compute task_tgid using signal->leader_pid The cost is the the same and this removes the need to worry about complications that come from de_thread and group_leader changing. __task_pid_nr_ns has been updated to take advantage of this change. Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/fsys.S | 8 ++++---- drivers/platform/x86/thinkpad_acpi.c | 1 + fs/fuse/file.c | 1 + fs/notify/fanotify/fanotify.c | 1 + include/linux/sched.h | 5 ----- include/linux/sched/signal.h | 5 +++++ include/net/scm.h | 1 + kernel/pid.c | 15 ++++++++------- 9 files changed, 22 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f4db2168d1b8..f5433bb7f04a 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -50,7 +50,6 @@ void foo(void) DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); - DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); @@ -68,6 +67,7 @@ void foo(void) DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, group_stop_count)); DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); + DEFINE(IA64_SIGNAL_LEADER_PID_OFFSET, offsetof (struct signal_struct, leader_pid)); BLANK(); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index fe742ffafc7a..eaf5a0d6f3e0 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -62,16 +62,16 @@ ENTRY(fsys_getpid) .prologue .altrp b6 .body - add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 + add r17=IA64_TASK_SIGNAL_OFFSET,r16 ;; - ld8 r17=[r17] // r17 = current->group_leader + ld8 r17=[r17] // r17 = current->signal add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; ld4 r9=[r9] - add r17=IA64_TASK_TGIDLINK_OFFSET,r17 + add r17=IA64_SIGNAL_LEADER_PID_OFFSET,r17 ;; and r9=TIF_ALLWORK_MASK,r9 - ld8 r17=[r17] // r17 = current->group_leader->pids[PIDTYPE_PID].pid + ld8 r17=[r17] // r17 = current->signal->leader_pid ;; add r8=IA64_PID_LEVEL_OFFSET,r17 ;; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index cae9b0595692..d556e95c532c 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a201fb0ac64f..b00a3f126a89 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f90842efea13..6e828cb82e5e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 87bf02d93a27..a461ff89a3af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,11 +1202,6 @@ static inline struct pid *task_pid(struct task_struct *task) return task->pids[PIDTYPE_PID].pid; } -static inline struct pid *task_tgid(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PID].pid; -} - /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d8ef0a3d2e7e..b95a272c1ab5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -564,6 +564,11 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type) return task->pids[type].pid; } +static inline struct pid *task_tgid(struct task_struct *task) +{ + return task->signal->leader_pid; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/include/net/scm.h b/include/net/scm.h index 903771c8d4e3..1ce365f4c256 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* Well, we should have at least one descriptor open * to accept passed FDs 8) diff --git a/kernel/pid.c b/kernel/pid.c index 157fe4b19971..d0de2b59f86f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -421,13 +421,14 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) { - if (type == __PIDTYPE_TGID) - type = PIDTYPE_PID; - - task = task->group_leader; - } - nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else if (type == __PIDTYPE_TGID) + pid = task_tgid(task); + else + pid = rcu_dereference(task->group_leader->pids[type].pid); + nr = pid_nr_ns(pid, ns); } rcu_read_unlock(); -- cgit From 2c4704756cab7cfa031ada4dab361562f0e357c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2017 13:06:43 -0500 Subject: pids: Move the pgrp and session pid pointers from task_struct to signal_struct To access these fields the code always has to go to group leader so going to signal struct is no loss and is actually a fundamental simplification. This saves a little bit of memory by only allocating the pid pointer array once instead of once for every thread, and even better this removes a few potential races caused by the fact that group_leader can be changed by de_thread, while signal_struct can not. Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/fsys.S | 4 ++-- fs/autofs/autofs_i.h | 1 + include/linux/init_task.h | 9 --------- include/linux/pid.h | 8 +------- include/linux/sched.h | 22 ++++----------------- include/linux/sched/signal.h | 26 +++++++++++++++++++++--- init/init_task.c | 11 ++++++----- kernel/fork.c | 23 ++++++++++++++++----- kernel/pid.c | 45 +++++++++++++++++++++--------------------- 10 files changed, 78 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f5433bb7f04a..c1f8a57855af 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -50,7 +50,7 @@ void foo(void) DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); - DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); + DEFINE(IA64_TASK_THREAD_PID_OFFSET,offsetof (struct task_struct, thread_pid)); DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index eaf5a0d6f3e0..e85ebdac678b 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -96,11 +96,11 @@ ENTRY(fsys_set_tid_address) .altrp b6 .body add r9=TI_FLAGS+IA64_TASK_SIZE,r16 - add r17=IA64_TASK_TGIDLINK_OFFSET,r16 + add r17=IA64_TASK_THREAD_PID_OFFSET,r16 ;; ld4 r9=[r9] tnat.z p6,p7=r32 // check argument register for being NaT - ld8 r17=[r17] // r17 = current->pids[PIDTYPE_PID].pid + ld8 r17=[r17] // r17 = current->thread_pid ;; and r9=TIF_ALLWORK_MASK,r9 add r8=IA64_PID_LEVEL_OFFSET,r17 diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 9400a9f6318a..502812289850 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a454b8aeb938..a7083a45a26c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -46,15 +46,6 @@ extern struct cred init_cred; #define INIT_CPU_TIMERS(s) #endif -#define INIT_PID_LINK(type) \ -{ \ - .node = { \ - .next = NULL, \ - .pprev = NULL, \ - }, \ - .pid = &init_struct_pid, \ -} - #define INIT_TASK_COMM "swapper" /* Attach to the init_task data structure for proper alignment */ diff --git a/include/linux/pid.h b/include/linux/pid.h index 7633d55d9a24..3d4c504dcc8c 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -67,12 +67,6 @@ struct pid extern struct pid init_struct_pid; -struct pid_link -{ - struct hlist_node node; - struct pid *pid; -}; - static inline struct pid *get_pid(struct pid *pid) { if (pid) @@ -177,7 +171,7 @@ pid_t pid_vnr(struct pid *pid); do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ - &(pid)->tasks[type], pids[type].node) { + &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to diff --git a/include/linux/sched.h b/include/linux/sched.h index a461ff89a3af..445bdf5b1f64 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,7 +775,8 @@ struct task_struct { struct list_head ptrace_entry; /* PID/PID hash table linkage. */ - struct pid_link pids[PIDTYPE_MAX]; + struct pid *thread_pid; + struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; @@ -1199,22 +1200,7 @@ struct task_struct { static inline struct pid *task_pid(struct task_struct *task) { - return task->pids[PIDTYPE_PID].pid; -} - -/* - * Without tasklist or RCU lock it is not safe to dereference - * the result of task_pgrp/task_session even if task == current, - * we can race with another thread doing sys_setsid/sys_setpgid. - */ -static inline struct pid *task_pgrp(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PGID].pid; -} - -static inline struct pid *task_session(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_SID].pid; + return task->thread_pid; } /* @@ -1263,7 +1249,7 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) */ static inline int pid_alive(const struct task_struct *p) { - return p->pids[PIDTYPE_PID].pid != NULL; + return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b95a272c1ab5..2dcded16eb1e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -146,7 +146,9 @@ struct signal_struct { #endif + /* PID/PID hash table linkage. */ struct pid *leader_pid; + struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; @@ -559,9 +561,12 @@ void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else + pid = task->signal->pids[type]; + return pid; } static inline struct pid *task_tgid(struct task_struct *task) @@ -569,6 +574,21 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->signal->leader_pid; } +/* + * Without tasklist or RCU lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ +static inline struct pid *task_pgrp(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_PGID]; +} + +static inline struct pid *task_session(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_SID]; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/init/init_task.c b/init/init_task.c index 7914ffb8dc73..db12a61259f1 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -34,6 +34,11 @@ static struct signal_struct init_signals = { #endif INIT_CPU_TIMERS(init_signals) .leader_pid = &init_struct_pid, + .pids = { + [PIDTYPE_PID] = &init_struct_pid, + [PIDTYPE_PGID] = &init_struct_pid, + [PIDTYPE_SID] = &init_struct_pid, + }, INIT_PREV_CPUTIME(init_signals) }; @@ -112,11 +117,7 @@ struct task_struct init_task INIT_CPU_TIMERS(init_task) .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), .timer_slack_ns = 50000, /* 50 usec default slack */ - .pids = { - [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), - [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), - [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), - }, + .thread_pid = &init_struct_pid, .thread_group = LIST_HEAD_INIT(init_task.thread_group), .thread_node = LIST_HEAD_INIT(init_signals.thread_head), #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..d2952162399b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1549,10 +1549,22 @@ static void posix_cpu_timers_init(struct task_struct *tsk) static inline void posix_cpu_timers_init(struct task_struct *tsk) { } #endif +static inline void init_task_pid_links(struct task_struct *task) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&task->pid_links[type]); + } +} + static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { - task->pids[type].pid = pid; + if (type == PIDTYPE_PID) + task->thread_pid = pid; + else + task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) @@ -1928,6 +1940,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2036,13 +2049,13 @@ fork_out: return ERR_PTR(retval); } -static inline void init_idle_pids(struct pid_link *links) +static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { - INIT_HLIST_NODE(&links[type].node); /* not really needed */ - links[type].pid = &init_struct_pid; + INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ + init_task_pid(idle, type, &init_struct_pid); } } @@ -2052,7 +2065,7 @@ struct task_struct *fork_idle(int cpu) task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { - init_idle_pids(task->pids); + init_idle_pids(task); init_idle(task, cpu); } diff --git a/kernel/pid.c b/kernel/pid.c index d0de2b59f86f..f8486d2e2346 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -265,27 +265,35 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); +static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) +{ + return (type == PIDTYPE_PID) ? + &task->thread_pid : + (type == __PIDTYPE_TGID) ? + &task->signal->leader_pid : + &task->signal->pids[type]; +} + /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid_link *link = &task->pids[type]; - hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); + struct pid *pid = *task_pid_ptr(task, type); + hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { - struct pid_link *link; + struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; - link = &task->pids[type]; - pid = link->pid; + pid = *pid_ptr; - hlist_del_rcu(&link->node); - link->pid = new; + hlist_del_rcu(&task->pid_links[type]); + *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (!hlist_empty(&pid->tasks[tmp])) @@ -310,8 +318,9 @@ void change_pid(struct task_struct *task, enum pid_type type, void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - new->pids[type].pid = old->pids[type].pid; - hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + if (type == PIDTYPE_PID) + new->thread_pid = old->thread_pid; + hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) @@ -322,7 +331,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) - result = hlist_entry(first, struct task_struct, pids[(type)].node); + result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } @@ -360,9 +369,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); - if (type != PIDTYPE_PID) - task = task->group_leader; - pid = get_pid(rcu_dereference(task->pids[type].pid)); + pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } @@ -420,16 +427,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) { - struct pid *pid; - if (type == PIDTYPE_PID) - pid = task_pid(task); - else if (type == __PIDTYPE_TGID) - pid = task_tgid(task); - else - pid = rcu_dereference(task->group_leader->pids[type].pid); - nr = pid_nr_ns(pid, ns); - } + if (likely(pid_alive(task))) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; -- cgit From 6883f81aac6f44e7df70a6af189b3689ff52cbfb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 4 Jun 2017 04:32:13 -0500 Subject: pid: Implement PIDTYPE_TGID Everywhere except in the pid array we distinguish between a tasks pid and a tasks tgid (thread group id). Even in the enumeration we want that distinction sometimes so we have added __PIDTYPE_TGID. With leader_pid we almost have an implementation of PIDTYPE_TGID in struct signal_struct. Add PIDTYPE_TGID as a first class member of the pid_type enumeration and into the pids array. Then remove the __PIDTYPE_TGID special case and the leader_pid in signal_struct. The net size increase is just an extra pointer added to struct pid and an extra pair of pointers of an hlist_node added to task_struct. The effect on code maintenance is the removal of a number of special cases today and the potential to remove many more special cases as PIDTYPE_TGID gets used to it's fullest. The long term potential is allowing zombie thread group leaders to exit, which will remove a lot more special cases in the code. Signed-off-by: "Eric W. Biederman" --- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/fsys.S | 4 ++-- arch/s390/kernel/perf_cpum_sf.c | 2 +- fs/exec.c | 1 + include/linux/pid.h | 3 +-- include/linux/sched.h | 4 ++-- include/linux/sched/signal.h | 5 ++--- init/init_task.c | 2 +- kernel/events/core.c | 2 +- kernel/exit.c | 1 + kernel/fork.c | 3 ++- kernel/pid.c | 2 -- kernel/time/itimer.c | 5 +++-- kernel/time/posix-cpu-timers.c | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index c1f8a57855af..00e8e2a1eb19 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -67,7 +67,7 @@ void foo(void) DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, group_stop_count)); DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); - DEFINE(IA64_SIGNAL_LEADER_PID_OFFSET, offsetof (struct signal_struct, leader_pid)); + DEFINE(IA64_SIGNAL_PIDS_TGID_OFFSET, offsetof (struct signal_struct, pids[PIDTYPE_TGID])); BLANK(); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index e85ebdac678b..d80c99a5f55d 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -68,10 +68,10 @@ ENTRY(fsys_getpid) add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; ld4 r9=[r9] - add r17=IA64_SIGNAL_LEADER_PID_OFFSET,r17 + add r17=IA64_SIGNAL_PIDS_TGID_OFFSET,r17 ;; and r9=TIF_ALLWORK_MASK,r9 - ld8 r17=[r17] // r17 = current->signal->leader_pid + ld8 r17=[r17] // r17 = current->signal->pids[PIDTYPE_TGID] ;; add r8=IA64_PID_LEVEL_OFFSET,r17 ;; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 0292d68e7dde..ca0b7ae894bb 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -665,7 +665,7 @@ static void cpumsf_output_event_pid(struct perf_event *event, goto out; /* Update the process ID (see also kernel/events/core.c) */ - data->tid_entry.pid = cpumsf_pid_type(event, pid, __PIDTYPE_TGID); + data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID); data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID); perf_output_sample(&handle, &header, data, event); diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..79a11fbded7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1146,6 +1146,7 @@ static int de_thread(struct task_struct *tsk) */ tsk->pid = leader->pid; change_pid(tsk, PIDTYPE_PID, task_pid(leader)); + transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); diff --git a/include/linux/pid.h b/include/linux/pid.h index 3d4c504dcc8c..14a9a39da9c7 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -7,11 +7,10 @@ enum pid_type { PIDTYPE_PID, + PIDTYPE_TGID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX, - /* only valid to __task_pid_nr_ns() */ - __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 445bdf5b1f64..06b4e3bda93a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1275,12 +1275,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { - return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { - return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 2dcded16eb1e..ee30a5ba475f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -147,7 +147,6 @@ struct signal_struct { #endif /* PID/PID hash table linkage. */ - struct pid *leader_pid; struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL @@ -571,7 +570,7 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type) static inline struct pid *task_tgid(struct task_struct *task) { - return task->signal->leader_pid; + return task->signal->pids[PIDTYPE_TGID]; } /* @@ -607,7 +606,7 @@ static inline bool thread_group_leader(struct task_struct *p) */ static inline bool has_group_leader_pid(struct task_struct *p) { - return task_pid(p) == p->signal->leader_pid; + return task_pid(p) == task_tgid(p); } static inline diff --git a/init/init_task.c b/init/init_task.c index db12a61259f1..4f97846256d7 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -33,9 +33,9 @@ static struct signal_struct init_signals = { }, #endif INIT_CPU_TIMERS(init_signals) - .leader_pid = &init_struct_pid, .pids = { [PIDTYPE_PID] = &init_struct_pid, + [PIDTYPE_TGID] = &init_struct_pid, [PIDTYPE_PGID] = &init_struct_pid, [PIDTYPE_SID] = &init_struct_pid, }, diff --git a/kernel/events/core.c b/kernel/events/core.c index 80cca2b30c4f..9025b1796ca8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1334,7 +1334,7 @@ static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - return perf_event_pid_type(event, p, __PIDTYPE_TGID); + return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) diff --git a/kernel/exit.c b/kernel/exit.c index 16432428fc6c..25582b442955 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -73,6 +73,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { + detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); diff --git a/kernel/fork.c b/kernel/fork.c index d2952162399b..cc5be0d01ce6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1946,6 +1946,7 @@ static __latent_entropy struct task_struct *copy_process( init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { + init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); @@ -1954,7 +1955,6 @@ static __latent_entropy struct task_struct *copy_process( p->signal->flags |= SIGNAL_UNKILLABLE; } - p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same @@ -1965,6 +1965,7 @@ static __latent_entropy struct task_struct *copy_process( p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); diff --git a/kernel/pid.c b/kernel/pid.c index f8486d2e2346..de1cfc4f75a2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -269,8 +269,6 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : - (type == __PIDTYPE_TGID) ? - &task->signal->leader_pid : &task->signal->pids[type]; } diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index f26acef5d7b4..9a65713c8309 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -139,9 +139,10 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; - trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + trace_itimer_expire(ITIMER_REAL, leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); return HRTIMER_NORESTART; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5a6251ac6f7a..40e6fae46cec 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -895,7 +895,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, - tsk->signal->leader_pid, cur_time); + task_tgid(tsk), cur_time); __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } -- cgit From 2118e1f53f6f0973a1d9a6a7dc9296959bf39ec0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 00:00:29 -0500 Subject: posix-timers: Noralize good_sigevent In good_sigevent directly compute the default return value as "task_tgid(current)". This is exactly the same as "task_pid(current->group_leader)" but written more clearly. In the thread case first compute the thread's pid. Then veify that attached to that pid is a thread of the current thread group. This has the net effect of making the code a little clearer, and making it obvious that posix timers never look up a process by a the pid of a thread. Signed-off-by: "Eric W. Biederman" --- kernel/time/posix-timers.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e08ce3f27447..2bdf08a2bae9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -433,11 +433,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { - struct task_struct *rtn = current->group_leader; + struct pid *pid = task_tgid(current); + struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: - rtn = find_task_by_vpid(event->sigev_notify_thread_id); + pid = find_vpid(event->sigev_notify_thread_id); + rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; /* FALLTHRU */ @@ -447,7 +449,7 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; /* FALLTHRU */ case SIGEV_NONE: - return task_pid(rtn); + return pid; default: return NULL; } -- cgit From 24122c7f4969adeeaeca3fb1656a31569e9aa59b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 14:30:23 -0500 Subject: signal: Pass pid and pid type into send_sigqueue Make the code more maintainable by performing more of the signal related work in send_sigqueue. A quick inspection of do_timer_create will show that this code path does not lookup a thread group by a thread's pid. Making it safe to find the task pointed to by it_pid with "pid_task(it_pid, type)"; This supports the changes needed in fork to tell if a signal was sent to a single process or a group of processes. Having the pid to task transition in signal.c will also make it easier to sort out races with de_thread and and the thread group leader exiting when it comes time to address that. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 +- kernel/signal.c | 14 +++++++++----- kernel/time/posix-timers.c | 13 ++++--------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ee30a5ba475f..94558ffa82ab 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -330,7 +330,7 @@ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); +extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline int restart_syscall(void) diff --git a/kernel/signal.c b/kernel/signal.c index 8d8a940422a8..40feb14e276d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1664,17 +1664,20 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) +int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) { int sig = q->info.si_signo; struct sigpending *pending; + struct task_struct *t; unsigned long flags; int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); ret = -1; - if (!likely(lock_task_sighand(t, &flags))) + rcu_read_lock(); + t = pid_task(pid, type); + if (!t || !likely(lock_task_sighand(t, &flags))) goto ret; ret = 1; /* the signal is ignored */ @@ -1696,15 +1699,16 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun = 0; signalfd_notify(t, sig); - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + complete_signal(sig, t, type != PIDTYPE_PID); result = TRACE_SIGNAL_DELIVERED; out: - trace_signal_generate(sig, &q->info, t, group, result); + trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); ret: + rcu_read_unlock(); return ret; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2bdf08a2bae9..2d2e739fbc57 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -332,8 +332,8 @@ void posixtimer_rearm(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { - struct task_struct *task; - int shared, ret = -1; + enum pid_type type; + int ret = -1; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). @@ -347,13 +347,8 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - rcu_read_lock(); - task = pid_task(timr->it_pid, PIDTYPE_PID); - if (task) { - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, task, shared); - } - rcu_read_unlock(); + type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; + ret = send_sigqueue(timr->sigq, timr->it_pid, type); /* If we failed to send the signal the timer stops. */ return ret > 0; } -- cgit From 0102498083d58d8b17759642c602b525215e1a54 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 18:40:57 -0500 Subject: signal: Pass pid type into group_send_sig_info This passes the information we already have at the call sight into group_send_sig_info. Ultimatelly allowing for to better handle signals sent to a group of processes. Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 4 +++- kernel/exit.c | 3 ++- kernel/signal.c | 10 ++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3c5200137b24..d8f2bf3d41e6 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,11 +254,13 @@ static inline int valid_signal(unsigned long sig) struct timespec; struct pt_regs; +enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, bool group); -extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); +extern int group_send_sig_info(int sig, struct siginfo *info, + struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); diff --git a/kernel/exit.c b/kernel/exit.c index 25582b442955..0e21e6d21f35 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -681,7 +681,8 @@ static void forget_original_parent(struct task_struct *father, t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, - SEND_SIG_NOINFO, t); + SEND_SIG_NOINFO, t, + PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to diff --git a/kernel/signal.c b/kernel/signal.c index 40feb14e276d..c7527338fe9d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1274,7 +1274,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, /* * send signal info to all the members of a group */ -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + enum pid_type type) { int ret; @@ -1301,7 +1302,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) success = 0; retval = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); success |= !err; retval = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); @@ -1317,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) - error = group_send_sig_info(sig, info, p); + error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; @@ -1420,7 +1421,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, + PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; -- cgit From 40b3b02535621027f56d248139e0e467573c3098 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 10:45:15 -0500 Subject: signal: Pass pid type into do_send_sig_info This passes the information we already have at the call sight into do_send_sig_info. Ultimately allowing for better handling of signals sent to a group of processes during fork. Signed-off-by: "Eric W. Biederman" --- drivers/tty/sysrq.c | 2 +- fs/fcntl.c | 6 +++--- include/linux/signal.h | 2 +- kernel/signal.c | 10 +++++----- mm/oom_kill.c | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 6364890575ec..06ed20dd01ba 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -348,7 +348,7 @@ static void send_sig_all(int sig) if (is_global_init(p)) continue; - do_send_sig_info(sig, SEND_SIG_FORCED, p, true); + do_send_sig_info(sig, SEND_SIG_FORCED, p, PIDTYPE_MAX); } read_unlock(&tasklist_lock); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 5d596a00f40b..a04accf6847f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -767,11 +767,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; - if (!do_send_sig_info(signum, &si, p, type != PIDTYPE_PID)) + if (!do_send_sig_info(signum, &si, p, type)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type != PIDTYPE_PID); + do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } @@ -808,7 +808,7 @@ static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) - do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type != PIDTYPE_PID); + do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct fown_struct *fown) diff --git a/include/linux/signal.h b/include/linux/signal.h index d8f2bf3d41e6..fe125b0335f7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -258,7 +258,7 @@ enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, - struct task_struct *p, bool group); + struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index c7527338fe9d..2c09e6143dd8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1161,13 +1161,13 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) } int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - bool group) + enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, group); + ret = send_signal(sig, info, p, type != PIDTYPE_PID); unlock_task_sighand(p, &flags); } @@ -1284,7 +1284,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, rcu_read_unlock(); if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); + ret = do_send_sig_info(sig, info, p, type); return ret; } @@ -1448,7 +1448,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - return do_send_sig_info(sig, info, p, false); + return do_send_sig_info(sig, info, p, PIDTYPE_PID); } #define __si_special(priv) \ @@ -3199,7 +3199,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) * probe. No signal is actually delivered. */ if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); + error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 84081e77bc51..2cc9b238368f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -920,7 +920,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * in order to prevent the OOM victim from depleting the memory * reserves from the user space under its control. */ - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID); mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), @@ -958,7 +958,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) */ if (unlikely(p->flags & PF_KTHREAD)) continue; - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID); } rcu_read_unlock(); -- cgit From b213984bd3987e6f5bf96b7847d151b7b42e1efd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 15:49:17 -0500 Subject: signal: Push pid type down into send_signal This information is already available in the callers and by pushing it down it makes the code a little clearer, and allows better group signal behavior in fork. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2c09e6143dd8..8decc70c1dc2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1103,7 +1103,7 @@ ret: } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) + enum pid_type type) { int from_ancestor_ns = 0; @@ -1112,7 +1112,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif - return __send_signal(sig, info, t, group, from_ancestor_ns); + return __send_signal(sig, info, t, type != PIDTYPE_PID, from_ancestor_ns); } static void print_fatal_signal(int signr) @@ -1151,13 +1151,13 @@ __setup("print-fatal-signals=", setup_print_fatal_signals); int __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - return send_signal(sig, info, p, 1); + return send_signal(sig, info, p, PIDTYPE_TGID); } static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) { - return send_signal(sig, info, t, 0); + return send_signal(sig, info, t, PIDTYPE_PID); } int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, @@ -1167,7 +1167,7 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, type != PIDTYPE_PID); + ret = send_signal(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -3966,7 +3966,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, false); + ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", -- cgit From 5a883cee7442a5fd72e18eae895113fbd7b16110 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 19:26:27 -0500 Subject: signal: Push pid type down into __send_signal This information is already available in the callers and by pushing it down it makes the code a little clearer, and allows implementing better handling of signales set to a group of processes in fork. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8decc70c1dc2..1ef94303d87a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -998,7 +998,7 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str #endif static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group, int from_ancestor_ns) + enum pid_type type, int from_ancestor_ns) { struct sigpending *pending; struct sigqueue *q; @@ -1012,7 +1012,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more @@ -1096,9 +1096,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + complete_signal(sig, t, type != PIDTYPE_PID); ret: - trace_signal_generate(sig, info, t, group, result); + trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } @@ -1112,7 +1112,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif - return __send_signal(sig, info, t, type != PIDTYPE_PID, from_ancestor_ns); + return __send_signal(sig, info, t, type, from_ancestor_ns); } static void print_fatal_signal(int signr) @@ -1377,7 +1377,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, 1, 0); + ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); unlock_task_sighand(p, &flags); } else ret = -ESRCH; -- cgit From 0729614992c946f6e8ccb9ef260aea1f06993df0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 21:39:13 -0500 Subject: signal: Push pid type down into complete_signal. This is the bottom and by pushing this down it simplifies the callers and otherwise leaves things as is. This is in preparation for allowing fork to implement better handling of signals set to groups of processes. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1ef94303d87a..dddbea558455 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -895,7 +895,7 @@ static inline int wants_signal(int sig, struct task_struct *p) return task_curr(p) || !signal_pending(p); } -static void complete_signal(int sig, struct task_struct *p, int group) +static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -908,7 +908,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) */ if (wants_signal(sig, p)) t = p; - else if (!group || thread_group_empty(p)) + else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. @@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); - complete_signal(sig, t, type != PIDTYPE_PID); + complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; @@ -1704,7 +1704,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); - complete_signal(sig, t, type != PIDTYPE_PID); + complete_signal(sig, t, type); result = TRACE_SIGNAL_DELIVERED; out: trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); -- cgit From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 13:48:51 -0700 Subject: mm: use helper functions for allocating and freeing vm_area structs The vm_area_struct is one of the most fundamental memory management objects, but the management of it is entirely open-coded evertwhere, ranging from allocation and freeing (using kmem_cache_[z]alloc and kmem_cache_free) to initializing all the fields. We want to unify this in order to end up having some unified initialization of the vmas, and the first step to this is to at least have basic allocation functions. Right now those functions are literally just wrappers around the kmem_cache_*() calls. This is a purely mechanical conversion: # new vma: kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc() # copy old vma kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old) # free vma kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma) to the point where the old vma passed in to the vm_area_dup() function isn't even used yet (because I've left all the old manual initialization alone). Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/mm/init.c | 8 ++++---- fs/exec.c | 4 ++-- include/linux/mm.h | 4 +++- kernel/fork.c | 21 ++++++++++++++++++--- mm/mmap.c | 22 +++++++++++----------- mm/nommu.c | 8 ++++---- 7 files changed, 44 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 3b38c717008a..e859246badca 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,7 +2278,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; @@ -2346,7 +2346,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t return 0; error: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); error_kmem: pfm_rvfree(smpl_buf, size); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 18278b448530..3f2321bffb72 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,7 +114,7 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -125,7 +125,7 @@ ia64_init_addr_space (void) down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(¤t->mm->mmap_sem); @@ -133,7 +133,7 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -144,7 +144,7 @@ ia64_init_addr_space (void) down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(¤t->mm->mmap_sem); diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..9bd83989ea25 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + bprm->vma = vma = vm_area_alloc(); if (!vma) return -ENOMEM; @@ -326,7 +326,7 @@ err: up_write(&mm->mmap_sem); err_free: bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return err; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 3982c83fdcbf..de2fd86c6154 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). */ -extern struct kmem_cache *vm_area_cachep; +struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *); +void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..0e23deb5acfc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -303,11 +303,26 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; +static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +struct vm_area_struct *vm_area_alloc(void) +{ + return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +} + +void vm_area_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + static void account_kernel_stack(struct task_struct *tsk, int account) { void *stack = task_stack_page(tsk); @@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -539,7 +554,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); + vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); diff --git a/mm/mmap.c b/mm/mmap.c index 5801b5f0a634..4286ad2dd1f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return next; } @@ -911,7 +911,7 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + vm_area_free(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1729,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1832,7 +1832,7 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -2620,7 +2620,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return err; } - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) return -ENOMEM; @@ -2669,7 +2669,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: - kmem_cache_free(vm_area_cachep, new); + vm_area_free(new); return err; } @@ -2984,7 +2984,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; @@ -3202,7 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { - new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new_vma = vm_area_dup(vma); if (!new_vma) goto out; *new_vma = *vma; @@ -3226,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: - kmem_cache_free(vm_area_cachep, new_vma); + vm_area_free(new_vma); out: return NULL; } @@ -3350,7 +3350,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3376,7 +3376,7 @@ static struct vm_area_struct *__install_special_mapping( return vma; out: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ERR_PTR(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 4452d8bd9ae4..006e3fe65017 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -769,7 +769,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); } /* @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) goto error_getting_vma; @@ -1368,7 +1368,7 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ret; sharing_violation: @@ -1469,7 +1469,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!region) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) { kmem_cache_free(vm_region_jar, region); return -ENOMEM; -- cgit From 95faf6992df468f617edb788da8c21c6eed0dfa7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 14:48:45 -0700 Subject: mm: make vm_area_dup() actually copy the old vma data .. and re-initialize th eanon_vma_chain head. This removes some boiler-plate from the users, and also makes it clear why it didn't need use the 'zalloc()' version. Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++++--- mm/mmap.c | 7 ------- mm/nommu.c | 1 - 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0e23deb5acfc..67253e41bfb0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,7 +315,13 @@ struct vm_area_struct *vm_area_alloc(void) struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { - return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (new) { + *new = *orig; + INIT_LIST_HEAD(&new->anon_vma_chain); + } + return new; } void vm_area_free(struct vm_area_struct *vma) @@ -473,8 +479,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; diff --git a/mm/mmap.c b/mm/mmap.c index 4286ad2dd1f5..b0ed8ce1b67e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2624,11 +2624,6 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) return -ENOMEM; - /* most fields are the same, copy all, and then fixup */ - *new = *vma; - - INIT_LIST_HEAD(&new->anon_vma_chain); - if (new_below) new->vm_end = addr; else { @@ -3205,13 +3200,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = vm_area_dup(vma); if (!new_vma) goto out; - *new_vma = *vma; new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) diff --git a/mm/nommu.c b/mm/nommu.c index 006e3fe65017..c2560e9cc803 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1476,7 +1476,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, } /* most fields are the same, copy all, and then fixup */ - *new = *vma; *region = *vma->vm_region; new->vm_region = region; -- cgit From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 15:24:03 -0700 Subject: mm: make vm_area_alloc() initialize core fields Like vm_area_dup(), it initializes the anon_vma_chain head, and the basic mm pointer. The rest of the fields end up being different for different users, although the plan is to also initialize the 'vm_ops' field to a dummy entry. Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 +--- arch/ia64/mm/init.c | 8 ++------ fs/exec.c | 4 +--- include/linux/mm.h | 2 +- kernel/fork.c | 10 ++++++++-- mm/mmap.c | 12 +++--------- mm/nommu.c | 3 +-- 7 files changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e859246badca..46bff1661836 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,17 +2278,15 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; } - INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer */ - vma->vm_mm = mm; vma->vm_file = get_file(filp); vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 3f2321bffb72..bdb14a369137 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,10 +114,8 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. */ - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; @@ -133,10 +131,8 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | diff --git a/fs/exec.c b/fs/exec.c index 9bd83989ea25..72e961a62adb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = vm_area_alloc(); + bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; @@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) err = -EINTR; goto err_free; } - vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture @@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) diff --git a/include/linux/mm.h b/include/linux/mm.h index de2fd86c6154..d3a3842316b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). */ -struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); diff --git a/kernel/fork.c b/kernel/fork.c index 67253e41bfb0..a191c05e757d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(void) +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + + if (vma) { + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); + } + return vma; } struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) diff --git a/mm/mmap.c b/mm/mmap.c index b0ed8ce1b67e..ff1944d8d458 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -2979,14 +2977,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; @@ -3343,12 +3339,10 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; diff --git a/mm/nommu.c b/mm/nommu.c index c2560e9cc803..1d22fdbf7d7c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (!vma) goto error_getting_vma; @@ -1212,7 +1212,6 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; -- cgit From 6e9df95b76cad18f7b217bdad7bb8a26d63b8c47 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Fri, 20 Jul 2018 15:16:42 +0530 Subject: livepatch: Validate module/old func name length livepatch module author can pass module name/old function name with more than the defined character limit. With obj->name length greater than MODULE_NAME_LEN, the livepatch module gets loaded but waits forever on the module specified by obj->name to be loaded. It also populates a /sys directory with an untruncated object name. In the case of funcs->old_name length greater then KSYM_NAME_LEN, it would not match against any of the symbol table entries. Instead loop through the symbol table comparing them against a nonexisting function, which can be avoided. The same issues apply, to misspelled/incorrect names. At least gatekeep the modules with over the limit string length, by checking for their length during livepatch module registration. Cc: stable@vger.kernel.org Signed-off-by: Kamalesh Babulal Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3a4656fb7047..5b77a7314e01 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -678,6 +678,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) if (!func->old_name || !func->new_func) return -EINVAL; + if (strlen(func->old_name) >= KSYM_NAME_LEN) + return -EINVAL; + INIT_LIST_HEAD(&func->stack_node); func->patched = false; func->transition = false; @@ -751,6 +754,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (!obj->funcs) return -EINVAL; + if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) + return -EINVAL; + obj->patched = false; obj->mod = NULL; -- cgit From 4ca1d3ee46130e9b939c02a93e3970dad151fed6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 15:30:33 -0500 Subject: fork: Move and describe why the code examines PIDNS_ADDING Normally this would be something that would be handled by handling signals that are sent to a group of processes but in this case the forking process is not a member of the group being signaled. Thus special code is needed to prevent a race with pid namespaces exiting, and fork adding new processes within them. Move this test up before the signal restart just in case signals are also pending. Fatal conditions should take presedence over restarts. Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cc5be0d01ce6..b9c54318a292 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1922,6 +1922,12 @@ static __latent_entropy struct task_struct *copy_process( rseq_fork(p, clone_flags); + /* Don't start children in a dying pid namespace */ + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the @@ -1935,10 +1941,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; - goto bad_fork_cancel_cgroup; - } + init_task_pid_links(p); if (likely(p->pid)) { -- cgit From 7673bf553b2732a00f7644fb2adadda69389ab37 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 08:01:10 -0500 Subject: fork: Unconditionally exit if a fatal signal is pending In practice this does not change anything as testing for fatal_signal_pending and exiting for with an error code duplicates the work of the next clause which recalculates pending signals and then exits fork if any are pending. In both cases the pending signal will trigger the slow path when existing to userspace, and the fatal signal will cause do_exit to be called. The advantage of making this a separate test is that it makes it clear processing the fatal signal will terminate the fork, and it allows the rest of the signal logic to be updated without fear that this important case will be lost. Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b9c54318a292..22d4cdb9a7ca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1928,6 +1928,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* Let kill terminate clone/fork in the middle */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto bad_fork_cancel_cgroup; + } + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the -- cgit From 2fa147bdbf672c53386a8f5f2c7fe358004c3ef8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 13 Jul 2018 21:50:01 -0700 Subject: mm, dev_pagemap: Do not clear ->mapping on final put MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MEMORY_DEVICE_FS_DAX relies on typical page semantics whereby ->mapping is only ever cleared by truncation, not final put. Without this fix dax pages may forget their mapping association at the end of every page pin event. Move this atypical behavior that HMM wants into the HMM ->page_free() callback. Cc: Cc: Jan Kara Cc: Jérôme Glisse Cc: Andrew Morton Cc: Ross Zwisler Fixes: d2c997c0f145 ("fs, dax: use page->mapping...") Signed-off-by: Dan Williams Acked-by: Jérôme Glisse Signed-off-by: Dave Jiang --- kernel/memremap.c | 1 - mm/hmm.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 5857267a4af5..62603634a1d2 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -339,7 +339,6 @@ void __put_devmap_managed_page(struct page *page) __ClearPageActive(page); __ClearPageWaiters(page); - page->mapping = NULL; mem_cgroup_uncharge(page); page->pgmap->page_free(page, page->pgmap->data); diff --git a/mm/hmm.c b/mm/hmm.c index de7b6bf77201..f9d1d89dec4d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -963,6 +963,8 @@ static void hmm_devmem_free(struct page *page, void *data) { struct hmm_devmem *devmem = data; + page->mapping = NULL; + devmem->ops->free(devmem, page); } -- cgit From 6283fa38dc8744dc7c2bd2a03bb0478fe42f79fa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 20 Jul 2018 17:38:37 -0700 Subject: bpf: btf: Ensure the member->offset is in the right order This patch ensures the member->offset of a struct is in the correct order (i.e the later member's offset cannot go backward). The current "pahole -J" BTF encoder does not generate something like this. However, checking this can ensure future encoder will not violate this. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 14 +++++++++++++- tools/testing/selftests/bpf/test_btf.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9704934252b3..2590700237c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1519,9 +1519,9 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; + u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; - u32 meta_needed; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1534,6 +1534,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); + last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, @@ -1555,6 +1556,16 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * ">" instead of ">=" because the last member could be + * "char a[0];" + */ + if (last_offset > member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, "Memmber bits_offset exceeds its struct size"); @@ -1562,6 +1573,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } btf_verifier_log_member(env, t, member, NULL); + last_offset = member->offset; } return meta_needed; diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 3619f3023088..402c0f7cc418 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -247,6 +247,34 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 4, }, +{ + .descr = "struct test #3 Invalid member offset", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* int64 */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8), + + /* struct A { */ /* [3] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 16), + BTF_MEMBER_ENC(NAME_TBD, 1, 64), /* int m; */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* int64 n; */ + /* } */ + BTF_END_RAW, + }, + .str_sec = "\0A\0m\0n\0", + .str_sec_size = sizeof("\0A\0m\0n\0"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "struct_test3_map", + .key_size = sizeof(int), + .value_size = 16, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 4, + .btf_load_err = true, + .err_str = "Invalid member bits_offset", +}, + /* Test member exceeds the size of struct. * * struct A { -- cgit From 73d5e2b472640b1fcdb61ae8be389912ef211bda Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 24 Jul 2018 18:17:40 +0200 Subject: cpu/hotplug: detect SMT disabled by BIOS If SMT is disabled in BIOS, the CPU code doesn't properly detect it. The /sys/devices/system/cpu/smt/control file shows 'on', and the 'l1tf' vulnerabilities file shows SMT as vulnerable. Fix it by forcing 'cpu_smt_control' to CPU_SMT_NOT_SUPPORTED in such a case. Unfortunately the detection can only be done after bringing all the CPUs online, so we have to overwrite any previous writes to the variable. Reported-by: Joe Mario Tested-by: Jiri Kosina Fixes: f048c399e0f7 ("x86/topology: Provide topology_smt_supported()") Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra --- kernel/cpu.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 37eec872042b..2bc3d2f5b2a5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2125,6 +2125,15 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_state_init(void) { + /* + * If SMT was disabled by BIOS, detect it here, after the CPUs have + * been brought online. This ensures the smt/l1tf sysfs entries are + * consistent with reality. Note this may overwrite cpu_smt_control's + * previous setting. + */ + if (topology_max_smt_threads() == 1) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -- cgit From 62cedf3e60af03e47849fe2bd6a03ec179422a8a Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Fri, 20 Jul 2018 10:39:13 +0200 Subject: locking/rtmutex: Allow specifying a subclass for nested locking Needed for annotating rt_mutex locks. Tested-by: John Sperbeck Signed-off-by: Peter Rosin Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Deepa Dinamani Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Chang Cc: Peter Zijlstra Cc: Philippe Ombredanne Cc: Thomas Gleixner Cc: Will Deacon Cc: Wolfram Sang Link: http://lkml.kernel.org/r/20180720083914.1950-2-peda@axentia.se Signed-off-by: Ingo Molnar --- include/linux/rtmutex.h | 7 +++++++ kernel/locking/rtmutex.c | 29 +++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1b92a28dd672..6fd615a0eea9 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -106,7 +106,14 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock) extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void rt_mutex_destroy(struct rt_mutex *lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +#define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#else extern void rt_mutex_lock(struct rt_mutex *lock); +#define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#endif + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); extern int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4f014be7a4b8..2823d4163a37 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1465,6 +1465,29 @@ rt_mutex_fastunlock(struct rt_mutex *lock, rt_mutex_postunlock(&wake_q); } +static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +{ + might_sleep(); + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock(lock, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +#endif + +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * rt_mutex_lock - lock a rt_mutex * @@ -1472,12 +1495,10 @@ rt_mutex_fastunlock(struct rt_mutex *lock, */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + __rt_mutex_lock(lock, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible -- cgit From 6cd0c583b04b2bd9415e07b51b63ab799949dd66 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 23 Jul 2018 12:19:07 +0800 Subject: sched/topology: Check variable group before dereferencing it The 'group' variable in sched_domain_debug_one() is not checked when firstly used in cpumask_test_cpu(cpu, sched_group_span(group)), but it might be NULL (it is checked later in the following while loop) and may cause NULL pointer dereference. We need to check it before using to avoid NULL dereference. Signed-off-by: Yi Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1532319547-33335-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a831427bc7..56a0fed30c0a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -47,7 +47,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_span(group))) { + if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } -- cgit From 2610e88946632afb78aa58e61f11368ac4c0af7b Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Tue, 17 Jul 2018 12:35:29 -0700 Subject: stop_machine: Disable preemption after queueing stopper threads This commit: 9fb8d5dc4b64 ("stop_machine, Disable preemption when waking two stopper threads") does not fully address the race condition that can occur as follows: On one CPU, call it CPU 3, thread 1 invokes cpu_stop_queue_two_works(2, 3,...), and the execution is such that thread 1 queues the works for migration/2 and migration/3, and is preempted after releasing the locks for migration/2 and migration/3, but before waking the threads. Then, On CPU 2, a kworker, call it thread 2, is running, and it invokes cpu_stop_queue_two_works(1, 2,...), such that thread 2 queues the works for migration/1 and migration/2. Meanwhile, on CPU 3, thread 1 resumes execution, and wakes migration/2 and migration/3. This means that when CPU 2 releases the locks for migration/1 and migration/2, but before it wakes those threads, it can be preempted by migration/2. If thread 2 is preempted by migration/2, then migration/2 will execute the first work item successfully, since migration/3 was woken up by CPU 3, but when it goes to execute the second work item, it disables preemption, calls multi_cpu_stop(), and thus, CPU 2 will wait forever for migration/1, which should have been woken up by thread 2. However migration/1 cannot be woken up by thread 2, since it is a kworker, so it is affine to CPU 2, but CPU 2 is running migration/2 with preemption disabled, so thread 2 will never run. Disable preemption after queueing works for stopper threads to ensure that the operation of queueing the works and waking the stopper threads is atomic. Co-Developed-by: Prasad Sodagudi Co-Developed-by: Pavankumar Kondeti Signed-off-by: Isaac J. Manjarres Signed-off-by: Prasad Sodagudi Signed-off-by: Pavankumar Kondeti Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: gregkh@linuxfoundation.org Cc: matt@codeblueprint.co.uk Fixes: 9fb8d5dc4b64 ("stop_machine, Disable preemption when waking two stopper threads") Link: http://lkml.kernel.org/r/1531856129-9871-1-git-send-email-isaacm@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1ff523dae6e2..e190d1ef3a23 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -260,6 +260,15 @@ retry: err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); + /* + * The waking up of stopper threads has to happen + * in the same scheduling context as the queueing. + * Otherwise, there is a possibility of one of the + * above stoppers being woken up by another CPU, + * and preempting us. This will cause us to n ot + * wake up the other stopper forever. + */ + preempt_disable(); unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); @@ -271,7 +280,6 @@ unlock: } if (!err) { - preempt_disable(); wake_up_q(&wakeq); preempt_enable(); } -- cgit From 840d719604b0925ca23dde95f1767e4528668369 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 20 Jul 2018 11:16:30 +0200 Subject: sched/deadline: Update rq_clock of later_rq when pushing a task Daniel Casini got this warn while running a DL task here at RetisLab: [ 461.137582] ------------[ cut here ]------------ [ 461.137583] rq->clock_update_flags < RQCF_ACT_SKIP [ 461.137599] WARNING: CPU: 4 PID: 2354 at kernel/sched/sched.h:967 assert_clock_updated.isra.32.part.33+0x17/0x20 [a ton of modules] [ 461.137646] CPU: 4 PID: 2354 Comm: label_image Not tainted 4.18.0-rc4+ #3 [ 461.137647] Hardware name: ASUS All Series/Z87-K, BIOS 0801 09/02/2013 [ 461.137649] RIP: 0010:assert_clock_updated.isra.32.part.33+0x17/0x20 [ 461.137649] Code: ff 48 89 83 08 09 00 00 eb c6 66 0f 1f 84 00 00 00 00 00 55 48 c7 c7 98 7a 6c a5 c6 05 bc 0d 54 01 01 48 89 e5 e8 a9 84 fb ff <0f> 0b 5d c3 0f 1f 44 00 00 0f 1f 44 00 00 83 7e 60 01 74 0a 48 3b [ 461.137673] RSP: 0018:ffffa77e08cafc68 EFLAGS: 00010082 [ 461.137674] RAX: 0000000000000000 RBX: ffff8b3fc1702d80 RCX: 0000000000000006 [ 461.137674] RDX: 0000000000000007 RSI: 0000000000000096 RDI: ffff8b3fded164b0 [ 461.137675] RBP: ffffa77e08cafc68 R08: 0000000000000026 R09: 0000000000000339 [ 461.137676] R10: ffff8b3fd060d410 R11: 0000000000000026 R12: ffffffffa4e14e20 [ 461.137677] R13: ffff8b3fdec22940 R14: ffff8b3fc1702da0 R15: ffff8b3fdec22940 [ 461.137678] FS: 00007efe43ee5700(0000) GS:ffff8b3fded00000(0000) knlGS:0000000000000000 [ 461.137679] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 461.137680] CR2: 00007efe30000010 CR3: 0000000301744003 CR4: 00000000001606e0 [ 461.137680] Call Trace: [ 461.137684] push_dl_task.part.46+0x3bc/0x460 [ 461.137686] task_woken_dl+0x60/0x80 [ 461.137689] ttwu_do_wakeup+0x4f/0x150 [ 461.137690] ttwu_do_activate+0x77/0x80 [ 461.137692] try_to_wake_up+0x1d6/0x4c0 [ 461.137693] wake_up_q+0x32/0x70 [ 461.137696] do_futex+0x7e7/0xb50 [ 461.137698] __x64_sys_futex+0x8b/0x180 [ 461.137701] do_syscall_64+0x5a/0x110 [ 461.137703] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 461.137705] RIP: 0033:0x7efe4918ca26 [ 461.137705] Code: 00 00 00 74 17 49 8b 48 20 44 8b 59 10 41 83 e3 30 41 83 fb 20 74 1e be 85 00 00 00 41 ba 01 00 00 00 41 b9 01 00 00 04 0f 05 <48> 3d 01 f0 ff ff 73 1f 31 c0 c3 be 8c 00 00 00 49 89 c8 4d 31 d2 [ 461.137738] RSP: 002b:00007efe43ee4928 EFLAGS: 00000283 ORIG_RAX: 00000000000000ca [ 461.137739] RAX: ffffffffffffffda RBX: 0000000005094df0 RCX: 00007efe4918ca26 [ 461.137740] RDX: 0000000000000001 RSI: 0000000000000085 RDI: 0000000005094e24 [ 461.137741] RBP: 00007efe43ee49c0 R08: 0000000005094e20 R09: 0000000004000001 [ 461.137741] R10: 0000000000000001 R11: 0000000000000283 R12: 0000000000000000 [ 461.137742] R13: 0000000005094df8 R14: 0000000000000001 R15: 0000000000448a10 [ 461.137743] ---[ end trace 187df4cad2bf7649 ]--- This warning happened in the push_dl_task(), because __add_running_bw()->cpufreq_update_util() is getting the rq_clock of the later_rq before its update, which takes place at activate_task(). The fix then is to update the rq_clock before calling add_running_bw(). To avoid double rq_clock_update() call, we set ENQUEUE_NOCLOCK flag to activate_task(). Reported-by: Daniel Casini Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Clark Williams Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Fixes: e0367b12674b sched/deadline: Move CPU frequency selection triggering points Link: http://lkml.kernel.org/r/ca31d073a4788acf0684a8b255f14fea775ccf20.1532077269.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 10c7b51c0d1f..b5fbdde6afa9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2090,8 +2090,14 @@ retry: sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); add_rq_bw(&next_task->dl, &later_rq->dl); + + /* + * Update the later_rq clock here, because the clock is used + * by the cpufreq_update_util() inside __add_running_bw(). + */ + update_rq_clock(later_rq); add_running_bw(&next_task->dl, &later_rq->dl); - activate_task(later_rq, next_task, 0); + activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; resched_curr(later_rq); -- cgit From f3d133ee0a17d5694c6f21873eec9863e11fa423 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Wed, 18 Jul 2018 08:46:55 +0800 Subject: sched/rt: Restore rt_runtime after disabling RT_RUNTIME_SHARE NO_RT_RUNTIME_SHARE feature is used to prevent a CPU borrow enough runtime with a spin-rt-task. However, if RT_RUNTIME_SHARE feature is enabled and rt_rq has borrowd enough rt_runtime at the beginning, rt_runtime can't be restored to its initial bandwidth rt_runtime after we disable RT_RUNTIME_SHARE. E.g. on my PC with 4 cores, procedure to reproduce: 1) Make sure RT_RUNTIME_SHARE is enabled cat /sys/kernel/debug/sched_features GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION NO_HRTICK NO_DOUBLE_TICK LB_BIAS NONTASK_CAPACITY TTWU_QUEUE NO_SIS_AVG_CPU SIS_PROP NO_WARN_DOUBLE_CLOCK RT_PUSH_IPI RT_RUNTIME_SHARE NO_LB_MIN ATTACH_AGE_LOAD WA_IDLE WA_WEIGHT WA_BIAS 2) Start a spin-rt-task ./loop_rr & 3) set affinity to the last cpu taskset -p 8 $pid_of_loop_rr 4) Observe that last cpu have borrowed enough runtime. cat /proc/sched_debug | grep rt_runtime .rt_runtime : 950.000000 .rt_runtime : 900.000000 .rt_runtime : 950.000000 .rt_runtime : 1000.000000 5) Disable RT_RUNTIME_SHARE echo NO_RT_RUNTIME_SHARE > /sys/kernel/debug/sched_features 6) Observe that rt_runtime can not been restored cat /proc/sched_debug | grep rt_runtime .rt_runtime : 950.000000 .rt_runtime : 900.000000 .rt_runtime : 950.000000 .rt_runtime : 1000.000000 This patch help to restore rt_runtime after we disable RT_RUNTIME_SHARE. Signed-off-by: Hailong Liu Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1531874815-39357-1-git-send-email-liu.hailong6@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 572567078b60..eaaec8364f96 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -836,6 +836,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * can be time-consuming. Try to avoid it when possible. */ raw_spin_lock(&rt_rq->rt_runtime_lock); + if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) + rt_rq->rt_runtime = rt_b->rt_runtime; skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; raw_spin_unlock(&rt_rq->rt_runtime_lock); if (skip) -- cgit From 2e62c4743adc4c7bfcbc1f45118fc7bec58cf30a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 19 Jul 2018 14:00:06 +0200 Subject: sched/fair: Remove #ifdefs from scale_rt_capacity() Reuse cpu_util_irq() that has been defined for schedutil and set irq util to 0 when !CONFIG_IRQ_TIME_ACCOUNTING. But the compiler is not able to optimize the sequence (at least with aarch64 GCC 7.2.1): free *= (max - irq); free /= max; when irq is fixed to 0 Add a new inline function scale_irq_capacity() that will scale utilization when irq is accounted. Reuse this funciton in schedutil which applies similar formula. Suggested-by: Ingo Molnar Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1532001606-6689-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/cpufreq_schedutil.c | 3 +-- kernel/sched/fair.c | 13 +++---------- kernel/sched/sched.h | 20 ++++++++++++++++++-- 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c3cf7d992159..fc177c06e490 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -177,7 +177,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 97dcd4472a0e..3fffad3bc8a8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -247,8 +247,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * U' = irq + ------- * U * max */ - util *= (max - irq); - util /= max; + util = scale_irq_capacity(util, irq, max); util += irq; /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5f7d521e448..14c3fddf822a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7551,16 +7551,12 @@ static unsigned long scale_rt_capacity(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(NULL, cpu); unsigned long used, free; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) unsigned long irq; -#endif -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - irq = READ_ONCE(rq->avg_irq.util_avg); + irq = cpu_util_irq(rq); if (unlikely(irq >= max)) return 1; -#endif used = READ_ONCE(rq->avg_rt.util_avg); used += READ_ONCE(rq->avg_dl.util_avg); @@ -7569,11 +7565,8 @@ static unsigned long scale_rt_capacity(int cpu) return 1; free = max - used; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - free *= (max - irq); - free /= max; -#endif - return free; + + return scale_irq_capacity(free, irq, max); } static void update_cpu_capacity(struct sched_domain *sd, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ebb4b3c3ece7..614170d9b1aa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -856,6 +856,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#define HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -2210,17 +2211,32 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#endif -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; } + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} #else static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; } -#endif +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} #endif -- cgit From 3d6c50c27bd6418dceb51642540ecfcb8ca708c2 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 4 Jul 2018 11:27:27 +0800 Subject: sched/debug: Show the sum wait time of a task group Although we can rely on cpuacct to present the CPU usage of task groups, it is hard to tell how intense the competition is between these groups on CPU resources. Monitoring the wait time or sched_debug of each process could be very expensive, and there is no good way to accurately represent the conflict with these info, we need the wait time on group dimension. Thus we introduce group's wait_sum to represent the resource conflict between task groups, which is simply the sum of the wait time of the group's cfs_rq. The 'cpu.stat' is modified to show the statistic, like: nr_periods 0 nr_throttled 0 throttled_time 0 wait_sum 2035098795584 Now we can monitor the changes of wait_sum to tell how much a a task group is suffering in the fight of CPU resources. For example: (wait_sum - last_wait_sum) * 100 / (nr_cpu * period_ns) == X% means the task group paid X percentage of period on waiting for the CPU. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ff7dae3b-e5f9-7157-1caa-ff02c6b23dc1@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc177c06e490..2bc391a574e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6748,6 +6748,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); + if (schedstat_enabled() && tg != &root_task_group) { + u64 ws = 0; + int i; + + for_each_possible_cpu(i) + ws += schedstat_val(tg->se[i]->statistics.wait_sum); + + seq_printf(sf, "wait_sum %llu\n", ws); + } + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ -- cgit From 305c1fac3225dfa7eeb89bfe91b7335a6edd5172 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:43 +0530 Subject: sched/numa: Evaluate move once per node task_numa_compare() helps choose the best CPU to move or swap the selected task. To achieve this task_numa_compare() is called for every CPU in the node. Currently it evaluates if the task can be moved/swapped for each of the CPUs. However the move evaluation is mostly independent of the CPU. Evaluating the move logic once per node, provides scope for simplifying task_numa_compare(). Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25705.2 25058.2 -2.51 1 74433 72950 -1.99 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 96589.6 105930 9.670 1 181830 178624 -1.76 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 440.65 941.32 758.98 189.17 numa01.sh Sys: 183.48 320.07 258.42 50.09 numa01.sh User: 37384.65 71818.14 60302.51 13798.96 numa02.sh Real: 61.24 65.35 62.49 1.49 numa02.sh Sys: 16.83 24.18 21.40 2.60 numa02.sh User: 5219.59 5356.34 5264.03 49.07 numa03.sh Real: 822.04 912.40 873.55 37.35 numa03.sh Sys: 118.80 140.94 132.90 7.60 numa03.sh User: 62485.19 70025.01 67208.33 2967.10 numa04.sh Real: 690.66 872.12 778.49 65.44 numa04.sh Sys: 459.26 563.03 494.03 42.39 numa04.sh User: 51116.44 70527.20 58849.44 8461.28 numa05.sh Real: 418.37 562.28 525.77 54.27 numa05.sh Sys: 299.45 481.00 392.49 64.27 numa05.sh User: 34115.09 41324.02 39105.30 2627.68 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 516.14 892.41 739.84 151.32 2.587% numa01.sh Sys: 153.16 192.99 177.70 14.58 45.42% numa01.sh User: 39821.04 69528.92 57193.87 10989.48 5.435% numa02.sh Real: 60.91 62.35 61.58 0.63 1.477% numa02.sh Sys: 16.47 26.16 21.20 3.85 0.943% numa02.sh User: 5227.58 5309.61 5265.17 31.04 -0.02% numa03.sh Real: 739.07 917.73 795.75 64.45 9.776% numa03.sh Sys: 94.46 136.08 109.48 14.58 21.39% numa03.sh User: 57478.56 72014.09 61764.48 5343.69 8.813% numa04.sh Real: 442.61 715.43 530.31 96.12 46.79% numa04.sh Sys: 224.90 348.63 285.61 48.83 72.97% numa04.sh User: 35836.84 47522.47 40235.41 3985.26 46.26% numa05.sh Real: 386.13 489.17 434.94 43.59 20.88% numa05.sh Sys: 144.29 438.56 278.80 105.78 40.77% numa05.sh User: 33255.86 36890.82 34879.31 1641.98 12.11% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 128 +++++++++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14c3fddf822a..b10e0663a49e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1580,9 +1580,8 @@ static bool load_too_imbalanced(long src_load, long dst_load, * be exchanged with the source task */ static void task_numa_compare(struct task_numa_env *env, - long taskimp, long groupimp) + long taskimp, long groupimp, bool maymove) { - struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; long src_load, dst_load; @@ -1603,97 +1602,73 @@ static void task_numa_compare(struct task_numa_env *env, if (cur == env->p) goto unlock; + if (!cur) { + if (maymove || imp > env->best_imp) + goto assign; + else + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative - * the value is, the more rmeote accesses that would be expected to + * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. */ - if (cur) { - /* Skip this swap candidate if cannot move to the source CPU: */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) - goto unlock; + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) + goto unlock; + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* - * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights. + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. */ - if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - /* - * Add some hysteresis to prevent swapping the - * tasks within a group over tiny differences. - */ - if (cur->numa_group) - imp -= imp/16; - } else { - /* - * Compare the group weights. If a task is all by - * itself (not part of a group), use the task weight - * instead. - */ - if (cur->numa_group) - imp += group_weight(cur, env->src_nid, dist) - - group_weight(cur, env->dst_nid, dist); - else - imp += task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - } + if (cur->numa_group) + imp -= imp / 16; + } else { + /* + * Compare the group weights. If a task is all by itself + * (not part of a group), use the task weight instead. + */ + if (cur->numa_group && env->p->numa_group) + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); + else + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp && moveimp <= env->best_imp) + if (imp <= env->best_imp) goto unlock; - if (!cur) { - /* Is there capacity at our destination? */ - if (env->src_stats.nr_running <= env->src_stats.task_capacity && - !env->dst_stats.has_free_capacity) - goto unlock; - - goto balance; - } - - /* Balance doesn't matter much if we're running a task per CPU: */ - if (imp > env->best_imp && src_rq->nr_running == 1 && - dst_rq->nr_running == 1) + if (maymove && moveimp > imp && moveimp > env->best_imp) { + imp = moveimp - 1; + cur = NULL; goto assign; + } /* * In the overloaded case, try and keep the load balanced. */ -balance: - load = task_h_load(env->p); + load = task_h_load(env->p) - task_h_load(cur); + if (!load) + goto assign; + dst_load = env->dst_stats.load + load; src_load = env->src_stats.load - load; - if (moveimp > imp && moveimp > env->best_imp) { - /* - * If the improvement from just moving env->p direction is - * better than swapping tasks around, check if a move is - * possible. Store a slightly smaller score than moveimp, - * so an actually idle CPU will win. - */ - if (!load_too_imbalanced(src_load, dst_load, env)) { - imp = moveimp - 1; - cur = NULL; - goto assign; - } - } - - if (imp <= env->best_imp) - goto unlock; - - if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; - } - if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; +assign: /* * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. @@ -1709,7 +1684,6 @@ balance: local_irq_enable(); } -assign: task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); @@ -1718,15 +1692,27 @@ unlock: static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { + long src_load, dst_load, load; + bool maymove = false; int cpu; + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + maymove = !load_too_imbalanced(src_load, dst_load, env); + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp); + task_numa_compare(env, taskimp, groupimp, maymove); } } -- cgit From 5f95ba7a43057f28a349ea1f03ee8d04e0f445ea Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:44 +0530 Subject: sched/numa: Simplify load_too_imbalanced() Currently load_too_imbalance() cares about the slope of imbalance. It doesn't care of the direction of the imbalance. However this may not work if nodes that are being compared have dissimilar capacities. Few nodes might have more cores than other nodes in the system. Also unlike traditional load balance at a NUMA sched domain, multiple requests to migrate from the same source node to same destination node may run in parallel. This can cause huge load imbalance. This is specially true on a larger machines with either large cores per node or more number of nodes in the system. Hence allow move/swap only if the imbalance is going to reduce. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25058.2 25122.9 0.25 1 72950 73850 1.23 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 516.14 892.41 739.84 151.32 numa01.sh Sys: 153.16 192.99 177.70 14.58 numa01.sh User: 39821.04 69528.92 57193.87 10989.48 numa02.sh Real: 60.91 62.35 61.58 0.63 numa02.sh Sys: 16.47 26.16 21.20 3.85 numa02.sh User: 5227.58 5309.61 5265.17 31.04 numa03.sh Real: 739.07 917.73 795.75 64.45 numa03.sh Sys: 94.46 136.08 109.48 14.58 numa03.sh User: 57478.56 72014.09 61764.48 5343.69 numa04.sh Real: 442.61 715.43 530.31 96.12 numa04.sh Sys: 224.90 348.63 285.61 48.83 numa04.sh User: 35836.84 47522.47 40235.41 3985.26 numa05.sh Real: 386.13 489.17 434.94 43.59 numa05.sh Sys: 144.29 438.56 278.80 105.78 numa05.sh User: 33255.86 36890.82 34879.31 1641.98 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 435.78 653.81 534.58 83.20 38.39% numa01.sh Sys: 121.93 187.18 145.90 23.47 21.79% numa01.sh User: 37082.81 51402.80 43647.60 5409.75 31.03% numa02.sh Real: 60.64 61.63 61.19 0.40 0.637% numa02.sh Sys: 14.72 25.68 19.06 4.03 11.22% numa02.sh User: 5210.95 5266.69 5233.30 20.82 0.608% numa03.sh Real: 746.51 808.24 780.36 23.88 1.972% numa03.sh Sys: 97.26 108.48 105.07 4.28 4.197% numa03.sh User: 58956.30 61397.05 60162.95 1050.82 2.661% numa04.sh Real: 465.97 519.27 484.81 19.62 9.385% numa04.sh Sys: 304.43 359.08 334.68 20.64 -14.6% numa04.sh User: 37544.16 41186.15 39262.44 1314.91 2.478% numa05.sh Real: 411.57 457.20 433.29 16.58 0.380% numa05.sh Sys: 230.05 435.48 339.95 67.58 -17.9% numa05.sh User: 33325.54 36896.31 35637.84 1222.64 -2.12% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b10e0663a49e..226837960ec0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1546,28 +1546,12 @@ static bool load_too_imbalanced(long src_load, long dst_load, src_capacity = env->src_stats.compute_capacity; dst_capacity = env->dst_stats.compute_capacity; - /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); - - /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; - if (imb <= 0) - return false; + imb = abs(dst_load * src_capacity - src_load * dst_capacity); - /* - * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. - */ orig_src_load = env->src_stats.load; orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); /* Would this change make things worse? */ return (imb > old_imb); -- cgit From 8cd45eee43bd46b933158b25aa7c742e0f3e811f Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:45 +0530 Subject: sched/numa: Set preferred_node based on best_cpu Currently preferred node is set to dst_nid which is the last node in the iteration whose group weight or task weight is greater than the current node. However it doesn't guarantee that dst_nid has the numa capacity to move. It also doesn't guarantee that dst_nid has the best_cpu which is the CPU/node ideal for node migration. Lets consider faults on a 4 node system with group weight numbers in different nodes being in 0 < 1 < 2 < 3 proportion. Consider the task is running on 3 and 0 is its preferred node but its capacity is full. Consider nodes 1, 2 and 3 have capacity. Then the task should be migrated to node 1. Currently the task gets moved to node 2. env.dst_nid points to the last node whose faults were greater than current node. Modify to set the preferred node based of best_cpu. Earlier setting preferred node was skipped if nr_active_nodes is 1. This could result in the task being moved out of the preferred node to a random node during regular load balancing. Also while modifying task_numa_migrate(), use sched_setnuma to set preferred node. This ensures out numa accounting is correct. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25122.9 25549.6 1.698 1 73850 73190 -0.89 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 105930 113437 7.08676 1 178624 196130 9.80047 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 435.78 653.81 534.58 83.20 numa01.sh Sys: 121.93 187.18 145.90 23.47 numa01.sh User: 37082.81 51402.80 43647.60 5409.75 numa02.sh Real: 60.64 61.63 61.19 0.40 numa02.sh Sys: 14.72 25.68 19.06 4.03 numa02.sh User: 5210.95 5266.69 5233.30 20.82 numa03.sh Real: 746.51 808.24 780.36 23.88 numa03.sh Sys: 97.26 108.48 105.07 4.28 numa03.sh User: 58956.30 61397.05 60162.95 1050.82 numa04.sh Real: 465.97 519.27 484.81 19.62 numa04.sh Sys: 304.43 359.08 334.68 20.64 numa04.sh User: 37544.16 41186.15 39262.44 1314.91 numa05.sh Real: 411.57 457.20 433.29 16.58 numa05.sh Sys: 230.05 435.48 339.95 67.58 numa05.sh User: 33325.54 36896.31 35637.84 1222.64 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 506.35 794.46 599.06 104.26 -10.76% numa01.sh Sys: 150.37 223.56 195.99 24.94 -25.55% numa01.sh User: 43450.69 61752.04 49281.50 6635.33 -11.43% numa02.sh Real: 60.33 62.40 61.31 0.90 -0.195% numa02.sh Sys: 18.12 31.66 24.28 5.89 -21.49% numa02.sh User: 5203.91 5325.32 5260.29 49.98 -0.513% numa03.sh Real: 696.47 853.62 745.80 57.28 4.6339% numa03.sh Sys: 85.68 123.71 97.89 13.48 7.3347% numa03.sh User: 55978.45 66418.63 59254.94 3737.97 1.5323% numa04.sh Real: 444.05 514.83 497.06 26.85 -2.464% numa04.sh Sys: 230.39 375.79 316.23 48.58 5.8343% numa04.sh User: 35403.12 41004.10 39720.80 2163.08 -1.153% numa05.sh Real: 423.09 460.41 439.57 13.92 -1.428% numa05.sh Sys: 287.38 480.15 369.37 68.52 -7.964% numa05.sh User: 34732.12 38016.80 36255.85 1070.51 -1.704% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-5-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 226837960ec0..0532195c38d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1765,7 +1765,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = task_node(p); + sched_setnuma(p, task_node(p)); return -EINVAL; } @@ -1824,15 +1824,13 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { - struct numa_group *ng = p->numa_group; - if (env.best_cpu == -1) nid = env.src_nid; else - nid = env.dst_nid; + nid = cpu_to_node(env.best_cpu); - if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) - sched_setnuma(p, env.dst_nid); + if (nid != p->numa_preferred_nid) + sched_setnuma(p, nid); } /* No better CPU than the current one was found. */ -- cgit From f03bb6760b8e5e2bcecc88d2a2ef41c09adcab39 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:46 +0530 Subject: sched/numa: Use task faults only if numa_group is not yet set up When numa_group faults are available, task_numa_placement only uses numa_group faults to evaluate preferred node. However it still accounts task faults and even evaluates the preferred node just based on task faults just to discard it in favour of preferred node chosen on the basis of numa_group. Instead use task faults only if numa_group is not set. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25549.6 25215.7 -1.30 1 73190 72107 -1.47 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 113437 113372 -0.05 1 196130 177403 -9.54 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 506.35 794.46 599.06 104.26 numa01.sh Sys: 150.37 223.56 195.99 24.94 numa01.sh User: 43450.69 61752.04 49281.50 6635.33 numa02.sh Real: 60.33 62.40 61.31 0.90 numa02.sh Sys: 18.12 31.66 24.28 5.89 numa02.sh User: 5203.91 5325.32 5260.29 49.98 numa03.sh Real: 696.47 853.62 745.80 57.28 numa03.sh Sys: 85.68 123.71 97.89 13.48 numa03.sh User: 55978.45 66418.63 59254.94 3737.97 numa04.sh Real: 444.05 514.83 497.06 26.85 numa04.sh Sys: 230.39 375.79 316.23 48.58 numa04.sh User: 35403.12 41004.10 39720.80 2163.08 numa05.sh Real: 423.09 460.41 439.57 13.92 numa05.sh Sys: 287.38 480.15 369.37 68.52 numa05.sh User: 34732.12 38016.80 36255.85 1070.51 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 478.45 565.90 515.11 30.87 16.29% numa01.sh Sys: 207.79 271.04 232.94 21.33 -15.8% numa01.sh User: 39763.93 47303.12 43210.73 2644.86 14.04% numa02.sh Real: 60.00 61.46 60.78 0.49 0.871% numa02.sh Sys: 15.71 25.31 20.69 3.42 17.35% numa02.sh User: 5175.92 5265.86 5235.97 32.82 0.464% numa03.sh Real: 776.42 834.85 806.01 23.22 -7.47% numa03.sh Sys: 114.43 128.75 121.65 5.49 -19.5% numa03.sh User: 60773.93 64855.25 62616.91 1576.39 -5.36% numa04.sh Real: 456.93 511.95 482.91 20.88 2.930% numa04.sh Sys: 178.09 460.89 356.86 94.58 -11.3% numa04.sh User: 36312.09 42553.24 39623.21 2247.96 0.246% numa05.sh Real: 393.98 493.48 436.61 35.59 0.677% numa05.sh Sys: 164.49 329.15 265.87 61.78 38.92% numa05.sh User: 33182.65 36654.53 35074.51 1187.71 3.368% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-6-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0532195c38d0..a10c4f8f47e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2110,8 +2110,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1, max_group_nid = -1; - unsigned long max_faults = 0, max_group_faults = 0; + int seq, nid, max_nid = -1; + unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; u64 runtime, period; @@ -2190,15 +2190,15 @@ static void task_numa_placement(struct task_struct *p) } } - if (faults > max_faults) { - max_faults = faults; + if (!p->numa_group) { + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } else if (group_faults > max_faults) { + max_faults = group_faults; max_nid = nid; } - - if (group_faults > max_group_faults) { - max_group_faults = group_faults; - max_group_nid = nid; - } } update_task_scan_period(p, fault_types[0], fault_types[1]); @@ -2206,7 +2206,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); - max_nid = preferred_group_nid(p, max_group_nid); + max_nid = preferred_group_nid(p, max_nid); } if (max_faults) { -- cgit From 67d9f6c256cd66e15f85c92670f52a7ad4689cff Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:47 +0530 Subject: sched/debug: Reverse the order of printing faults Fix the order in which the private and shared numa faults are getting printed. No functional changes. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25215.7 25375.3 0.63 1 72107 72617 0.70 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-7-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c96e89cc4bc7..870d4f3da285 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -842,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf) { SEQ_printf(m, "numa_faults node=%d ", node); - SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); - SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf); } #endif -- cgit From 0ee7e74dc0dc64d9900751d03c5c22dfdd173fb8 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:48 +0530 Subject: sched/numa: Skip nodes that are at 'hoplimit' When comparing two nodes at a distance of 'hoplimit', we should consider nodes only up to 'hoplimit'. Currently we also consider nodes at 'oplimit' distance too. Hence two nodes at a distance of 'hoplimit' will have same groupweight. Fix this by skipping nodes at hoplimit. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25375.3 25308.6 -0.26 1 72617 72964 0.477 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 113372 108750 -4.07684 1 177403 183115 3.21979 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 478.45 565.90 515.11 30.87 numa01.sh Sys: 207.79 271.04 232.94 21.33 numa01.sh User: 39763.93 47303.12 43210.73 2644.86 numa02.sh Real: 60.00 61.46 60.78 0.49 numa02.sh Sys: 15.71 25.31 20.69 3.42 numa02.sh User: 5175.92 5265.86 5235.97 32.82 numa03.sh Real: 776.42 834.85 806.01 23.22 numa03.sh Sys: 114.43 128.75 121.65 5.49 numa03.sh User: 60773.93 64855.25 62616.91 1576.39 numa04.sh Real: 456.93 511.95 482.91 20.88 numa04.sh Sys: 178.09 460.89 356.86 94.58 numa04.sh User: 36312.09 42553.24 39623.21 2247.96 numa05.sh Real: 393.98 493.48 436.61 35.59 numa05.sh Sys: 164.49 329.15 265.87 61.78 numa05.sh User: 33182.65 36654.53 35074.51 1187.71 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 414.64 819.20 556.08 147.70 -7.36% numa01.sh Sys: 77.52 205.04 139.40 52.05 67.10% numa01.sh User: 37043.24 61757.88 45517.48 9290.38 -5.06% numa02.sh Real: 60.80 63.32 61.63 0.88 -1.37% numa02.sh Sys: 17.35 39.37 25.71 7.33 -19.5% numa02.sh User: 5213.79 5374.73 5268.90 55.09 -0.62% numa03.sh Real: 780.09 948.64 831.43 63.02 -3.05% numa03.sh Sys: 104.96 136.92 116.31 11.34 4.591% numa03.sh User: 60465.42 73339.78 64368.03 4700.14 -2.72% numa04.sh Real: 412.60 681.92 521.29 96.64 -7.36% numa04.sh Sys: 210.32 314.10 251.77 37.71 41.74% numa04.sh User: 34026.38 45581.20 38534.49 4198.53 2.825% numa05.sh Real: 394.79 439.63 411.35 16.87 6.140% numa05.sh Sys: 238.32 330.09 292.31 38.32 -9.04% numa05.sh User: 33456.45 34876.07 34138.62 609.45 2.741% While there is a regression with this change, this change is needed from a correctness perspective. Also it helps consolidation as seen from perf bench output. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-8-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a10c4f8f47e8..e5f39e8dfe53 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1312,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * of each group. Skip other nodes. */ if (sched_numa_topology_type == NUMA_BACKPLANE && - dist > maxdist) + dist >= maxdist) continue; /* Add up the faults from nearby nodes. */ -- cgit From 10864a9e222048a862da2c21efa28929a4dfed15 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:49 +0530 Subject: sched/numa: Remove unused task_capacity from 'struct numa_stats' The task_capacity field in 'struct numa_stats' is redundant. Also move nr_running for better packing within the struct. No functional changes. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25308.6 25377.3 0.271 1 72964 72287 -0.92 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-9-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e5f39e8dfe53..4ac60b296d96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1450,14 +1450,12 @@ static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { - unsigned long nr_running; unsigned long load; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long task_capacity; + unsigned int nr_running; int has_free_capacity; }; @@ -1495,9 +1493,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid) smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); capacity = cpus / smt; /* cores */ - ns->task_capacity = min_t(unsigned, capacity, + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); - ns->has_free_capacity = (ns->nr_running < ns->task_capacity); + ns->has_free_capacity = (ns->nr_running < capacity); } struct task_numa_env { -- cgit From 0ad4e3dfe6cf3f207e61cbd8e3e4a943f8c1ad20 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:50 +0530 Subject: sched/numa: Modify migrate_swap() to accept additional parameters There are checks in migrate_swap_stop() that check if the task/CPU combination is as per migrate_swap_arg before migrating. However atleast one of the two tasks to be swapped by migrate_swap() could have migrated to a completely different CPU before updating the migrate_swap_arg. The new CPU where the task is currently running could be a different node too. If the task has migrated, numa balancer might end up placing a task in a wrong node. Instead of achieving node consolidation, it may end up spreading the load across nodes. To avoid that pass the CPUs as additional parameters. While here, place migrate_swap under CONFIG_NUMA_BALANCING. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25377.3 25226.6 -0.59 1 72287 73326 1.437 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-10-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- kernel/sched/fair.c | 3 ++- kernel/sched/sched.h | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2bc391a574e6..deafa9fe602b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1176,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -1257,16 +1258,17 @@ unlock: /* * Cross migrate two tasks */ -int migrate_swap(struct task_struct *cur, struct task_struct *p) +int migrate_swap(struct task_struct *cur, struct task_struct *p, + int target_cpu, int curr_cpu) { struct migration_swap_arg arg; int ret = -EINVAL; arg = (struct migration_swap_arg){ .src_task = cur, - .src_cpu = task_cpu(cur), + .src_cpu = curr_cpu, .dst_task = p, - .dst_cpu = task_cpu(p), + .dst_cpu = target_cpu, }; if (arg.src_cpu == arg.dst_cpu) @@ -1291,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) out: return ret; } +#endif /* CONFIG_NUMA_BALANCING */ /* * wait_task_inactive - wait for a thread to unschedule. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ac60b296d96..7b4eddec3ccc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1848,7 +1848,8 @@ static int task_numa_migrate(struct task_struct *p) return ret; } - ret = migrate_swap(p, env.best_task); + ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 614170d9b1aa..4a2e8cae63c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1099,7 +1099,8 @@ enum numa_faults_stats { }; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *, struct task_struct *); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); #else static inline void -- cgit From 2d4056fafa196e1ab4e7161bae4df76f9602d56d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:53 +0530 Subject: sched/numa: Remove numa_has_capacity() task_numa_find_cpu() helps to find the CPU to swap/move the task to. It's guarded by numa_has_capacity(). However node not having capacity shouldn't deter a task swapping if it helps NUMA placement. Further load_too_imbalanced(), which evaluates possibilities of move/swap, provides similar checks as numa_has_capacity. Hence remove numa_has_capacity() to enhance possibilities of task swapping even if load is imbalanced. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25657.9 25804.1 0.569 1 74435 73413 -1.37 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-13-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b4eddec3ccc..3bcf0e864613 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1456,7 +1456,6 @@ struct numa_stats { unsigned long compute_capacity; unsigned int nr_running; - int has_free_capacity; }; /* @@ -1483,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_free_capacity, or we'll detect a huge - * imbalance and bail there. + * We'll detect a huge imbalance and bail there. */ if (!cpus) return; @@ -1495,7 +1493,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); - ns->has_free_capacity = (ns->nr_running < capacity); } struct task_numa_env { @@ -1698,31 +1695,6 @@ static void task_numa_find_cpu(struct task_numa_env *env, } } -/* Only move tasks to a NUMA node less busy than the current node. */ -static bool numa_has_capacity(struct task_numa_env *env) -{ - struct numa_stats *src = &env->src_stats; - struct numa_stats *dst = &env->dst_stats; - - if (src->has_free_capacity && !dst->has_free_capacity) - return false; - - /* - * Only consider a task move if the source has a higher load - * than the destination, corrected for CPU capacity on each node. - * - * src->load dst->load - * --------------------- vs --------------------- - * src->compute_capacity dst->compute_capacity - */ - if (src->load * dst->compute_capacity * env->imbalance_pct > - - dst->load * src->compute_capacity * 100) - return true; - - return false; -} - static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1777,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1808,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); } } -- cgit From 30619c89b17d46808b4cdf5b3f81b6a01ade1473 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:55 +0530 Subject: sched/numa: Update the scan period without holding the numa_group lock The metrics for updating scan periods are local or task specific. Currently this update happens under the numa_group lock, which seems unnecessary. Hence move this update outside the lock. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25355.9 25645.4 1.141 1 72812 72142 -0.92 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-15-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bcf0e864613..fc33a4b40a09 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2170,8 +2170,6 @@ static void task_numa_placement(struct task_struct *p) } } - update_task_scan_period(p, fault_types[0], fault_types[1]); - if (p->numa_group) { numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); @@ -2186,6 +2184,8 @@ static void task_numa_placement(struct task_struct *p) if (task_node(p) != p->numa_preferred_nid) numa_migrate_preferred(p); } + + update_task_scan_period(p, fault_types[0], fault_types[1]); } static inline int get_numa_group(struct numa_group *grp) -- cgit From f35678b6a17063f3b0d391af5ab8f8c83cf31b0c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:56 +0530 Subject: sched/numa: Use group_weights to identify if migration degrades locality On NUMA_BACKPLANE and NUMA_GLUELESS_MESH systems, tasks/memory should be consolidated to the closest group of nodes. In such a case, relying on group_fault metric may not always help to consolidate. There can always be a case where a node closer to the preferred node may have lesser faults than a node further away from the preferred node. In such a case, moving to node with more faults might avoid numa consolidation. Using group_weight would help to consolidate task/memory around the preferred_node. While here, to be on the conservative side, don't override migrate thread degrades locality logic for CPU_NEWLY_IDLE load balancing. Note: Similar problems exist with should_numa_migrate_memory and will be dealt separately. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25645.4 25960 1.22 1 72142 73550 1.95 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 110199 120071 8.958 1 176303 176249 -0.03 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 490.04 774.86 596.26 96.46 numa01.sh Sys: 151.52 242.88 184.82 31.71 numa01.sh User: 41418.41 60844.59 48776.09 6564.27 numa02.sh Real: 60.14 62.94 60.98 1.00 numa02.sh Sys: 16.11 30.77 21.20 5.28 numa02.sh User: 5184.33 5311.09 5228.50 44.24 numa03.sh Real: 790.95 856.35 826.41 24.11 numa03.sh Sys: 114.93 118.85 117.05 1.63 numa03.sh User: 60990.99 64959.28 63470.43 1415.44 numa04.sh Real: 434.37 597.92 504.87 59.70 numa04.sh Sys: 237.63 397.40 289.74 55.98 numa04.sh User: 34854.87 41121.83 38572.52 2615.84 numa05.sh Real: 386.77 448.90 417.22 22.79 numa05.sh Sys: 149.23 379.95 303.04 79.55 numa05.sh User: 32951.76 35959.58 34562.18 1034.05 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 493.19 672.88 597.51 59.38 -0.20% numa01.sh Sys: 150.09 245.48 207.76 34.26 -11.0% numa01.sh User: 41928.51 53779.17 48747.06 3901.39 0.059% numa02.sh Real: 60.63 62.87 61.22 0.83 -0.39% numa02.sh Sys: 16.64 27.97 20.25 4.06 4.691% numa02.sh User: 5222.92 5309.60 5254.03 29.98 -0.48% numa03.sh Real: 821.52 902.15 863.60 32.41 -4.30% numa03.sh Sys: 112.04 130.66 118.35 7.08 -1.09% numa03.sh User: 62245.16 69165.14 66443.04 2450.32 -4.47% numa04.sh Real: 414.53 519.57 476.25 37.00 6.009% numa04.sh Sys: 181.84 335.67 280.41 54.07 3.327% numa04.sh User: 33924.50 39115.39 37343.78 1934.26 3.290% numa05.sh Real: 408.30 441.45 417.90 12.05 -0.16% numa05.sh Sys: 233.41 381.60 295.58 57.37 2.523% numa05.sh User: 33301.31 35972.50 34335.19 938.94 0.661% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-16-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc33a4b40a09..9c9e54ea65d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6899,8 +6899,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) return -1; @@ -6927,18 +6927,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return 0; /* Leaving a core idle is often worse than degrading locality. */ - if (env->idle != CPU_NOT_IDLE) + if (env->idle == CPU_IDLE) return -1; + dist = node_distance(src_nid, dst_nid); if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); } - return dst_faults < src_faults; + return dst_weight < src_weight; } #else -- cgit From b6a60cf36d497e7fbde9dd5b86fabd96850249f6 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:33:00 +0530 Subject: sched/numa: Move task_numa_placement() closer to numa_migrate_preferred() numa_migrate_preferred() is called periodically or when task preferred node changes. Preferred node evaluations happen once per scan sequence. If the scan completion happens just after the periodic NUMA migration, then we try to migrate to the preferred node and the preferred node might change, needing another node migration. Avoid this by checking for scan sequence completion only when checking for periodic migration. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25862.6 26158.1 1.14258 1 74357 72725 -2.19482 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 117019 113992 -2.58 1 179095 174947 -2.31 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 449.46 770.77 615.22 101.70 numa01.sh Sys: 132.72 208.17 170.46 24.96 numa01.sh User: 39185.26 60290.89 50066.76 6807.84 numa02.sh Real: 60.85 61.79 61.28 0.37 numa02.sh Sys: 15.34 24.71 21.08 3.61 numa02.sh User: 5204.41 5249.85 5231.21 17.60 numa03.sh Real: 785.50 916.97 840.77 44.98 numa03.sh Sys: 108.08 133.60 119.43 8.82 numa03.sh User: 61422.86 70919.75 64720.87 3310.61 numa04.sh Real: 429.57 587.37 480.80 57.40 numa04.sh Sys: 240.61 321.97 290.84 33.58 numa04.sh User: 34597.65 40498.99 37079.48 2060.72 numa05.sh Real: 392.09 431.25 414.65 13.82 numa05.sh Sys: 229.41 372.48 297.54 53.14 numa05.sh User: 33390.86 34697.49 34222.43 556.42 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 424.63 566.18 498.12 59.26 23.50% numa01.sh Sys: 160.19 256.53 208.98 37.02 -18.4% numa01.sh User: 37320.00 46225.58 42001.57 3482.45 19.20% numa02.sh Real: 60.17 62.47 60.91 0.85 0.607% numa02.sh Sys: 15.30 22.82 17.04 2.90 23.70% numa02.sh User: 5202.13 5255.51 5219.08 20.14 0.232% numa03.sh Real: 823.91 844.89 833.86 8.46 0.828% numa03.sh Sys: 130.69 148.29 140.47 6.21 -14.9% numa03.sh User: 62519.15 64262.20 63613.38 620.05 1.740% numa04.sh Real: 515.30 603.74 548.56 30.93 -12.3% numa04.sh Sys: 459.73 525.48 489.18 21.63 -40.5% numa04.sh User: 40561.96 44919.18 42047.87 1526.85 -11.8% numa05.sh Real: 396.58 454.37 421.13 19.71 -1.53% numa05.sh Sys: 208.72 422.02 348.90 73.60 -14.7% numa05.sh User: 33124.08 36109.35 34846.47 1089.74 -1.79% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-20-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9c9e54ea65d9..309c93fcc604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2180,9 +2180,6 @@ static void task_numa_placement(struct task_struct *p) /* Set the new preferred node */ if (max_nid != p->numa_preferred_nid) sched_setnuma(p, max_nid); - - if (task_node(p) != p->numa_preferred_nid) - numa_migrate_preferred(p); } update_task_scan_period(p, fault_types[0], fault_types[1]); @@ -2385,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) numa_is_active_node(mem_node, ng)) local = 1; - task_numa_placement(p); - /* * Retry task to preferred node migration periodically, in case it * case it previously failed, or the scheduler moved us. */ - if (time_after(jiffies, p->numa_migrate_retry)) + if (time_after(jiffies, p->numa_migrate_retry)) { + task_numa_placement(p); numa_migrate_preferred(p); + } if (migrated) p->numa_pages_migrated += pages; -- cgit From 6cbc304f2f360f25cc8607817239d6f4a2fd3dc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 May 2018 15:48:41 +0200 Subject: perf/x86/intel: Fix unwind errors from PEBS entries (mk-II) Vince reported the perf_fuzzer giving various unwinder warnings and Josh reported: > Deja vu. Most of these are related to perf PEBS, similar to the > following issue: > > b8000586c90b ("perf/x86/intel: Cure bogus unwind from PEBS entries") > > This is basically the ORC version of that. setup_pebs_sample_data() is > assembling a franken-pt_regs which ORC isn't happy about. RIP is > inconsistent with some of the other registers (like RSP and RBP). And where the previous unwinder only needed BP,SP ORC also requires IP. But we cannot spoof IP because then the sample will get displaced, entirely negating the point of PEBS. So cure the whole thing differently by doing the unwind early; this does however require a means to communicate we did the unwind early. We (ab)use an unused sample_type bit for this, which we set on events that fill out the data->callchain before the normal perf_prepare_sample(). Debugged-by: Josh Poimboeuf Reported-by: Vince Weaver Tested-by: Josh Poimboeuf Tested-by: Prashant Bhole Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 3 +++ arch/x86/events/intel/ds.c | 25 +++++++++++-------------- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 2 ++ kernel/events/core.c | 6 ++++-- 5 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 707b2a96e516..86f0c15dcc2d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2997,6 +2997,9 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8cf03f101938..8dbba77e0518 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1185,17 +1185,21 @@ static void setup_pebs_sample_data(struct perf_event *event, data->data_src.val = val; } + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happend between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + /* * We use the interrupt regs as a base because the PEBS record does not * contain a full regs set, specifically it seems to lack segment * descriptors, which get used by things like user_mode(). * * In the simple case fix up only the IP for PERF_SAMPLE_IP. - * - * We must however always use BP,SP from iregs for the unwinder to stay - * sane; the record BP,SP can point into thin air when the record is - * from a previous PMI context or an (I)RET happend between the record - * and PMI. */ *regs = *iregs; @@ -1214,15 +1218,8 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->si = pebs->si; regs->di = pebs->di; - /* - * Per the above; only set BP,SP if we don't need callchains. - * - * XXX: does this make sense? - */ - if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) { - regs->bp = pebs->bp; - regs->sp = pebs->sp; - } + regs->bp = pebs->bp; + regs->sp = pebs->sp; #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1fa12887ec02..87f6db437e4a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1130,6 +1130,7 @@ extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); +extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b8e288a1f740..eeb787b1c53c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,6 +143,8 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ + + __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..cdb32cf8e33c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6343,7 +6343,7 @@ static u64 perf_virt_to_phys(u64 virt) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; -static struct perf_callchain_entry * +struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; @@ -6382,7 +6382,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(event, regs); + if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + data->callchain = perf_callchain(event, regs); + size += data->callchain->nr; header->size += size * sizeof(u64); -- cgit From 7f635ff187ab6be0b350b3ec06791e376af238ab Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 16 Jul 2018 17:13:51 -0600 Subject: perf/core: Fix crash when using HW tracing kernel filters In function perf_event_parse_addr_filter(), the path::dentry of each struct perf_addr_filter is left unassigned (as it should be) when the pattern being parsed is related to kernel space. But in function perf_addr_filter_match() the same dentries are given to d_inode() where the value is not expected to be NULL, resulting in the following splat: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 pc : perf_event_mmap+0x2fc/0x5a0 lr : perf_event_mmap+0x2c8/0x5a0 Process uname (pid: 2860, stack limit = 0x000000001cbcca37) Call trace: perf_event_mmap+0x2fc/0x5a0 mmap_region+0x124/0x570 do_mmap+0x344/0x4f8 vm_mmap_pgoff+0xe4/0x110 vm_mmap+0x2c/0x40 elf_map+0x60/0x108 load_elf_binary+0x450/0x12c4 search_binary_handler+0x90/0x290 __do_execve_file.isra.13+0x6e4/0x858 sys_execve+0x3c/0x50 el0_svc_naked+0x30/0x34 This patch is fixing the problem by introducing a new check in function perf_addr_filter_match() to see if the filter's dentry is NULL. Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: miklos@szeredi.hu Cc: namhyung@kernel.org Cc: songliubraving@fb.com Fixes: 9511bce9fe8e ("perf/core: Fix bad use of igrab()") Link: http://lkml.kernel.org/r/1531782831-1186-1-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index cdb32cf8e33c..eec2d5fb676b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7337,6 +7337,10 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { + /* d_inode(NULL) won't be equal to any mapped user-space file */ + if (!filter->path.dentry) + return false; + if (d_inode(filter->path.dentry) != file_inode(file)) return false; -- cgit From 7d63fb3af87aa67aa7d24466e792f9d7c57d8e79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Jul 2018 16:22:22 -0700 Subject: swiotlb: clean up reporting This removes needless use of '%p', and refactors the printk calls to use pr_*() helpers instead. Signed-off-by: Kees Cook Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 904541055792..4f8a6dbf0b60 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -17,6 +17,8 @@ * 08/12/11 beckyb Add highmem support */ +#define pr_fmt(fmt) "software IO TLB: " fmt + #include #include #include @@ -162,20 +164,16 @@ static bool no_iotlb_memory; void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; if (no_iotlb_memory) { - pr_warn("software IO TLB: No low mem\n"); + pr_warn("No low mem\n"); return; } - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); - - printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", + pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", (unsigned long long)io_tlb_start, (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); + bytes >> 20); } /* @@ -275,7 +273,7 @@ swiotlb_init(int verbose) if (io_tlb_start) memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - pr_warn("Cannot allocate SWIOTLB buffer"); + pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -317,8 +315,8 @@ swiotlb_late_init_with_default_size(size_t default_size) return -ENOMEM; } if (order != get_order(bytes)) { - printk(KERN_WARNING "Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + pr_warn("only able to allocate %ld MB\n", + (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; } rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); -- cgit From 1863c387259b629e4ebfb255495f67cd06aa229b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 24 Jul 2018 19:13:31 -0400 Subject: tracing: Fix double free of event_trigger_data Running the following: # cd /sys/kernel/debug/tracing # echo 500000 > buffer_size_kb [ Or some other number that takes up most of memory ] # echo snapshot > events/sched/sched_switch/trigger Triggers the following bug: ------------[ cut here ]------------ kernel BUG at mm/slub.c:296! invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI CPU: 6 PID: 6878 Comm: bash Not tainted 4.18.0-rc6-test+ #1066 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:kfree+0x16c/0x180 Code: 05 41 0f b6 72 51 5b 5d 41 5c 4c 89 d7 e9 ac b3 f8 ff 48 89 d9 48 89 da 41 b8 01 00 00 00 5b 5d 41 5c 4c 89 d6 e9 f4 f3 ff ff <0f> 0b 0f 0b 48 8b 3d d9 d8 f9 00 e9 c1 fe ff ff 0f 1f 40 00 0f 1f RSP: 0018:ffffb654436d3d88 EFLAGS: 00010246 RAX: ffff91a9d50f3d80 RBX: ffff91a9d50f3d80 RCX: ffff91a9d50f3d80 RDX: 00000000000006a4 RSI: ffff91a9de5a60e0 RDI: ffff91a9d9803500 RBP: ffffffff8d267c80 R08: 00000000000260e0 R09: ffffffff8c1a56be R10: fffff0d404543cc0 R11: 0000000000000389 R12: ffffffff8c1a56be R13: ffff91a9d9930e18 R14: ffff91a98c0c2890 R15: ffffffff8d267d00 FS: 00007f363ea64700(0000) GS:ffff91a9de580000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055c1cacc8e10 CR3: 00000000d9b46003 CR4: 00000000001606e0 Call Trace: event_trigger_callback+0xee/0x1d0 event_trigger_write+0xfc/0x1a0 __vfs_write+0x33/0x190 ? handle_mm_fault+0x115/0x230 ? _cond_resched+0x16/0x40 vfs_write+0xb0/0x190 ksys_write+0x52/0xc0 do_syscall_64+0x5a/0x160 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f363e16ab50 Code: 73 01 c3 48 8b 0d 38 83 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 79 db 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e e3 01 00 48 89 04 24 RSP: 002b:00007fff9a4c6378 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f363e16ab50 RDX: 0000000000000009 RSI: 000055c1cacc8e10 RDI: 0000000000000001 RBP: 000055c1cacc8e10 R08: 00007f363e435740 R09: 00007f363ea64700 R10: 0000000000000073 R11: 0000000000000246 R12: 0000000000000009 R13: 0000000000000001 R14: 00007f363e4345e0 R15: 00007f363e4303c0 Modules linked in: ip6table_filter ip6_tables snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device i915 snd_pcm snd_timer i2c_i801 snd soundcore i2c_algo_bit drm_kms_helper 86_pkg_temp_thermal video kvm_intel kvm irqbypass wmi e1000e ---[ end trace d301afa879ddfa25 ]--- The cause is because the register_snapshot_trigger() call failed to allocate the snapshot buffer, and then called unregister_trigger() which freed the data that was passed to it. Then on return to the function that called register_snapshot_trigger(), as it sees it failed to register, it frees the trigger_data again and causes a double free. By calling event_trigger_init() on the trigger_data (which only ups the reference counter for it), and then event_trigger_free() afterward, the trigger_data would not get freed by the registering trigger function as it would only up and lower the ref count for it. If the register trigger function fails, then the event_trigger_free() called after it will free the trigger data normally. Link: http://lkml.kernel.org/r/20180724191331.738eb819@gandalf.local.home Cc: stable@vger.kerne.org Fixes: 93e31ffbf417 ("tracing: Add 'snapshot' event trigger command") Reported-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d18249683682..d18ec0e58be2 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -679,6 +679,8 @@ event_trigger_callback(struct event_command *cmd_ops, goto out_free; out_reg: + /* Up the trigger_data count to make sure reg doesn't free it on failure */ + event_trigger_init(trigger_ops, trigger_data); ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); /* * The above returns on success the # of functions enabled, @@ -686,11 +688,13 @@ event_trigger_callback(struct event_command *cmd_ops, * Consider no functions a failure too. */ if (!ret) { + cmd_ops->unreg(glob, trigger_ops, trigger_data, file); ret = -ENOENT; - goto out_free; - } else if (ret < 0) - goto out_free; - ret = 0; + } else if (ret > 0) + ret = 0; + + /* Down the counter of trigger_data or free it if not used anymore */ + event_trigger_free(trigger_ops, trigger_data); out: return ret; -- cgit From 73c8d8945505acdcbae137c2e00a1232e0be709f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jul 2018 01:28:15 +0900 Subject: ring_buffer: tracing: Inherit the tracing setting to next ring buffer Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar Cc: Shuah Khan Cc: Tom Zanussi Cc: Hiraku Toyooka Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 16 ++++++++++++++++ kernel/trace/trace.c | 6 ++++++ 3 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b72ebdff0b77..003d09ab308d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -165,6 +165,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); int ring_buffer_record_is_on(struct ring_buffer *buffer); +int ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6a46af21765c..0b0b688ea166 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3226,6 +3226,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer) return !atomic_read(&buffer->record_disabled); } +/** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87cf25171fb8..823687997b01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); + /* Inherit the recordable setting from trace_buffer */ + if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); -- cgit From 57ea2a34adf40f3a6e88409aafcf803b8945619a Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 25 Jul 2018 16:20:38 +0200 Subject: tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure If enable_trace_kprobe fails to enable the probe in enable_k(ret)probe it returns an error, but does not unset the tp flags it set previously. This results in a probe being considered enabled and failures like being unable to remove the probe through kprobe_events file since probes_open() expects every probe to be disabled. Link: http://lkml.kernel.org/r/20180725102826.8300-1-asavkov@redhat.com Link: http://lkml.kernel.org/r/20180725142038.4765-1-asavkov@redhat.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 41a7dd420c57 ("tracing/kprobes: Support ftrace_event_file base multibuffer") Acked-by: Masami Hiramatsu Reviewed-by: Josh Poimboeuf Signed-off-by: Artem Savkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 21f718472942..27ace4513c43 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -400,11 +400,10 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { + struct event_file_link *link; int ret = 0; if (file) { - struct event_file_link *link; - link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { ret = -ENOMEM; @@ -424,6 +423,16 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) else ret = enable_kprobe(&tk->rp.kp); } + + if (ret) { + if (file) { + list_del_rcu(&link->list); + kfree(link); + tk->tp.flags &= ~TP_FLAG_TRACE; + } else { + tk->tp.flags &= ~TP_FLAG_PROFILE; + } + } out: return ret; } -- cgit From 15cc78644d0075e76d59476a4467e7143860f660 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 25 Jul 2018 16:02:06 -0400 Subject: tracing: Fix possible double free in event_enable_trigger_func() There was a case that triggered a double free in event_trigger_callback() due to the called reg() function freeing the trigger_data and then it getting freed again by the error return by the caller. The solution there was to up the trigger_data ref count. Code inspection found that event_enable_trigger_func() has the same issue, but is not as easy to trigger (requires harder to trigger failures). It needs to be solved slightly different as it needs more to clean up when the reg() function fails. Link: http://lkml.kernel.org/r/20180725124008.7008e586@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 7862ad1846e99 ("tracing: Add 'enable_event' and 'disable_event' event trigger commands") Reivewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_trigger.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d18ec0e58be2..5dea177cef53 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1420,6 +1420,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out; } + /* Up the trigger_data count to make sure nothing frees it on failure */ + event_trigger_init(trigger_ops, trigger_data); + if (trigger) { number = strsep(&trigger, ":"); @@ -1470,6 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1480,7 +1484,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); - kfree(trigger_data); + event_trigger_free(trigger_ops, trigger_data); kfree(enable_data); goto out; } -- cgit From 2519c1bbe38d7acacc9aacba303ca6f97482ed53 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 25 Jul 2018 22:28:56 -0400 Subject: tracing: Quiet gcc warning about maybe unused link variable Commit 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure") added an if statement that depends on another if statement that gcc doesn't see will initialize the "link" variable and gives the warning: "warning: 'link' may be used uninitialized in this function" It is really a false positive, but to quiet the warning, and also to make sure that it never actually is used uninitialized, initialize the "link" variable to NULL and add an if (!WARN_ON_ONCE(!link)) where the compiler thinks it could be used uninitialized. Cc: stable@vger.kernel.org Fixes: 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure") Reported-by: kbuild test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 27ace4513c43..6b71860f3998 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -400,7 +400,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link; + struct event_file_link *link = NULL; int ret = 0; if (file) { @@ -426,7 +426,9 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) if (ret) { if (file) { - list_del_rcu(&link->list); + /* Notice the if is true on not WARN() */ + if (!WARN_ON_ONCE(!link)) + list_del_rcu(&link->list); kfree(link); tk->tp.flags &= ~TP_FLAG_TRACE; } else { -- cgit From 6f4ceee9305dc3fe74099159b460f4b56b506f1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 24 Jul 2018 14:26:04 -0400 Subject: cpu/hotplug: Add a cpus_read_trylock() function There are use cases where it can be useful to have a cpus_read_trylock() function to work around circular lock dependency problem involving the cpu_hotplug_lock. Signed-off-by: Waiman Long Signed-off-by: Rafael J. Wysocki --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a97a63eef59f..e850bfea3e84 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -103,6 +103,7 @@ extern void cpus_write_lock(void); extern void cpus_write_unlock(void); extern void cpus_read_lock(void); extern void cpus_read_unlock(void); +extern int cpus_read_trylock(void); extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); @@ -115,6 +116,7 @@ static inline void cpus_write_lock(void) { } static inline void cpus_write_unlock(void) { } static inline void cpus_read_lock(void) { } static inline void cpus_read_unlock(void) { } +static inline int cpus_read_trylock(void) { return true; } static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..307486baa477 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -290,6 +290,12 @@ void cpus_read_lock(void) } EXPORT_SYMBOL_GPL(cpus_read_lock); +int cpus_read_trylock(void) +{ + return percpu_down_read_trylock(&cpu_hotplug_lock); +} +EXPORT_SYMBOL_GPL(cpus_read_trylock); + void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); -- cgit From 3e536e222f2930534c252c1cc7ae799c725c5ff9 Mon Sep 17 00:00:00 2001 From: Snild Dolkow Date: Thu, 26 Jul 2018 09:15:39 +0200 Subject: kthread, tracing: Don't expose half-written comm when creating kthreads There is a window for racing when printing directly to task->comm, allowing other threads to see a non-terminated string. The vsnprintf function fills the buffer, counts the truncated chars, then finally writes the \0 at the end. creator other vsnprintf: fill (not terminated) count the rest trace_sched_waking(p): ... memcpy(comm, p->comm, TASK_COMM_LEN) write \0 The consequences depend on how 'other' uses the string. In our case, it was copied into the tracing system's saved cmdlines, a buffer of adjacent TASK_COMM_LEN-byte buffers (note the 'n' where 0 should be): crash-arm64> x/1024s savedcmd->saved_cmdlines | grep 'evenk' 0xffffffd5b3818640: "irq/497-pwr_evenkworker/u16:12" ...and a strcpy out of there would cause stack corruption: [224761.522292] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffff9bf9783c78 crash-arm64> kbt | grep 'comm\|trace_print_context' #6 0xffffff9bf9783c78 in trace_print_context+0x18c(+396) comm (char [16]) = "irq/497-pwr_even" crash-arm64> rd 0xffffffd4d0e17d14 8 ffffffd4d0e17d14: 2f71726900000000 5f7277702d373934 ....irq/497-pwr_ ffffffd4d0e17d24: 726f776b6e657665 3a3631752f72656b evenkworker/u16: ffffffd4d0e17d34: f9780248ff003231 cede60e0ffffff9b 12..H.x......`.. ffffffd4d0e17d44: cede60c8ffffffd4 00000fffffffffd4 .....`.......... The workaround in e09e28671 (use strlcpy in __trace_find_cmdline) was likely needed because of this same bug. Solved by vsnprintf:ing to a local buffer, then using set_task_comm(). This way, there won't be a window where comm is not terminated. Link: http://lkml.kernel.org/r/20180726071539.188015-1-snild@sony.com Cc: stable@vger.kernel.org Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure") Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Snild Dolkow Signed-off-by: Steven Rostedt (VMware) --- kernel/kthread.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 750cb8082694..486dedbd9af5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -325,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; - vsnprintf(task->comm, sizeof(task->comm), namefmt, args); + /* + * task is already visible to other tasks, so updating + * COMM must be protected. + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. -- cgit From 2b27ece6c50c7f0a1db372786731be1a17c5b606 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 28 Jun 2018 11:21:45 -0700 Subject: tracing/irqsoff: Split reset into separate functions Split reset functions into seperate functions in preparation of future patches that need to do tracer specific reset. Link: http://lkml.kernel.org/r/20180628182149.226164-4-joel@joelfernandes.org Reviewed-by: Namhyung Kim Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_irqsoff.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03ecb4465ee4..f8daa754cce2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -634,7 +634,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) return 0; } -static void irqsoff_tracer_reset(struct trace_array *tr) +static void __irqsoff_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; @@ -665,6 +665,12 @@ static int irqsoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -697,11 +703,16 @@ static int preemptoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } +static void preemptoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer preemptoff_tracer __read_mostly = { .name = "preemptoff", .init = preemptoff_tracer_init, - .reset = irqsoff_tracer_reset, + .reset = preemptoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = true, @@ -731,11 +742,16 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } +static void preemptirqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer preemptirqsoff_tracer __read_mostly = { .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, - .reset = irqsoff_tracer_reset, + .reset = preemptirqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = true, -- cgit From f96e8577da1026c344e49c75111303888d225389 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 12 Jul 2018 14:36:11 -0700 Subject: lib: Add module for testing preemptoff/irqsoff latency tracers Here we introduce a test module for introducing a long preempt or irq disable delay in the kernel which the preemptoff or irqsoff tracers can detect. This module is to be used only for test purposes and is default disabled. Following is the expected output (only briefly shown) that can be parsed to verify that the tracers are working correctly. We will use this from the kselftests in future patches. For the preemptoff tracer: echo preemptoff > /d/tracing/current_tracer sleep 1 insmod ./preemptirq_delay_test.ko test_mode=preempt delay=500000 sleep 1 bash-4.3# cat /d/tracing/trace preempt -1066 2...2 0us@: preemptirq_delay_run <-preemptirq_delay_run preempt -1066 2...2 500002us : preemptirq_delay_run <-preemptirq_delay_run preempt -1066 2...2 500004us : tracer_preempt_on <-preemptirq_delay_run preempt -1066 2...2 500012us : => kthread => ret_from_fork For the irqsoff tracer: echo irqsoff > /d/tracing/current_tracer sleep 1 insmod ./preemptirq_delay_test.ko test_mode=irq delay=500000 sleep 1 bash-4.3# cat /d/tracing/trace irq dis -1069 1d..1 0us@: preemptirq_delay_run irq dis -1069 1d..1 500001us : preemptirq_delay_run irq dis -1069 1d..1 500002us : tracer_hardirqs_on <-preemptirq_delay_run irq dis -1069 1d..1 500005us : => ret_from_fork Link: http://lkml.kernel.org/r/20180712213611.GA8743@joelaf.mtv.corp.google.com Cc: Boqun Feng Cc: Byungchul Park Cc: Ingo Molnar Cc: Julia Cartwright Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Namhyung Kim Cc: Paul McKenney Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Glexiner Cc: Todd Kjos Cc: Tom Zanussi Cc: Andy Shevchenko Reviewed-by: Andy Shevchenko [ Erick is a co-developer of this commit ] Signed-off-by: Erick Reyes Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 15 ++++++++ kernel/trace/Makefile | 1 + kernel/trace/preemptirq_delay_test.c | 72 ++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) create mode 100644 kernel/trace/preemptirq_delay_test.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dcc0166d1997..e15fadbe5dfb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -687,6 +687,21 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config PREEMPTIRQ_DELAY_TEST + tristate "Preempt / IRQ disable delay thread to test latency tracers" + depends on m + help + Select this option to build a test module that can help test latency + tracers by executing a preempt or irq disable section with a user + configurable delay. The module busy waits for the duration of the + critical section. + + For example, the following invocation forces a one-time irq-disabled + critical section for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500000 + + If unsure, say N + config TRACE_EVAL_MAP_FILE bool "Show eval mappings for trace events" depends on TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e2538c7638d4..31c6b524c260 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o +obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c new file mode 100644 index 000000000000..c97a026c0720 --- /dev/null +++ b/kernel/trace/preemptirq_delay_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Preempt / IRQ disable delay thread to test latency tracers + * + * Copyright (C) 2018 Joel Fernandes (Google) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static ulong delay = 100; +static char test_mode[10] = "irq"; + +module_param_named(delay, delay, ulong, S_IRUGO); +module_param_string(test_mode, test_mode, 10, S_IRUGO); +MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); + +static void busy_wait(ulong time) +{ + ktime_t start, end; + start = ktime_get(); + do { + end = ktime_get(); + if (kthread_should_stop()) + break; + } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000)); +} + +int preemptirq_delay_run(void *data) +{ + unsigned long flags; + + if (!strcmp(test_mode, "irq")) { + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); + } else if (!strcmp(test_mode, "preempt")) { + preempt_disable(); + busy_wait(delay); + preempt_enable(); + } + + return 0; +} + +static int __init preemptirq_delay_init(void) +{ + char task_name[50]; + struct task_struct *test_task; + + snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + + test_task = kthread_run(preemptirq_delay_run, NULL, task_name); + return PTR_ERR_OR_ZERO(test_task); +} + +static void __exit preemptirq_delay_exit(void) +{ + return; +} + +module_init(preemptirq_delay_init) +module_exit(preemptirq_delay_exit) +MODULE_LICENSE("GPL v2"); -- cgit From f6b7425cfb92cbf0d04de816bec1bd8cb9a79d0f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 24 Jul 2018 18:55:58 -0400 Subject: tracing: Make unregister_trigger() static Nothing uses unregister_trigger() outside of trace_events_trigger.c file, thus it should be static. Not sure why this was ever converted, because its counter part, register_trigger(), was always static. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 --- kernel/trace/trace_events_trigger.c | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f8f86231ad90..3f9d9acc69e6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1533,9 +1533,6 @@ extern int event_trigger_init(struct event_trigger_ops *ops, extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); -extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file); extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 5dea177cef53..58d21fd52932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -579,9 +579,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; -- cgit From 7b144b6c795a380beae6f7b40dcfb21014c4afb8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Jul 2018 21:44:04 +0900 Subject: tracing: Remove orphaned function using_ftrace_ops_list_func(). Remove using_ftrace_ops_list_func() since it is no longer used. Using ftrace_ops_list_func() has been introduced by commit 7eea4fce0246 ("tracing/stack_trace: Skip 4 instead of 3 when using ftrace_ops_list_func") as a helper function, but its caller has been removed by commit 72ac426a5bb0 ("tracing: Clean up stack tracing and fix fentry updates"). So it is not called anymore. Link: http://lkml.kernel.org/r/153260904427.12474.9952096317439329851.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 ----- kernel/trace/trace.h | 1 - 2 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index caf9cbf35816..b8b3324ca1c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -313,11 +313,6 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } -int using_ftrace_ops_list_func(void) -{ - return ftrace_trace_function == ftrace_ops_list_func; -} - static void add_ftrace_ops(struct ftrace_ops __rcu **list, struct ftrace_ops *ops) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3f9d9acc69e6..e1c8a1d6f240 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -937,7 +937,6 @@ void ftrace_destroy_function_files(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); -int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); -- cgit From 72809cbf6748830ae4a59a45bcb2367a6c24d74d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Jul 2018 21:44:32 +0900 Subject: tracing: Remove orphaned function ftrace_nr_registered_ops() Remove ftrace_nr_registered_ops() because it is no longer used. ftrace_nr_registered_ops() has been introduced by commit ea701f11da44 ("ftrace: Add selftest to test function trace recursion protection"), but its caller has been removed by commit 05cbbf643b8e ("tracing: Fix selftest function recursion accounting"). So it is not called anymore. Link: http://lkml.kernel.org/r/153260907227.12474.5234899025934963683.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ------ kernel/trace/ftrace.c | 24 ------------------------ 2 files changed, 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ebb77674be90..63af5eb0ff46 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -234,10 +234,6 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, */ #define register_ftrace_function(ops) ({ 0; }) #define unregister_ftrace_function(ops) ({ 0; }) -static inline int ftrace_nr_registered_ops(void) -{ - return 0; -} static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } @@ -328,8 +324,6 @@ struct seq_file; extern int ftrace_text_reserved(const void *start, const void *end); -extern int ftrace_nr_registered_ops(void); - struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr); bool is_ftrace_trampoline(unsigned long addr); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b8b3324ca1c8..0d380a98a880 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -157,30 +157,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -/** - * ftrace_nr_registered_ops - return number of ops registered - * - * Returns the number of ftrace_ops registered and tracing functions - */ -int ftrace_nr_registered_ops(void) -{ - struct ftrace_ops *ops; - int cnt = 0; - - mutex_lock(&ftrace_lock); - - for (ops = rcu_dereference_protected(ftrace_ops_list, - lockdep_is_held(&ftrace_lock)); - ops != &ftrace_list_end; - ops = rcu_dereference_protected(ops->next, - lockdep_is_held(&ftrace_lock))) - cnt++; - - mutex_unlock(&ftrace_lock); - - return cnt; -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { -- cgit From 1fee4f77523a3e5a44fa33c9aa76de8c6c42a632 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Jul 2018 21:43:36 +0900 Subject: doc: tracing: Fix a typo of trace_stat The name of the directory for per-cpu function statistics is trace_stat, not trace_stats. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Jonathan Corbet --- Documentation/trace/ftrace.rst | 4 ++-- kernel/trace/Kconfig | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index a20d34955333..7ea16a0ceffc 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -329,9 +329,9 @@ of ftrace. Here is a list of some of the key files: track of the time spent in those functions. The histogram content can be displayed in the files: - trace_stats/function ( function0, function1, etc). + trace_stat/function ( function0, function1, etc). - trace_stats: + trace_stat: A directory that holds different tracing stats. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2bd4a9181a0f..9a27f146fa1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -521,7 +521,7 @@ config FUNCTION_PROFILER in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A "functions" file is created in - the trace_stats directory; this file shows the list of functions that + the trace_stat directory; this file shows the list of functions that have been hit and their counters. If in doubt, say N. -- cgit From 5f300e8004cb80182a24c0fa488218a4a43e6aac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 26 Jul 2018 09:57:59 -0700 Subject: bpf: btf: Use exact btf value_size match in map_check_btf() The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects '> map->value_size' to ensure map_seq_show_elem() will not access things beyond an array element. Yonghong suggested that using '!=' is a more correct check. The 8 bytes round_up on value_size is stored in array->elem_size. Hence, using '!=' on map->value_size is a proper check. This patch also adds new tests to check the btf array key type and value type. Two of these new tests verify the btf's value_size (the change in this patch). It also fixes two existing tests that wrongly encoded a btf's type size (pprint_test) and the value_type_id (in one of the raw_tests[]). However, that do not affect these two BTF verification tests before or after this test changes. These two tests mainly failed at array creation time after this patch. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Suggested-by: Yonghong Song Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 2 +- tools/testing/selftests/bpf/test_btf.c | 86 +++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f5f642..2aa55d030c77 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (!value_type || value_size != map->value_size) return -EINVAL; return 0; diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 402c0f7cc418..ffdd27737c9e 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -507,7 +507,7 @@ static struct btf_raw_test raw_tests[] = { .key_size = sizeof(int), .value_size = sizeof(void *) * 4, .key_type_id = 1, - .value_type_id = 4, + .value_type_id = 5, .max_entries = 4, }, @@ -1292,6 +1292,88 @@ static struct btf_raw_test raw_tests[] = { .err_str = "type != 0", }, +{ + .descr = "arraymap invalid btf key (a bit field)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* 32 bit int with 32 bit offset */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 32, 32, 8), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 2, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf key (!= 32 bits)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + /* 16 bit int with 0 bit offset */ /* [2] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 16, 2), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 2, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf value (too small)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + /* btf_value_size < map->value_size */ + .value_size = sizeof(__u64), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + +{ + .descr = "arraymap invalid btf value (too big)", + .raw_types = { + /* int */ /* [1] */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), + BTF_END_RAW, + }, + .str_sec = "", + .str_sec_size = sizeof(""), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "array_map_check_btf", + .key_size = sizeof(int), + /* btf_value_size > map->value_size */ + .value_size = sizeof(__u16), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 4, + .map_create_err = true, +}, + }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char *start, const char *end) @@ -2051,7 +2133,7 @@ static struct btf_raw_test pprint_test = { BTF_ENUM_ENC(NAME_TBD, 2), BTF_ENUM_ENC(NAME_TBD, 3), /* struct pprint_mapv */ /* [16] */ - BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 8), 28), + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 8), 32), BTF_MEMBER_ENC(NAME_TBD, 11, 0), /* uint32_t ui32 */ BTF_MEMBER_ENC(NAME_TBD, 10, 32), /* uint16_t ui16 */ BTF_MEMBER_ENC(NAME_TBD, 12, 64), /* int32_t si32 */ -- cgit From 15d36fecd0bdc7510b70a0e5ec6671140b3fce0c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 26 Jul 2018 16:37:15 -0700 Subject: mm: disallow mappings that conflict for devm_memremap_pages() When pmem namespaces created are smaller than section size, this can cause an issue during removal and gpf was observed: general protection fault: 0000 1 SMP PTI CPU: 36 PID: 3941 Comm: ndctl Tainted: G W 4.14.28-1.el7uek.x86_64 #2 task: ffff88acda150000 task.stack: ffffc900233a4000 RIP: 0010:__put_page+0x56/0x79 Call Trace: devm_memremap_pages_release+0x155/0x23a release_nodes+0x21e/0x260 devres_release_all+0x3c/0x48 device_release_driver_internal+0x15c/0x207 device_release_driver+0x12/0x14 unbind_store+0xba/0xd8 drv_attr_store+0x27/0x31 sysfs_kf_write+0x3f/0x46 kernfs_fop_write+0x10f/0x18b __vfs_write+0x3a/0x16d vfs_write+0xb2/0x1a1 SyS_write+0x55/0xb9 do_syscall_64+0x79/0x1ae entry_SYSCALL_64_after_hwframe+0x3d/0x0 Add code to check whether we have a mapping already in the same section and prevent additional mappings from being created if that is the case. Link: http://lkml.kernel.org/r/152909478401.50143.312364396244072931.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Cc: Dan Williams Cc: Robert Elliott Cc: Jeff Moyer Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 5857267a4af5..a734b1747466 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -176,10 +176,27 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; + align_end = align_start + align_size - 1; + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + is_ram = region_intersects(align_start, align_size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); @@ -199,7 +216,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) mutex_lock(&pgmap_lock); error = 0; - align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { error = __radix_tree_insert(&pgmap_radix, -- cgit From 31c5bda3a656089f01963d290a40ccda181f816e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Jul 2018 16:37:22 -0700 Subject: mm: fix exports that inadvertently make put_page() EXPORT_SYMBOL_GPL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e76384884344 ("mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS") added two EXPORT_SYMBOL_GPL() symbols, but these symbols are required by the inlined put_page(), thus accidentally making put_page() a GPL export only. This breaks OpenAFS (at least). Mark them EXPORT_SYMBOL() instead. Link: http://lkml.kernel.org/r/153128611970.2928.11310692420711601254.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: e76384884344 ("mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS") Signed-off-by: Dan Williams Reported-by: Joe Gorse Reported-by: John Hubbard Tested-by: Joe Gorse Tested-by: John Hubbard Cc: Jérôme Glisse Cc: Mark Vitale Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a734b1747466..38283363da06 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -321,7 +321,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL_GPL(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); static atomic_t devmap_enable; /* @@ -362,5 +362,5 @@ void __put_devmap_managed_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ -- cgit From 027232da7c7c1c7f04383f93bd798e475dde5285 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 26 Jul 2018 16:37:25 -0700 Subject: mm: introduce vma_init() Not all VMAs allocated with vm_area_alloc(). Some of them allocated on stack or in data segment. The new helper can be use to initialize VMA properly regardless where it was allocated. Link: http://lkml.kernel.org/r/20180724121139.62570-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Reviewed-by: Andrew Morton Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ kernel/fork.c | 6 ++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3a3842316b8..31540f166987 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -452,6 +452,12 @@ struct vm_operations_struct { unsigned long addr); }; +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); +} + struct mmu_gather; struct inode; diff --git a/kernel/fork.c b/kernel/fork.c index a191c05e757d..1b27babc4c78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -312,10 +312,8 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (vma) { - vma->vm_mm = mm; - INIT_LIST_HEAD(&vma->anon_vma_chain); - } + if (vma) + vma_init(vma, mm); return vma; } -- cgit From 87107a25a2f857693156f075da6e2a1438f4b4a0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 26 Jul 2018 12:07:32 -0400 Subject: tracing/kprobes: Simplify the logic of enable_trace_kprobe() The function enable_trace_kprobe() performs slightly differently if the file parameter is passed in as NULL on non-NULL. Instead of checking file twice, move the code between the two tests into a static inline helper function to make the code easier to follow. Link: http://lkml.kernel.org/r/20180725224728.7b1d5db2@vmware.local.home Link: http://lkml.kernel.org/r/20180726121152.4dd54330@gandalf.local.home Reviewed-by: Josh Poimboeuf Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6b71860f3998..0534eb8b7640 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -393,6 +393,20 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, return NULL; } +static inline int __enable_trace_kprobe(struct trace_kprobe *tk) +{ + int ret = 0; + + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); + else + ret = enable_kprobe(&tk->rp.kp); + } + + return ret; +} + /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. @@ -400,7 +414,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link = NULL; + struct event_file_link *link; int ret = 0; if (file) { @@ -414,26 +428,18 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) list_add_tail_rcu(&link->list, &tk->tp.files); tk->tp.flags |= TP_FLAG_TRACE; - } else - tk->tp.flags |= TP_FLAG_PROFILE; - - if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { - if (trace_kprobe_is_return(tk)) - ret = enable_kretprobe(&tk->rp); - else - ret = enable_kprobe(&tk->rp.kp); - } - - if (ret) { - if (file) { - /* Notice the if is true on not WARN() */ - if (!WARN_ON_ONCE(!link)) - list_del_rcu(&link->list); + ret = __enable_trace_kprobe(tk); + if (ret) { + list_del_rcu(&link->list); kfree(link); tk->tp.flags &= ~TP_FLAG_TRACE; - } else { - tk->tp.flags &= ~TP_FLAG_PROFILE; } + + } else { + tk->tp.flags |= TP_FLAG_PROFILE; + ret = __enable_trace_kprobe(tk); + if (ret) + tk->tp.flags &= ~TP_FLAG_PROFILE; } out: return ret; -- cgit From f07d141fe9430cdf9f8a65a87c4136bd83b8ab2e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 23 Jul 2018 23:16:07 +0100 Subject: dma-mapping: Generalise dma_32bit_limit flag Whilst the notion of an upstream DMA restriction is most commonly seen in PCI host bridges saddled with a 32-bit native interface, a more general version of the same issue can exist on complex SoCs where a bus or point-to-point interconnect link from a device's DMA master interface to another component along the path to memory (often an IOMMU) may carry fewer address bits than the interfaces at both ends nominally support. In order to properly deal with this, the first step is to expand the dma_32bit_limit flag into an arbitrary mask. To minimise the impact on existing code, we'll make sure to only consider this new mask valid if set. That makes sense anyway, since a mask of zero would represent DMA not being wired up at all, and that would be better handled by not providing valid ops in the first place. Signed-off-by: Robin Murphy Acked-by: Ard Biesheuvel Signed-off-by: Christoph Hellwig --- arch/x86/kernel/pci-dma.c | 2 +- include/linux/device.h | 6 +++--- kernel/dma/direct.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ab5d9dd668d2..80f9fe8d27d0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -175,7 +175,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.dma_32bit_limit = true; + pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); return 0; } diff --git a/include/linux/device.h b/include/linux/device.h index 055a69dbcd18..6d3b000be57e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -886,6 +886,8 @@ struct dev_links_info { * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. + * @bus_dma_mask: Mask of an upstream bridge or bus which imposes a smaller DMA + * limit than the device itself supports. * @dma_pfn_offset: offset of DMA memory range relatively of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. @@ -912,8 +914,6 @@ struct dev_links_info { * @offline: Set after successful invocation of bus type's .offline(). * @of_node_reused: Set if the device-tree node is shared with an ancestor * device. - * @dma_32bit_limit: bridge limited to 32bit DMA even if the device itself - * indicates support for a higher limit in the dma_mask field. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -967,6 +967,7 @@ struct device { not all hardware supports 64 bit addresses for consistent allocations such descriptors. */ + u64 bus_dma_mask; /* upstream dma_mask constraint */ unsigned long dma_pfn_offset; struct device_dma_parameters *dma_parms; @@ -1002,7 +1003,6 @@ struct device { bool offline_disabled:1; bool offline:1; bool of_node_reused:1; - bool dma_32bit_limit:1; }; static inline struct device *kobj_to_dev(struct kobject *kobj) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8be8106270c2..c2860c5a9e96 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -180,10 +180,10 @@ int dma_direct_supported(struct device *dev, u64 mask) return 0; #endif /* - * Various PCI/PCIe bridges have broken support for > 32bit DMA even - * if the device itself might support it. + * Upstream PCI/PCIe bridges or SoC interconnects may not carry + * as many DMA address bits as the device itself supports. */ - if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) + if (dev->bus_dma_mask && mask > dev->bus_dma_mask) return 0; return 1; } -- cgit From 518eeca05c08347f0a69966b4459cc2d152af959 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 13 Jul 2018 21:14:52 +0800 Subject: tracing: preemptirq_delay_run() can be static Automatically found by kbuild test robot. Fixes: ffdc73a3b2ad ("lib: Add module for testing preemptoff/irqsoff latency tracers") Signed-off-by: kbuild test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/preemptirq_delay_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index c97a026c0720..f704390db9fc 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,7 +34,7 @@ static void busy_wait(ulong time) } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000)); } -int preemptirq_delay_run(void *data) +static int preemptirq_delay_run(void *data) { unsigned long flags; -- cgit From 684ad537abff987886d63fb3c573eeca40d7f2db Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 25 Jul 2018 16:00:18 -0400 Subject: timekeeping: Prevent false warning when persistent clock is not available On arches with no persistent clock a message like this is printed during boot: [ 0.000000] Persistent clock returned invalid value The value is not invalid: Zero means that no persistent clock is available and the absence of persistent clock should be quietly accepted. Fixes: 3eca993740b8 ("timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset()") Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: sboyd@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/20180725200018.23722-1-pasha.tatashin@oracle.com --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 30d7f64ffc87..6183e7460138 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1536,7 +1536,7 @@ void __init timekeeping_init(void) if (timespec64_valid_strict(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; - } else { + } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } -- cgit From bd9f943e5d2a42d864f9692477a25034c9d47dcc Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 30 Jul 2018 09:52:52 -0400 Subject: sched/clock: Disable interrupts when calling generic_sched_clock_init() sched_clock_init() used be called early during boot when interrupts were still disabled. After the recent changes to utilize sched clock early the sched_clock_init() call happens when interrupts are already enabled, which triggers the following warning: WARNING: CPU: 0 PID: 0 at kernel/time/sched_clock.c:180 sched_clock_register+0x44/0x278 [] (warn_slowpath_null) from [] (sched_clock_register+0x44/0x278) [] (sched_clock_register) from [] (generic_sched_clock_init+0x28/0x88) [] (generic_sched_clock_init) from [] (sched_clock_init+0x54/0x74) [] (sched_clock_init) from [] (start_kernel+0x310/0x3e4) [] (start_kernel) from [<00000000>] ( (null)) Disable IRQs for the duration of generic_sched_clock_init(). Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Signed-off-by: Pavel Tatashin Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Link: https://lkml.kernel.org/r/20180730135252.24599-1-pasha.tatashin@oracle.com --- kernel/sched/clock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 811a39aca1ce..e3e3b979f9bd 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -452,7 +452,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); void __init sched_clock_init(void) { static_branch_inc(&sched_clock_running); + local_irq_disable(); generic_sched_clock_init(); + local_irq_enable(); } u64 sched_clock_cpu(int cpu) -- cgit From d018031f562b9c2eff038969ab1955a370c52d8f Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 24 Jul 2018 20:17:48 +0530 Subject: cpu/hotplug: Clarify CPU hotplug step name for timers After commit 249d4a9b3246 ("timers: Reinitialize per cpu bases on hotplug") i.e. the introduction of state CPUHP_TIMERS_PREPARE instead of CPUHP_TIMERS_DEAD the step name "timers:dead" is not longer accurate. Rename it to "timers:prepare". [ tglx: Massaged changelog ] Signed-off-by: Mukesh Ojha Signed-off-by: Thomas Gleixner Cc: gkohli@codeaurora.org Cc: neeraju@codeaurora.org Cc: Peter Zijlstra Cc: Lai Jiangshan Cc: Brendan Jackman Cc: Mathieu Malaterre Link: https://lkml.kernel.org/r/1532443668-26810-1-git-send-email-mojha@codeaurora.org --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..8e6606ac3d72 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1274,7 +1274,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_PREPARE] = { - .name = "timers:dead", + .name = "timers:prepare", .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, -- cgit From b305f7ed0f4f494ad6f3ef5667501535d5a8fa31 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 25 Jul 2018 10:26:19 +0800 Subject: audit: fix potential null dereference 'context->module.name' The variable 'context->module.name' may be null pointer when kmalloc return null, so it's better to check it before using to avoid null dereference. Another one more thing this patch does is using kstrdup instead of (kmalloc + strcpy), and signal a lost record via audit_log_lost. Cc: stable@vger.kernel.org # 4.11 Signed-off-by: Yi Wang Reviewed-by: Jiang Biao Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ceb1c4596c51..80d672a11088 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1279,8 +1279,12 @@ static void show_special(struct audit_context *context, int *call_panic) break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); - audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); + if (context->module.name) { + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + } else + audit_log_format(ab, "(null)"); + break; } audit_log_end(ab); @@ -2411,8 +2415,9 @@ void __audit_log_kern_module(char *name) { struct audit_context *context = audit_context(); - context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); - strcpy(context->module.name, name); + context->module.name = kstrdup(name, GFP_KERNEL); + if (!context->module.name) + audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } -- cgit From 45408c4f92506dbdfef1721f2613e1426de00894 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 30 Jul 2018 19:20:14 +0900 Subject: tracing: kprobes: Prohibit probing on notrace function Prohibit kprobe-events probing on notrace functions. Since probing on a notrace function can cause a recursive event call. In most cases those are just skipped, but in some cases it falls into an infinite recursive call. This protection can be disabled by the kconfig CONFIG_KPROBE_EVENTS_ON_NOTRACE=y, but it is highly recommended to keep it "n" for normal kernel builds. Note that this is only available if "kprobes on ftrace" has been implemented on the target arch and CONFIG_KPROBES_ON_FTRACE=y. Link: http://lkml.kernel.org/r/153294601436.32740.10557881188933661239.stgit@devbox Signed-off-by: Masami Hiramatsu Tested-by: Francis Deslauriers [ Slight grammar and spelling fixes ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 20 +++++++++++++++++++ kernel/trace/trace_kprobe.c | 47 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e15fadbe5dfb..4d4eb15cc7fd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -456,6 +456,26 @@ config KPROBE_EVENTS This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. +config KPROBE_EVENTS_ON_NOTRACE + bool "Do NOT protect notrace function from kprobe events" + depends on KPROBE_EVENTS + depends on KPROBES_ON_FTRACE + default n + help + This is only for the developers who want to debug ftrace itself + using kprobe events. + + If kprobes can use ftrace instead of breakpoint, ftrace related + functions are protected from kprobe-events to prevent an infinit + recursion or any unexpected execution path which leads to a kernel + crash. + + This option disables such protection and allows you to put kprobe + events on ftrace functions for debugging ftrace by itself. + Note that this might let you shoot yourself in the foot. + + If unsure, say N. + config UPROBE_EVENTS bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0534eb8b7640..25662a780fdf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -87,6 +87,21 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +static nokprobe_inline +unsigned long trace_kprobe_address(struct trace_kprobe *tk) +{ + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return addr; +} + bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; @@ -99,16 +114,8 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call) bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - unsigned long addr; - if (tk->symbol) { - addr = (unsigned long) - kallsyms_lookup_name(trace_kprobe_symbol(tk)); - addr += tk->rp.kp.offset; - } else { - addr = (unsigned long)tk->rp.kp.addr; - } - return within_error_injection_list(addr); + return within_error_injection_list(trace_kprobe_address(tk)); } static int register_kprobe_event(struct trace_kprobe *tk); @@ -504,6 +511,22 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) return ret; } +#if defined(CONFIG_KPROBES_ON_FTRACE) && \ + !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long offset, size, addr; + + addr = trace_kprobe_address(tk); + if (!kallsyms_lookup_size_offset(addr, &size, &offset)) + return true; /* Out of range. */ + + return !ftrace_location_range(addr - offset, addr - offset + size); +} +#else +#define within_notrace_func(tk) (false) +#endif + /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { @@ -512,6 +535,12 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (trace_probe_is_registered(&tk->tp)) return -EINVAL; + if (within_notrace_func(tk)) { + pr_warn("Could not probe notrace function %s\n", + trace_kprobe_symbol(tk)); + return -EINVAL; + } + for (i = 0; i < tk->tp.nr_args; i++) traceprobe_update_arg(&tk->tp.args[i]); -- cgit From d899926f552bbd2e132b339fe40c06b9a1152e1a Mon Sep 17 00:00:00 2001 From: Francis Deslauriers Date: Mon, 30 Jul 2018 19:20:42 +0900 Subject: selftest/ftrace: Move kprobe selftest function to separate compile unit Move selftest function to its own compile unit so it can be compiled with the ftrace cflags (CC_FLAGS_FTRACE) allowing it to be probed during the ftrace startup tests. Link: http://lkml.kernel.org/r/153294604271.32740.16490677128630177030.stgit@devbox Signed-off-by: Francis Deslauriers Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 5 +++++ kernel/trace/trace_kprobe.c | 12 +----------- kernel/trace/trace_kprobe_selftest.c | 10 ++++++++++ kernel/trace/trace_kprobe_selftest.h | 7 +++++++ 4 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 kernel/trace/trace_kprobe_selftest.c create mode 100644 kernel/trace/trace_kprobe_selftest.h (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 31c6b524c260..81902a79e049 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -13,6 +13,11 @@ obj-y += trace_selftest_dynamic.o endif endif +ifdef CONFIG_FTRACE_STARTUP_TEST +CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o +endif + # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 25662a780fdf..deeb03ae21e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -23,6 +23,7 @@ #include #include +#include "trace_kprobe_selftest.h" #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" @@ -1587,17 +1588,6 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST -/* - * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. 'noinline' makes sure that there - * isn't an inlined version used by the test method below - */ -static __used __init noinline int -kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) -{ - return a1 + a2 + a3 + a4 + a5 + a6; -} - static __init struct trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c new file mode 100644 index 000000000000..16548ee4c8c6 --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. + */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} diff --git a/kernel/trace/trace_kprobe_selftest.h b/kernel/trace/trace_kprobe_selftest.h new file mode 100644 index 000000000000..4e10ec41c013 --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. + */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6); -- cgit From 01f38497c6f6525f57eb445887b9ed1867dbd05c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 30 Jul 2018 15:24:21 -0700 Subject: lockdep: Use this_cpu_ptr instead of get_cpu_var stats get_cpu_var disables preemption which has the potential to call into the preemption disable trace points causing some complications. There's also no need to disable preemption in uses of get_lock_stats anyway since preempt is already disabled. So lets simplify the code. Link: http://lkml.kernel.org/r/20180730222423.196630-2-joel@joelfernandes.org Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/locking/lockdep.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa4d3138bf1..fbbb79d5cfa0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -248,12 +248,7 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(cpu_lock_stats); + return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes]; } static void lock_release_holdtime(struct held_lock *hlock) @@ -271,7 +266,6 @@ static void lock_release_holdtime(struct held_lock *hlock) lock_time_inc(&stats->read_holdtime, holdtime); else lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); } #else static inline void lock_release_holdtime(struct held_lock *hlock) @@ -4090,7 +4084,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); } static void @@ -4138,7 +4131,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) } if (lock->cpu != cpu) stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); lock->cpu = cpu; lock->ip = ip; -- cgit From e6753f23d961d601dbae50a2fc2a3975c9715b14 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 30 Jul 2018 15:24:22 -0700 Subject: tracepoint: Make rcuidle tracepoint callers use SRCU In recent tests with IRQ on/off tracepoints, a large performance overhead ~10% is noticed when running hackbench. This is root caused to calls to rcu_irq_enter_irqson and rcu_irq_exit_irqson from the tracepoint code. Following a long discussion on the list [1] about this, we concluded that srcu is a better alternative for use during rcu idle. Although it does involve extra barriers, its lighter than the sched-rcu version which has to do additional RCU calls to notify RCU idle about entry into RCU sections. In this patch, we change the underlying implementation of the trace_*_rcuidle API to use SRCU. This has shown to improve performance alot for the high frequency irq enable/disable tracepoints. Test: Tested idle and preempt/irq tracepoints. Here are some performance numbers: With a run of the following 30 times on a single core x86 Qemu instance with 1GB memory: hackbench -g 4 -f 2 -l 3000 Completion times in seconds. CONFIG_PROVE_LOCKING=y. No patches (without this series) Mean: 3.048 Median: 3.025 Std Dev: 0.064 With Lockdep using irq tracepoints with RCU implementation: Mean: 3.451 (-11.66 %) Median: 3.447 (-12.22%) Std Dev: 0.049 With Lockdep using irq tracepoints with SRCU implementation (this series): Mean: 3.020 (I would consider the improvement against the "without this series" case as just noise). Median: 3.013 Std Dev: 0.033 [1] https://patchwork.kernel.org/patch/10344297/ [remove rcu_read_lock_sched_notrace as its the equivalent of preempt_disable_notrace and is unnecessary to call in tracepoint code] Link: http://lkml.kernel.org/r/20180730222423.196630-3-joel@joelfernandes.org Cleaned-up-by: Peter Zijlstra Acked-by: Peter Zijlstra Reviewed-by: Mathieu Desnoyers Signed-off-by: Joel Fernandes (Google) [ Simplified WARN_ON_ONCE() ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 40 ++++++++++++++++++++++++++++++++-------- kernel/tracepoint.c | 16 +++++++++++++++- 2 files changed, 47 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 19a690b559ca..d9a084c72541 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -33,6 +34,8 @@ struct trace_eval_map { #define TRACEPOINT_DEFAULT_PRIO 10 +extern struct srcu_struct tracepoint_srcu; + extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -75,10 +78,16 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb) * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. */ +#ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { + synchronize_srcu(&tracepoint_srcu); synchronize_sched(); } +#else +static inline void tracepoint_synchronize_unregister(void) +{ } +#endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS extern int syscall_regfunc(void); @@ -129,18 +138,31 @@ extern void syscall_unregfunc(void); * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args, cond, rcucheck) \ +#define __DO_TRACE(tp, proto, args, cond, rcuidle) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ void *__data; \ + int __maybe_unused idx = 0; \ \ if (!(cond)) \ return; \ - if (rcucheck) \ - rcu_irq_enter_irqson(); \ - rcu_read_lock_sched_notrace(); \ - it_func_ptr = rcu_dereference_sched((tp)->funcs); \ + \ + /* srcu can't be used from NMI */ \ + WARN_ON_ONCE(rcuidle && in_nmi()); \ + \ + /* keep srcu and sched-rcu usage consistent */ \ + preempt_disable_notrace(); \ + \ + /* \ + * For rcuidle callers, use srcu since sched-rcu \ + * doesn't work from the idle path. \ + */ \ + if (rcuidle) \ + idx = srcu_read_lock_notrace(&tracepoint_srcu); \ + \ + it_func_ptr = rcu_dereference_raw((tp)->funcs); \ + \ if (it_func_ptr) { \ do { \ it_func = (it_func_ptr)->func; \ @@ -148,9 +170,11 @@ extern void syscall_unregfunc(void); ((void(*)(proto))(it_func))(args); \ } while ((++it_func_ptr)->func); \ } \ - rcu_read_unlock_sched_notrace(); \ - if (rcucheck) \ - rcu_irq_exit_irqson(); \ + \ + if (rcuidle) \ + srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ + \ + preempt_enable_notrace(); \ } while (0) #ifndef MODULE diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6dc6356c3327..955148d91b74 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -31,6 +31,9 @@ extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; +DEFINE_SRCU(tracepoint_srcu); +EXPORT_SYMBOL_GPL(tracepoint_srcu); + /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -67,16 +70,27 @@ static inline void *allocate_probes(int count) return p == NULL ? NULL : p->probes; } -static void rcu_free_old_probes(struct rcu_head *head) +static void srcu_free_old_probes(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } +static void rcu_free_old_probes(struct rcu_head *head) +{ + call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); +} + static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); + /* + * Tracepoint probes are protected by both sched RCU and SRCU, + * by calling the SRCU callback in the sched RCU callback we + * cover both cases. So let us chain the SRCU and sched RCU + * callbacks to wait for both grace periods. + */ call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } -- cgit From 9d2dcc8fc66087d7fd365e07cd4292adc873e568 Mon Sep 17 00:00:00 2001 From: Michael O'Farrell Date: Mon, 30 Jul 2018 13:14:34 -0700 Subject: arm64: perf: Add cap_user_time aarch64 It is useful to get the running time of a thread. Doing so in an efficient manner can be important for performance of user applications. Avoiding system calls in `clock_gettime` when handling CLOCK_THREAD_CPUTIME_ID is important. Other clocks are handled in the VDSO, but CLOCK_THREAD_CPUTIME_ID falls back on the system call. CLOCK_THREAD_CPUTIME_ID is not handled in the VDSO since it would have costs associated with maintaining updated user space accessible time offsets. These offsets have to be updated everytime the a thread is scheduled/descheduled. However, for programs regularly checking the running time of a thread, this is a performance improvement. This patch takes a middle ground, and adds support for cap_user_time an optional feature of the perf_event API. This way costs are only incurred when the perf_event api is enabled. This is done the same way as it is in x86. Ultimately this allows calculating the thread running time in userspace on aarch64 as follows (adapted from perf_event_open manpage): u32 seq, time_mult, time_shift; u64 running, count, time_offset, quot, rem, delta; struct perf_event_mmap_page *pc; pc = buf; // buf is the perf event mmaped page as documented in the API. if (pc->cap_usr_time) { do { seq = pc->lock; barrier(); running = pc->time_running; count = readCNTVCT_EL0(); // Read ARM hardware clock. time_offset = pc->time_offset; time_mult = pc->time_mult; time_shift = pc->time_shift; barrier(); } while (pc->lock != seq); quot = (count >> time_shift); rem = count & (((u64)1 << time_shift) - 1); delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); running += delta; // running now has the current nanosecond level thread time. } Summary of changes in the patch: For aarch64 systems, make arch_perf_update_userpage update the timing information stored in the perf_event page. Requiring the following calculations: - Calculate the appropriate time_mult, and time_shift factors to convert ticks to nano seconds for the current clock frequency. - Adjust the mult and shift factors to avoid shift factors of 32 bits. (possibly unnecessary) - The time_offset userspace should apply when doing calculations: negative the current sched time (now), because time_running and time_enabled fields of the perf_event page have just been updated. Toggle bits to appropriate values: - Enable cap_user_time Signed-off-by: Michael O'Farrell Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 30 ++++++++++++++++++++++++++++++ kernel/events/core.c | 4 ++-- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index dfff5ed5c625..8e38d5267f22 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -1278,3 +1279,32 @@ static int __init armv8_pmu_driver_init(void) return arm_pmu_acpi_probe(armv8_pmuv3_init); } device_initcall(armv8_pmu_driver_init) + +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) +{ + u32 freq; + u32 shift; + + /* + * Internal timekeeping for enabled/running/stopped times + * is always computed with the sched_clock. + */ + freq = arch_timer_get_rate(); + userpg->cap_user_time = 1; + + clocks_calc_mult_shift(&userpg->time_mult, &shift, freq, + NSEC_PER_SEC, 0); + /* + * time_shift is not expected to be greater than 31 due to + * the original published conversion algorithm shifting a + * 32-bit value (now specifies a 64-bit value) - refer + * perf_event_mmap_page documentation in perf_event.h. + */ + if (shift == 32) { + shift = 31; + userpg->time_mult >>= 1; + } + userpg->time_shift = (u16)shift; + userpg->time_offset = -now; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..1a16dcca0da2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5246,8 +5246,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg = rb->user_page; /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. */ preempt_disable(); ++userpg->lock; -- cgit From 56e6c104e4f151e19eb410004405ec52b4f8605a Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 31 Jul 2018 13:06:57 +0200 Subject: console: Replace #if 0 with atomic var 'ignore_console_lock_warning' The macro WARN_CONSOLE_UNLOCKED prints a warning when a thread enters the console's critical section without having acquired the console lock. The console lock can be ignored when debugging the console using printk, but this makes WARN_CONSOLE_UNLOCKED generate unnecessary warnings. The variable ignore_console_lock_warning temporarily disables WARN_CONSOLE_UNLOCKED. Developers interested in debugging the console's critical sections should increment it before entering the CS and decrement it after leaving the CS. Setting ignore_console_lock_warning is only for debugging. Regular operation should not manipulate it. Acknoledgements: This patch is based on an earlier version by Steven Rostedt. The use of atomic increment/decrement was suggested by Petr Mladek. Link: http://lkml.kernel.org/r/717e6337-e7a6-7a92-1c1b-8929a25696b5@suse.de Signed-off-by: Thomas Zimmermann Acked-by: Hans de Goede Acked-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Steven Rostedt (VMware) Cc: Andrew Morton [b.zolnierkie: use ] Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/console.h | 14 +++++++++----- kernel/printk/printk.c | 3 +++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index f59f3dbca65c..ec9bdb3d7bab 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -14,6 +14,7 @@ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 +#include #include struct vc_data; @@ -201,11 +202,14 @@ void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ -#if 1 -#define WARN_CONSOLE_UNLOCKED() WARN_ON(!is_console_locked() && !oops_in_progress) -#else -#define WARN_CONSOLE_UNLOCKED() -#endif +#define WARN_CONSOLE_UNLOCKED() \ + WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ + !is_console_locked() && !oops_in_progress) +/* + * Increment ignore_console_lock_warning if you need to quiet + * WARN_CONSOLE_UNLOCKED() for debugging purposes. + */ +extern atomic_t ignore_console_lock_warning; /* VESA Blanking Levels */ #define VESA_NO_BLANKING 0 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3f041e7cbfc9..7d32a86758cd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -66,6 +66,9 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); +EXPORT_SYMBOL(ignore_console_lock_warning); + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. -- cgit From c3bc8fd637a9623f5c507bd18f9677effbddf584 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 30 Jul 2018 15:24:23 -0700 Subject: tracing: Centralize preemptirq tracepoints and unify their usage This patch detaches the preemptirq tracepoints from the tracers and keeps it separate. Advantages: * Lockdep and irqsoff event can now run in parallel since they no longer have their own calls. * This unifies the usecase of adding hooks to an irqsoff and irqson event, and a preemptoff and preempton event. 3 users of the events exist: - Lockdep - irqsoff and preemptoff tracers - irqs and preempt trace events The unification cleans up several ifdefs and makes the code in preempt tracer and irqsoff tracers simpler. It gets rid of all the horrific ifdeferry around PROVE_LOCKING and makes configuration of the different users of the tracepoints more easy and understandable. It also gets rid of the time_* function calls from the lockdep hooks used to call into the preemptirq tracer which is not needed anymore. The negative delta in lines of code in this patch is quite large too. In the patch we introduce a new CONFIG option PREEMPTIRQ_TRACEPOINTS as a single point for registering probes onto the tracepoints. With this, the web of config options for preempt/irq toggle tracepoints and its users becomes: PREEMPT_TRACER PREEMPTIRQ_EVENTS IRQSOFF_TRACER PROVE_LOCKING | | \ | | \ (selects) / \ \ (selects) / TRACE_PREEMPT_TOGGLE ----> TRACE_IRQFLAGS \ / \ (depends on) / PREEMPTIRQ_TRACEPOINTS Other than the performance tests mentioned in the previous patch, I also ran the locking API test suite. I verified that all tests cases are passing. I also injected issues by not registering lockdep probes onto the tracepoints and I see failures to confirm that the probes are indeed working. This series + lockdep probes not registered (just to inject errors): [ 0.000000] hard-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] soft-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] sirq-safe-A => hirqs-on/12:FAILED|FAILED| ok | [ 0.000000] sirq-safe-A => hirqs-on/21:FAILED|FAILED| ok | [ 0.000000] hard-safe-A + irqs-on/12:FAILED|FAILED| ok | [ 0.000000] soft-safe-A + irqs-on/12:FAILED|FAILED| ok | [ 0.000000] hard-safe-A + irqs-on/21:FAILED|FAILED| ok | [ 0.000000] soft-safe-A + irqs-on/21:FAILED|FAILED| ok | [ 0.000000] hard-safe-A + unsafe-B #1/123: ok | ok | ok | [ 0.000000] soft-safe-A + unsafe-B #1/123: ok | ok | ok | With this series + lockdep probes registered, all locking tests pass: [ 0.000000] hard-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] soft-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] sirq-safe-A => hirqs-on/12: ok | ok | ok | [ 0.000000] sirq-safe-A => hirqs-on/21: ok | ok | ok | [ 0.000000] hard-safe-A + irqs-on/12: ok | ok | ok | [ 0.000000] soft-safe-A + irqs-on/12: ok | ok | ok | [ 0.000000] hard-safe-A + irqs-on/21: ok | ok | ok | [ 0.000000] soft-safe-A + irqs-on/21: ok | ok | ok | [ 0.000000] hard-safe-A + unsafe-B #1/123: ok | ok | ok | [ 0.000000] soft-safe-A + unsafe-B #1/123: ok | ok | ok | Link: http://lkml.kernel.org/r/20180730222423.196630-4-joel@joelfernandes.org Acked-by: Peter Zijlstra (Intel) Reviewed-by: Namhyung Kim Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 11 +- include/linux/irqflags.h | 11 +- include/linux/lockdep.h | 8 +- include/linux/preempt.h | 2 +- include/trace/events/preemptirq.h | 23 ++-- init/main.c | 5 +- kernel/locking/lockdep.c | 35 +++--- kernel/sched/core.c | 2 +- kernel/trace/Kconfig | 22 +++- kernel/trace/Makefile | 2 +- kernel/trace/trace_irqsoff.c | 231 ++++++++++---------------------------- kernel/trace/trace_preemptirq.c | 72 ++++++++++++ 12 files changed, 195 insertions(+), 229 deletions(-) create mode 100644 kernel/trace/trace_preemptirq.c (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63af5eb0ff46..a397907e8d72 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -701,16 +701,7 @@ static inline unsigned long get_lock_parent_ip(void) return CALLER_ADDR2; } -#ifdef CONFIG_IRQSOFF_TRACER - extern void time_hardirqs_on(unsigned long a0, unsigned long a1); - extern void time_hardirqs_off(unsigned long a0, unsigned long a1); -#else - static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { } - static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } -#endif - -#if defined(CONFIG_PREEMPT_TRACER) || \ - (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE extern void trace_preempt_on(unsigned long a0, unsigned long a1); extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 9700f00bbc04..50edb9cbbd26 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -15,9 +15,16 @@ #include #include -#ifdef CONFIG_TRACE_IRQFLAGS +/* Currently trace_softirqs_on/off is used only by lockdep */ +#ifdef CONFIG_PROVE_LOCKING extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); +#else +# define trace_softirqs_on(ip) do { } while (0) +# define trace_softirqs_off(ip) do { } while (0) +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) @@ -43,8 +50,6 @@ do { \ #else # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) -# define trace_softirqs_on(ip) do { } while (0) -# define trace_softirqs_off(ip) do { } while (0) # define trace_hardirq_context(p) 0 # define trace_softirq_context(p) 0 # define trace_hardirqs_enabled(p) 0 diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6fc77d4dbdcd..a8113357ceeb 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -266,7 +266,8 @@ struct held_lock { /* * Initialization, self-test and debugging-output methods: */ -extern void lockdep_info(void); +extern void lockdep_init(void); +extern void lockdep_init_early(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); @@ -406,7 +407,8 @@ static inline void lockdep_on(void) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) -# define lockdep_info() do { } while (0) +# define lockdep_init() do { } while (0) +# define lockdep_init_early() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) @@ -532,7 +534,7 @@ do { \ #endif /* CONFIG_LOCKDEP */ -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5bd3f151da78..c01813c3fbe9 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -150,7 +150,7 @@ */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) -#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 9c4eb33c5a1d..9a0d4ceeb166 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -1,4 +1,4 @@ -#ifdef CONFIG_PREEMPTIRQ_EVENTS +#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM preemptirq @@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(preemptirq_template, (void *)((unsigned long)(_stext) + __entry->parent_offs)) ); -#ifndef CONFIG_PROVE_LOCKING +#ifdef CONFIG_TRACE_IRQFLAGS DEFINE_EVENT(preemptirq_template, irq_disable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); @@ -40,9 +40,14 @@ DEFINE_EVENT(preemptirq_template, irq_disable, DEFINE_EVENT(preemptirq_template, irq_enable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); +#else +#define trace_irq_enable(...) +#define trace_irq_disable(...) +#define trace_irq_enable_rcuidle(...) +#define trace_irq_disable_rcuidle(...) #endif -#ifdef CONFIG_DEBUG_PREEMPT +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE DEFINE_EVENT(preemptirq_template, preempt_disable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); @@ -50,22 +55,22 @@ DEFINE_EVENT(preemptirq_template, preempt_disable, DEFINE_EVENT(preemptirq_template, preempt_enable, TP_PROTO(unsigned long ip, unsigned long parent_ip), TP_ARGS(ip, parent_ip)); +#else +#define trace_preempt_enable(...) +#define trace_preempt_disable(...) +#define trace_preempt_enable_rcuidle(...) +#define trace_preempt_disable_rcuidle(...) #endif #endif /* _TRACE_PREEMPTIRQ_H */ #include -#endif /* !CONFIG_PREEMPTIRQ_EVENTS */ - -#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || defined(CONFIG_PROVE_LOCKING) +#else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */ #define trace_irq_enable(...) #define trace_irq_disable(...) #define trace_irq_enable_rcuidle(...) #define trace_irq_disable_rcuidle(...) -#endif - -#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || !defined(CONFIG_DEBUG_PREEMPT) #define trace_preempt_enable(...) #define trace_preempt_disable(...) #define trace_preempt_enable_rcuidle(...) diff --git a/init/main.c b/init/main.c index 3b4ada11ed52..44fe43be84c1 100644 --- a/init/main.c +++ b/init/main.c @@ -648,6 +648,9 @@ asmlinkage __visible void __init start_kernel(void) profile_init(); call_function_init(); WARN(!irqs_disabled(), "Interrupts were enabled early\n"); + + lockdep_init_early(); + early_boot_irqs_disabled = false; local_irq_enable(); @@ -663,7 +666,7 @@ asmlinkage __visible void __init start_kernel(void) panic("Too many boot %s vars at `%s'", panic_later, panic_param); - lockdep_info(); + lockdep_init(); /* * Need to run this when irqs are enabled, because it wants diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index fbbb79d5cfa0..03bfaeb9f4e6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include "lockdep_internals.h" +#include #define CREATE_TRACE_POINTS #include @@ -2839,10 +2840,9 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -__visible void trace_hardirqs_on_caller(unsigned long ip) +static void lockdep_hardirqs_on(void *none, unsigned long ignore, + unsigned long ip) { - time_hardirqs_on(CALLER_ADDR0, ip); - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2881,23 +2881,15 @@ __visible void trace_hardirqs_on_caller(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_on(void) -{ - trace_hardirqs_on_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -__visible void trace_hardirqs_off_caller(unsigned long ip) +static void lockdep_hardirqs_off(void *none, unsigned long ignore, + unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_off(CALLER_ADDR0, ip); - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2919,13 +2911,6 @@ __visible void trace_hardirqs_off_caller(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); /* * Softirqs will be enabled: @@ -4330,7 +4315,15 @@ out_restore: raw_local_irq_restore(flags); } -void __init lockdep_info(void) +void __init lockdep_init_early(void) +{ +#ifdef CONFIG_PROVE_LOCKING + register_trace_prio_irq_disable(lockdep_hardirqs_off, NULL, INT_MAX); + register_trace_prio_irq_enable(lockdep_hardirqs_on, NULL, INT_MIN); +#endif +} + +void __init lockdep_init(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..5de1a4343424 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3189,7 +3189,7 @@ static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) + defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count * then we just disabled preemption. Start timing the latency. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4d4eb15cc7fd..036cec1fcd24 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -82,6 +82,15 @@ config RING_BUFFER_ALLOW_SWAP Allow the use of ring_buffer_swap_cpu. Adds a very slight overhead to tracing when enabled. +config PREEMPTIRQ_TRACEPOINTS + bool + depends on TRACE_PREEMPT_TOGGLE || TRACE_IRQFLAGS + select TRACING + default y + help + Create preempt/irq toggle tracepoints if needed, so that other parts + of the kernel can use them to generate or add hooks to them. + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -155,18 +164,20 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_PREEMPT_TOGGLE + bool + help + Enables hooks which will be called when preemption is first disabled, + and last enabled. config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS - depends on DEBUG_PREEMPT || !PROVE_LOCKING - depends on TRACING + select TRACE_PREEMPT_TOGGLE if PREEMPT + select GENERIC_TRACER default n help Enable tracing of disable and enable events for preemption and irqs. - For tracing preempt disable/enable events, DEBUG_PREEMPT must be - enabled. For tracing irq disable/enable events, PROVE_LOCKING must - be disabled. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" @@ -203,6 +214,7 @@ config PREEMPT_TRACER select RING_BUFFER_ALLOW_SWAP select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP + select TRACE_PREEMPT_TOGGLE help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 81902a79e049..98d53b39a8ee 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o -obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o +obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index f8daa754cce2..770cd30cda40 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,7 +16,6 @@ #include "trace.h" -#define CREATE_TRACE_POINTS #include #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) @@ -450,66 +449,6 @@ void stop_critical_timings(void) } EXPORT_SYMBOL_GPL(stop_critical_timings); -#ifdef CONFIG_IRQSOFF_TRACER -#ifdef CONFIG_PROVE_LOCKING -void time_hardirqs_on(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(a0, a1); -} - -void time_hardirqs_off(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(a0, a1); -} - -#else /* !CONFIG_PROVE_LOCKING */ - -/* - * We are only interested in hardirq on/off events: - */ -static inline void tracer_hardirqs_on(void) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} - -static inline void tracer_hardirqs_off(void) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} - -static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, caller_addr); -} - -static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, caller_addr); -} - -#endif /* CONFIG_PROVE_LOCKING */ -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - stop_critical_timing(a0, a1); -} - -static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - start_critical_timing(a0, a1); -} -#endif /* CONFIG_PREEMPT_TRACER */ - #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -659,15 +598,34 @@ static void irqsoff_tracer_stop(struct trace_array *tr) } #ifdef CONFIG_IRQSOFF_TRACER +/* + * We are only interested in hardirq on/off events: + */ +static void tracer_hardirqs_on(void *none, unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + stop_critical_timing(a0, a1); +} + +static void tracer_hardirqs_off(void *none, unsigned long a0, unsigned long a1) +{ + if (!preempt_trace() && irq_trace()) + start_critical_timing(a0, a1); +} + static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; + register_trace_irq_disable(tracer_hardirqs_off, NULL); + register_trace_irq_enable(tracer_hardirqs_on, NULL); return __irqsoff_tracer_init(tr); } static void irqsoff_tracer_reset(struct trace_array *tr) { + unregister_trace_irq_disable(tracer_hardirqs_off, NULL); + unregister_trace_irq_enable(tracer_hardirqs_on, NULL); __irqsoff_tracer_reset(tr); } @@ -690,21 +648,34 @@ static struct tracer irqsoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; -# define register_irqsoff(trace) register_tracer(&trace) -#else -# define register_irqsoff(trace) do { } while (0) -#endif +#endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER +static void tracer_preempt_on(void *none, unsigned long a0, unsigned long a1) +{ + if (preempt_trace() && !irq_trace()) + stop_critical_timing(a0, a1); +} + +static void tracer_preempt_off(void *none, unsigned long a0, unsigned long a1) +{ + if (preempt_trace() && !irq_trace()) + start_critical_timing(a0, a1); +} + static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; + register_trace_preempt_disable(tracer_preempt_off, NULL); + register_trace_preempt_enable(tracer_preempt_on, NULL); return __irqsoff_tracer_init(tr); } static void preemptoff_tracer_reset(struct trace_array *tr) { + unregister_trace_preempt_disable(tracer_preempt_off, NULL); + unregister_trace_preempt_enable(tracer_preempt_on, NULL); __irqsoff_tracer_reset(tr); } @@ -727,23 +698,29 @@ static struct tracer preemptoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; -# define register_preemptoff(trace) register_tracer(&trace) -#else -# define register_preemptoff(trace) do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_TRACER */ -#if defined(CONFIG_IRQSOFF_TRACER) && \ - defined(CONFIG_PREEMPT_TRACER) +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + register_trace_irq_disable(tracer_hardirqs_off, NULL); + register_trace_irq_enable(tracer_hardirqs_on, NULL); + register_trace_preempt_disable(tracer_preempt_off, NULL); + register_trace_preempt_enable(tracer_preempt_on, NULL); + return __irqsoff_tracer_init(tr); } static void preemptirqsoff_tracer_reset(struct trace_array *tr) { + unregister_trace_irq_disable(tracer_hardirqs_off, NULL); + unregister_trace_irq_enable(tracer_hardirqs_on, NULL); + unregister_trace_preempt_disable(tracer_preempt_off, NULL); + unregister_trace_preempt_enable(tracer_preempt_on, NULL); + __irqsoff_tracer_reset(tr); } @@ -766,115 +743,21 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; - -# define register_preemptirqsoff(trace) register_tracer(&trace) -#else -# define register_preemptirqsoff(trace) do { } while (0) #endif __init static int init_irqsoff_tracer(void) { - register_irqsoff(irqsoff_tracer); - register_preemptoff(preemptoff_tracer); - register_preemptirqsoff(preemptirqsoff_tracer); - - return 0; -} -core_initcall(init_irqsoff_tracer); -#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ - -#ifndef CONFIG_IRQSOFF_TRACER -static inline void tracer_hardirqs_on(void) { } -static inline void tracer_hardirqs_off(void) { } -static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } -static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#ifdef CONFIG_IRQSOFF_TRACER + register_tracer(&irqsoff_tracer); #endif - -#ifndef CONFIG_PREEMPT_TRACER -static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } -static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#ifdef CONFIG_PREEMPT_TRACER + register_tracer(&preemptoff_tracer); #endif - -#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) -/* Per-cpu variable to prevent redundant calls when IRQs already off */ -static DEFINE_PER_CPU(int, tracing_irq_cpu); - -void trace_hardirqs_on(void) -{ - if (!this_cpu_read(tracing_irq_cpu)) - return; - - trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - tracer_hardirqs_on(); - - this_cpu_write(tracing_irq_cpu, 0); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -void trace_hardirqs_off(void) -{ - if (this_cpu_read(tracing_irq_cpu)) - return; - - this_cpu_write(tracing_irq_cpu, 1); - - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - tracer_hardirqs_off(); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!this_cpu_read(tracing_irq_cpu)) - return; - - trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_on_caller(caller_addr); - - this_cpu_write(tracing_irq_cpu, 0); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) -{ - if (this_cpu_read(tracing_irq_cpu)) - return; - - this_cpu_write(tracing_irq_cpu, 1); - - trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_off_caller(caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) + register_tracer(&preemptirqsoff_tracer); #endif -#if defined(CONFIG_PREEMPT_TRACER) || \ - (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) -void trace_preempt_on(unsigned long a0, unsigned long a1) -{ - trace_preempt_enable_rcuidle(a0, a1); - tracer_preempt_on(a0, a1); -} - -void trace_preempt_off(unsigned long a0, unsigned long a1) -{ - trace_preempt_disable_rcuidle(a0, a1); - tracer_preempt_off(a0, a1); + return 0; } -#endif +core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c new file mode 100644 index 000000000000..e76b78bf258e --- /dev/null +++ b/kernel/trace/trace_preemptirq.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * preemptoff and irqoff tracepoints + * + * Copyright (C) Joel Fernandes (Google) + */ + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_TRACE_IRQFLAGS +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + +void trace_hardirqs_on(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + this_cpu_write(tracing_irq_cpu, 0); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); +#endif /* CONFIG_TRACE_IRQFLAGS */ + +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE + +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + trace_preempt_enable_rcuidle(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + trace_preempt_disable_rcuidle(a0, a1); +} +#endif -- cgit From 80d20d35af1edd632a5e7a3b9c0ab7ceff92769e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 31 Jul 2018 18:13:58 +0200 Subject: nohz: Fix local_timer_softirq_pending() local_timer_softirq_pending() checks whether the timer softirq is pending with: local_softirq_pending() & TIMER_SOFTIRQ. This is wrong because TIMER_SOFTIRQ is the softirq number and not a bitmask. So the test checks for the wrong bit. Use BIT(TIMER_SOFTIRQ) instead. Fixes: 5d62c183f9e9 ("nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()") Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Reviewed-by: Daniel Bristot de Oliveira Acked-by: Frederic Weisbecker Cc: bigeasy@linutronix.de Cc: peterz@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180731161358.29472-1-anna-maria@linutronix.de --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da9455a6b42b..5b33e2f5c0ed 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,7 +642,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & TIMER_SOFTIRQ; + return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) -- cgit From fbeb1603bf4e9baa82da8f794de42949d0fe5e25 Mon Sep 17 00:00:00 2001 From: Arthur Fabre Date: Tue, 31 Jul 2018 18:17:22 +0100 Subject: bpf: verifier: MOV64 don't mark dst reg unbounded When check_alu_op() handles a BPF_MOV64 between two registers, it calls check_reg_arg(DST_OP) on the dst register, marking it as unbounded. If the src and dst register are the same, this marks the src as unbounded, which can lead to unexpected errors for further checks that rely on bounds info. For example: BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_2), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), Results in: "math between ctx pointer and register with unbounded min value is not allowed" check_alu_op() now uses check_reg_arg(DST_OP_NO_MARK), and MOVs that need to mark the dst register (MOVIMM, MOV32) do so. Added a test case for MOV64 dst == src, and dst != src. Signed-off-by: Arthur Fabre Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 6 ++++-- tools/testing/selftests/bpf/test_verifier.c | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25e47c195874..e948303a0ea8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3238,8 +3238,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + /* check dest operand, mark as required later */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3265,6 +3265,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R = imm * remember the value we stored into this reg */ + /* clear any state __mark_reg_known doesn't set */ + mark_reg_unknown(env, regs, insn->dst_reg); regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f5f7bcc96046..c582afba9d1f 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -12332,6 +12332,32 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, + { + "mov64 src == dst", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_2), + // Check bounds are OK + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "mov64 src != dst", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_3), + // Check bounds are OK + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit From 2c323017e381c55c5ce2a603b8305bb18c1162cc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 Jul 2018 12:39:04 -0400 Subject: blk-cgroup: clear the throttle queue on fork We were hitting a panic in production where we put too many times on the request queue. This is because we'd get the throttle_queue of the parent if we fork()'ed while we needed to be throttled, but we didn't have a reference on it. Instead just clear these flags on fork so the child doesn't pay for the sins of its father. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..694ae0e56866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -843,6 +843,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->fail_nth = 0; #endif +#ifdef CONFIG_BLK_CGROUP + tsk->throttle_queue = NULL; + tsk->use_memdelay = 0; +#endif + return tsk; free_stack: -- cgit From f143641bfef9a4a60c57af30de26c63057e7e695 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 15:40:57 -0400 Subject: tracing: Do not call start/stop() functions when tracing_on does not change Currently, when one echo's in 1 into tracing_on, the current tracer's "start()" function is executed, even if tracing_on was already one. This can lead to strange side effects. One being that if the hwlat tracer is enabled, and someone does "echo 1 > tracing_on" into tracing_on, the hwlat tracer's start() function is called again which will recreate another kernel thread, and make it unable to remove the old one. Link: http://lkml.kernel.org/r/1533120354-22923-1-git-send-email-erica.bugden@linutronix.de Cc: stable@vger.kernel.org Fixes: 2df8f8a6a897e ("tracing: Fix regression with irqsoff tracer and tracing_on file") Reported-by: Erica Bugden Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 823687997b01..2378bb4f1442 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7628,7 +7628,9 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); - if (val) { + if (!!val == tracer_tracing_is_on(tr)) { + val = 0; /* do nothing */ + } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); -- cgit From 82fbc8c48adffd73297e7edbd7266a89d00cc52f Mon Sep 17 00:00:00 2001 From: Erica Bugden Date: Wed, 1 Aug 2018 12:45:54 +0200 Subject: ftrace: Add missing check for existing hwlat thread The hwlat tracer uses a kernel thread to measure latencies. The function that creates this kernel thread, start_kthread(), can be called when the tracer is initialized and when the tracer is explicitly enabled. start_kthread() does not check if there is an existing hwlat kernel thread and will create a new one each time it is called. This causes the reference to the previous thread to be lost. Without the thread reference, the old kernel thread becomes unstoppable and continues to use CPU time even after the hwlat tracer has been disabled. This problem can be observed when a system is booted with tracing enabled and the hwlat tracer is configured like this: echo hwlat > current_tracer; echo 1 > tracing_on Add the missing check for an existing kernel thread in start_kthread() to prevent this problem. This function and the rest of the hwlat kernel thread setup and teardown are already serialized because they are called through the tracer core code with trace_type_lock held. [ Note, this only fixes the symptom. The real fix was not to call this function when tracing_on was already one. But this still makes the code more robust, so we'll add it. ] Link: http://lkml.kernel.org/r/1533120354-22923-1-git-send-email-erica.bugden@linutronix.de Signed-off-by: Erica Bugden Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d7c8e4ec3d9d..2d9d36dd5fe7 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -354,6 +354,9 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; + if (hwlat_kthread) + return 0; + /* Just pick the first CPU on first iteration */ current_mask = &save_cpumask; get_online_cpus(); -- cgit From 978defee11a5d54f8f546e79867d2aab1b50606b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 16:06:02 -0400 Subject: tracing: Do a WARN_ON() if start_thread() in hwlat is called when thread exists The start function of the hwlat tracer should never be called when the hwlat thread already exists. If it is called, do a WARN_ON(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 2d9d36dd5fe7..688e48be7bb5 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -354,7 +354,7 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - if (hwlat_kthread) + if (WARN_ON(hwlat_kthread)) return 0; /* Just pick the first CPU on first iteration */ -- cgit From ec57350883cd7fccd867b0d2260bac3a9bf6442d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 16:08:57 -0400 Subject: tracing: Make tracer_tracing_is_on() return bool There's code that expects tracer_tracing_is_on() to be either true or false, not some random number. Currently, it should only return one or zero, but just in case, change its return value to bool, to enforce it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2378bb4f1442..2dad27809794 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1087,7 +1087,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -int tracer_tracing_is_on(struct trace_array *tr) +bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e1c8a1d6f240..d88cd9bb72f4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -594,7 +594,7 @@ void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); -int tracer_tracing_is_on(struct trace_array *tr); +bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, -- cgit From 87a4c375995ed8eaa721b08825cf73d0b02b3145 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jul 2018 13:39:32 +0200 Subject: kconfig: include kernel/Kconfig.preempt from init/Kconfig Almost all architectures include it. Add a ARCH_NO_PREEMPT symbol to disable preempt support for alpha, hexagon, non-coldfire m68k and user mode Linux. Signed-off-by: Christoph Hellwig Signed-off-by: Masahiro Yamada --- arch/Kconfig | 3 +++ arch/alpha/Kconfig | 1 + arch/arc/Kconfig | 2 -- arch/arm/Kconfig | 2 -- arch/arm64/Kconfig | 1 - arch/c6x/Kconfig | 2 -- arch/h8300/Kconfig | 6 ------ arch/hexagon/Kconfig | 1 + arch/ia64/Kconfig | 2 -- arch/m68k/Kconfig | 5 +---- arch/microblaze/Kconfig | 2 -- arch/mips/Kconfig | 2 -- arch/nds32/Kconfig | 1 - arch/nios2/Kconfig | 2 -- arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 2 -- arch/s390/Kconfig | 2 -- arch/sh/Kconfig | 2 -- arch/sparc/Kconfig | 2 -- arch/um/Kconfig | 1 + arch/unicore32/Kconfig | 2 -- arch/x86/Kconfig | 2 -- arch/xtensa/Kconfig | 2 -- init/Kconfig | 1 + kernel/Kconfig.preempt | 2 ++ lib/Kconfig.debug | 1 + 28 files changed, 11 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 09a561a70168..cddd0faae4cb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -881,6 +881,9 @@ config COMPAT_32BIT_TIME config ARCH_NO_COHERENT_DMA_MMAP bool +config ARCH_NO_PREEMPT + bool + config CPU_NO_EFFICIENT_FFS def_bool n diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index e4334f017f8e..5b4f88363453 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -4,6 +4,7 @@ config ALPHA default y select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_NO_PREEMPT select ARCH_USE_CMPXCHG_LOCKREF select HAVE_AOUT select HAVE_IDE diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 5aab069eba17..639ab1bed835 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -545,8 +545,6 @@ config ARC_BUILTIN_DTB_NAME Set the name of the DTB to embed in the vmlinux binary Leaving it blank selects the minimal "skeleton" dtb -source "kernel/Kconfig.preempt" - endmenu # "ARC Architecture Configuration" config FORCE_MAX_ZONEORDER diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9aaa23c25374..1b1c21519039 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1481,8 +1481,6 @@ config ARCH_NR_GPIO If unsure, leave the default value. -source kernel/Kconfig.preempt - config HZ_FIXED int default 200 if ARCH_EBSA110 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 07d457ec417a..daf59d363dd3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -752,7 +752,6 @@ config HOLES_IN_ZONE def_bool y depends on NUMA -source kernel/Kconfig.preempt source kernel/Kconfig.hz config ARCH_SUPPORTS_DEBUG_PAGEALLOC diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 904b3375331e..a641b0bf1611 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -110,8 +110,6 @@ config KERNEL_RAM_BASE_ADDRESS default 0xe0000000 if SOC_TMS320C6472 default 0x80000000 -source "kernel/Kconfig.preempt" - source "kernel/Kconfig.hz" endmenu diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index c16e7cf732f7..5e89d40be8cd 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -49,9 +49,3 @@ config NR_CPUS default 1 source "arch/h8300/Kconfig.cpu" - -menu "Kernel Features" - -source "kernel/Kconfig.preempt" - -endmenu diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index fcdb6d9fcecc..89a4b22f34d9 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -4,6 +4,7 @@ comment "Linux Kernel Configuration for Hexagon" config HEXAGON def_bool y + select ARCH_NO_PREEMPT select HAVE_OPROFILE # Other pending projects/to-do items. # select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 34a8d24cffea..86bd377bc7c0 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -364,8 +364,6 @@ config FORCE_CPEI_RETARGET This option it useful to enable this feature on older BIOS's as well. You can also enable this by using boot command line option force_cpei=1. -source "kernel/Kconfig.preempt" - config ARCH_SELECT_MEMORY_MODEL def_bool y diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index cf41ad45513f..3e47f8df6504 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -4,6 +4,7 @@ config M68K default y select ARCH_MIGHT_HAVE_PC_PARPORT if ISA select ARCH_NO_COHERENT_DMA_MMAP if !MMU + select ARCH_NO_PREEMPT if !COLDFIRE select HAVE_IDE select HAVE_AOUT if MMU select HAVE_DEBUG_BUGVERBOSE @@ -129,10 +130,6 @@ endmenu menu "Kernel Features" -if COLDFIRE -source "kernel/Kconfig.preempt" -endif - endmenu if !MMU diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index f4492e9478fe..6163a39ddeb6 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -87,8 +87,6 @@ source "arch/microblaze/Kconfig.platform" menu "Processor type and features" -source "kernel/Kconfig.preempt" - source "kernel/Kconfig.hz" config MMU diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ab02824c3976..06a633eb9777 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2818,8 +2818,6 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS -source "kernel/Kconfig.preempt" - config KEXEC bool "Kexec system call" select KEXEC_CORE diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index c03d0e5a591b..541f16adfb06 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -88,6 +88,5 @@ config NDS32_BUILTIN_DTB endmenu menu "Kernel Features" -source "kernel/Kconfig.preempt" source "kernel/Kconfig.hz" endmenu diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 22d19febcc92..cbe1844b0657 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -47,8 +47,6 @@ config TRACE_IRQFLAGS_SUPPORT menu "Kernel features" -source "kernel/Kconfig.preempt" - source "kernel/Kconfig.hz" config FORCE_MAX_ZONEORDER diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index c52cecd94d62..42e3a0f2afab 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -143,7 +143,6 @@ config SMP If you don't know what to do here, say N. source kernel/Kconfig.hz -source kernel/Kconfig.preempt config OPENRISC_NO_SPR_SR_DSX bool "use SPR_SR_DSX software emulation" if OR1K_1200 diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index d1dd56ea297b..89496aa318da 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -323,7 +323,6 @@ config NODES_SHIFT default "3" depends on NEED_MULTIPLE_NODES -source "kernel/Kconfig.preempt" source "kernel/Kconfig.hz" config COMPAT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c55e61302d57..1c10ff0406f2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -393,7 +393,6 @@ config HIGHMEM depends on PPC32 source kernel/Kconfig.hz -source kernel/Kconfig.preempt config HUGETLB_PAGE_SIZE_VARIABLE bool diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 73c0e16793fa..a344980287a5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -212,8 +212,6 @@ endmenu menu "Kernel type" -source "kernel/Kconfig.preempt" - source "kernel/Kconfig.hz" endmenu diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index ac8925766d0a..8441bd8f7d70 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -511,8 +511,6 @@ config SCHED_TOPOLOGY making when dealing with machines that have multi-threading, multiple cores or multiple books. -source kernel/Kconfig.preempt - source kernel/Kconfig.hz config KEXEC diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index c8460330eff7..993f61a5961d 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -709,8 +709,6 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. -source "kernel/Kconfig.preempt" - config GUSA def_bool y depends on !SMP && SUPERH32 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 622695e8fa44..2d58c26bff9a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -349,8 +349,6 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. -source "kernel/Kconfig.preempt" - config CMDLINE_BOOL bool "Default bootloader kernel arguments" depends on SPARC64 diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 3022d1bf9bf9..6b9938919f0b 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -6,6 +6,7 @@ config UML bool default y select ARCH_HAS_KCOV + select ARCH_NO_PREEMPT select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select HAVE_UID16 diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 6bfaa4a910e1..60eae744d8fd 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -135,8 +135,6 @@ endmenu menu "Kernel Features" -source "kernel/Kconfig.preempt" - source "kernel/Kconfig.hz" config LEDS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 41d28b430fef..98fd04cfa995 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1039,8 +1039,6 @@ config SCHED_MC_PRIO If unsure say Y here. -source "kernel/Kconfig.preempt" - config UP_LATE_INIT def_bool y depends on !SMP && X86_LOCAL_APIC diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index aae0e1800be7..801491e98890 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -173,8 +173,6 @@ config XTENSA_UNALIGNED_USER Say Y here to enable unaligned memory access in user space. -source "kernel/Kconfig.preempt" - config HAVE_SMP bool "System Supports SMP (MX)" depends on XTENSA_VARIANT_CUSTOM diff --git a/init/Kconfig b/init/Kconfig index 283f6bc796b1..29bbad9338d6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -326,6 +326,7 @@ config AUDIT_TREE source "kernel/irq/Kconfig" source "kernel/time/Kconfig" +source "kernel/Kconfig.preempt" menu "CPU/Task time and stats accounting" diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 3f9c97419f02..cd1655122ec0 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -18,6 +18,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_NO_PREEMPT help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -35,6 +36,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + depends on !ARCH_NO_PREEMPT select PREEMPT_COUNT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 26d3ff7e3cf4..373ce9fecd7e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1195,6 +1195,7 @@ config DEBUG_ATOMIC_SLEEP bool "Sleep inside atomic section checking" select PREEMPT_COUNT depends on DEBUG_KERNEL + depends on !ARCH_NO_PREEMPT help If you say Y here, various routines which may sleep will become very noisy if they are called inside atomic sections: when a spinlock is -- cgit From 3ebea280d7e9b610fa3d31c9cfd556b1705eeedf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 21:08:30 -0400 Subject: ring-buffer: Make ring_buffer_record_is_on() return bool The value of ring_buffer_record_is_on() is either true or false, so have its return value be bool. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 003d09ab308d..5176124fc4ba 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -164,7 +164,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer); void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); -int ring_buffer_record_is_on(struct ring_buffer *buffer); +bool ring_buffer_record_is_on(struct ring_buffer *buffer); int ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b0b688ea166..ffb43cbc5c13 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3221,7 +3221,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on); * * Returns true if the ring buffer is in a state that it accepts writes. */ -int ring_buffer_record_is_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_on(struct ring_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } -- cgit From d7224c0e128c7337c0b0f66ac20921fbbf4efc14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 21:09:50 -0400 Subject: ring-buffer: Make ring_buffer_record_is_set_on() return bool The value of ring_buffer_record_is_set_on() is either true or false, so have its return value be bool. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 5176124fc4ba..0940fda59872 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -165,7 +165,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); bool ring_buffer_record_is_on(struct ring_buffer *buffer); -int ring_buffer_record_is_set_on(struct ring_buffer *buffer); +bool ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ffb43cbc5c13..b386830b4d51 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3237,7 +3237,7 @@ bool ring_buffer_record_is_on(struct ring_buffer *buffer) * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. */ -int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } -- cgit From 44ec3ec01fb7d54d1a14c138cb43b90a0e934b89 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 1 Aug 2018 20:00:56 -0500 Subject: ftrace: Use true and false for boolean values in ops_references_rec() Return statements in functions returning bool should use true or false instead of an integer value. This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20180802010056.GA31012@embeddedor.com Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d380a98a880..2d795193024b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2898,22 +2898,22 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) { /* If ops isn't enabled, ignore it */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return 0; + return false; /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 1; + return true; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) - return 0; + return false; /* If in notrace hash, we ignore it too */ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return 0; + return false; - return 1; + return true; } static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) -- cgit From b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Jul 2018 13:21:40 +0200 Subject: stop_machine: Reflow cpu_stop_queue_two_works() The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by lifting the preempt_disable() to the top to create more natural nesting wrt the spinlocks and make the wake_up_q() and preempt_enable() unconditional at the end. Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait with preemption enabled. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: isaacm@codeaurora.org Cc: matt@codeblueprint.co.uk Cc: psodagud@codeaurora.org Cc: gregkh@linuxfoundation.org Cc: pkondeti@codeaurora.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180730112140.GH2494@hirez.programming.kicks-ass.net --- kernel/stop_machine.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..34b6652e8677 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* + * The waking up of stopper threads has to happen in the same + * scheduling context as the queueing. Otherwise, there is a + * possibility of one of the above stoppers being woken up by another + * CPU, and preempting us. This will cause us to not wake up the other + * stopper forever. + */ + preempt_disable(); raw_spin_lock_irq(&stopper1->lock); raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,36 +264,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); - /* - * The waking up of stopper threads has to happen - * in the same scheduling context as the queueing. - * Otherwise, there is a possibility of one of the - * above stoppers being woken up by another CPU, - * and preempting us. This will cause us to n ot - * wake up the other stopper forever. - */ - preempt_disable(); + unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(&wakeq); - preempt_enable(); - } + wake_up_q(&wakeq); + preempt_enable(); return err; } -- cgit From 9be936f4b3a2ec101f54cff9cf1a6abf67263c50 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 1 Aug 2018 00:56:17 +0800 Subject: kernel/module: Use kmemdup to replace kmalloc+memcpy we prefer to the kmemdup rather than kmalloc+memcpy. so just replace them. Signed-off-by: zhong jiang Signed-off-by: Jessica Yu --- kernel/module.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8a45986fd728..54fbac81fd56 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2039,21 +2039,19 @@ static int copy_module_elf(struct module *mod, struct load_info *info) /* Elf section header table */ size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL); + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); if (mod->klp_info->sechdrs == NULL) { ret = -ENOMEM; goto free_info; } - memcpy(mod->klp_info->sechdrs, info->sechdrs, size); /* Elf section name string table */ size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL); + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); if (mod->klp_info->secstrings == NULL) { ret = -ENOMEM; goto free_sechdrs; } - memcpy(mod->klp_info->secstrings, info->secstrings, size); /* Elf symbol section index */ symndx = info->index.sym; -- cgit From 6bc6c77cfc6747292d5865d124cc31333b1b8536 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 2 Aug 2018 16:50:48 +0900 Subject: tracing/kprobes: Fix within_notrace_func() to check only notrace functions Fix within_notrace_func() to check only notrace functions and to ignore the kprobe-event which can not solve symbol addresses. within_notrace_func() returns true if the given kprobe events probe point seems to be out-of-range. But that is not the correct place to check for it, it should be done in kprobes afterwards. kprobe-events allow users to define a probe point on "currently unloaded module" so that it can trace the event during module load. In this case, the user will put a probe on a symbol which is not in kallsyms yet and it hits the within_notrace_func(). As a result, kprobe-events always refuses if user defines a probe on a "currenly unloaded module". Fixes: commit 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Link: http://lkml.kernel.org/r/153319624799.29007.13604430345640129926.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index deeb03ae21e1..2e13f77b9a37 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -88,6 +88,7 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +/* Return 0 if it fails to find the symbol address */ static nokprobe_inline unsigned long trace_kprobe_address(struct trace_kprobe *tk) { @@ -96,7 +97,8 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk) if (tk->symbol) { addr = (unsigned long) kallsyms_lookup_name(trace_kprobe_symbol(tk)); - addr += tk->rp.kp.offset; + if (addr) + addr += tk->rp.kp.offset; } else { addr = (unsigned long)tk->rp.kp.addr; } @@ -519,8 +521,8 @@ static bool within_notrace_func(struct trace_kprobe *tk) unsigned long offset, size, addr; addr = trace_kprobe_address(tk); - if (!kallsyms_lookup_size_offset(addr, &size, &offset)) - return true; /* Out of range. */ + if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) + return false; return !ftrace_location_range(addr - offset, addr - offset + size); } -- cgit From 0a4c58f5702858822621fa1177c7d3475f181ccb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:17 -0700 Subject: bpf: add ability to charge bpf maps memory dynamically This commits extends existing bpf maps memory charging API to support dynamic charging/uncharging. This is required to account memory used by maps, if all entries are created dynamically after the map initialization. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b5ad95cf339..5a4a256473c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -435,6 +435,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a31a1ba0f8ea..7958252a4d29 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,32 +181,60 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_map_charge_memlock(struct bpf_map *map) +static int bpf_charge_memlock(struct user_struct *user, u32 pages) { - struct user_struct *user = get_current_user(); - unsigned long memlock_limit; + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + return 0; +} - atomic_long_add(map->pages, &user->locked_vm); +static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +{ + atomic_long_sub(pages, &user->locked_vm); +} + +static int bpf_map_init_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + int ret; - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(map->pages, &user->locked_vm); + ret = bpf_charge_memlock(user, map->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } map->user = user; - return 0; + return ret; } -static void bpf_map_uncharge_memlock(struct bpf_map *map) +static void bpf_map_release_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - - atomic_long_sub(map->pages, &user->locked_vm); + bpf_uncharge_memlock(user, map->pages); free_uid(user); } +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +{ + int ret; + + ret = bpf_charge_memlock(map->user, pages); + if (ret) + return ret; + map->pages += pages; + return ret; +} + +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +{ + bpf_uncharge_memlock(map->user, pages); + map->pages -= pages; +} + static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -256,7 +284,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -492,7 +520,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_charge_memlock(map); + err = bpf_map_init_memlock(map); if (err) goto free_map_sec; @@ -515,7 +543,7 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: -- cgit From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:18 -0700 Subject: bpf: introduce cgroup storage maps This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps: a special type of maps which are implementing the cgroup storage. >From the userspace point of view it's almost a generic hash map with the (cgroup inode id, attachment type) pair used as a key. The only difference is that some operations are restricted: 1) a user can't create new entries, 2) a user can't remove existing entries. The lookup from userspace is o(log(n)). Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 38 +++++ include/linux/bpf.h | 1 + include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 6 + kernel/bpf/Makefile | 1 + kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 12 ++ 8 files changed, 440 insertions(+) create mode 100644 kernel/bpf/local_storage.c (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d50c2f0a655a..7d00d58869ed 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,19 +4,39 @@ #include #include +#include #include struct sock; struct sockaddr; struct cgroup; struct sk_buff; +struct bpf_map; +struct bpf_prog; struct bpf_sock_ops_kern; +struct bpf_cgroup_storage; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_cgroup_storage_map; + +struct bpf_storage_buffer { + struct rcu_head rcu; + char data[0]; +}; + +struct bpf_cgroup_storage { + struct bpf_storage_buffer *buf; + struct bpf_cgroup_storage_map *map; + struct bpf_cgroup_storage_key key; + struct list_head list; + struct rb_node node; + struct rcu_head rcu; +}; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; @@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type); +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, + struct bpf_map *map) { return 0; } +static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, + struct bpf_map *map) {} +static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( + struct bpf_prog *prog) { return 0; } +static inline void bpf_cgroup_storage_free( + struct bpf_cgroup_storage *storage) {} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a4a256473c3..9d1e4727495e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -282,6 +282,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ + struct bpf_map *cgroup_storage; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c5700c2d5549..add08be53b6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ebaaf7f3568..b10118ee5afe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -75,6 +75,11 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -120,6 +125,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f27f5496d6fe..e8906cbad81f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,6 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c new file mode 100644 index 000000000000..f23d3fdeba23 --- /dev/null +++ b/kernel/bpf/local_storage.c @@ -0,0 +1,376 @@ +//SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CGROUP_BPF + +#define LOCAL_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +struct bpf_cgroup_storage_map { + struct bpf_map map; + + spinlock_t lock; + struct bpf_prog *prog; + struct rb_root root; + struct list_head list; +}; + +static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +{ + return container_of(map, struct bpf_cgroup_storage_map, map); +} + +static int bpf_cgroup_storage_key_cmp( + const struct bpf_cgroup_storage_key *key1, + const struct bpf_cgroup_storage_key *key2) +{ + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + return 0; +} + +static struct bpf_cgroup_storage *cgroup_storage_lookup( + struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, + bool locked) +{ + struct rb_root *root = &map->root; + struct rb_node *node; + + if (!locked) + spin_lock_bh(&map->lock); + + node = root->rb_node; + while (node) { + struct bpf_cgroup_storage *storage; + + storage = container_of(node, struct bpf_cgroup_storage, node); + + switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + case -1: + node = node->rb_left; + break; + case 1: + node = node->rb_right; + break; + default: + if (!locked) + spin_unlock_bh(&map->lock); + return storage; + } + } + + if (!locked) + spin_unlock_bh(&map->lock); + + return NULL; +} + +static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, + struct bpf_cgroup_storage *storage) +{ + struct rb_root *root = &map->root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct bpf_cgroup_storage *this; + + this = container_of(*new, struct bpf_cgroup_storage, node); + + parent = *new; + switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + case -1: + new = &((*new)->rb_left); + break; + case 1: + new = &((*new)->rb_right); + break; + default: + return -EEXIST; + } + } + + rb_link_node(&storage->node, parent, new); + rb_insert_color(&storage->node, root); + + return 0; +} + +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + + storage = cgroup_storage_lookup(map, key, false); + if (!storage) + return NULL; + + return &READ_ONCE(storage->buf)->data[0]; +} + +static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, + void *value, u64 flags) +{ + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + struct bpf_storage_buffer *new; + + if (flags & BPF_NOEXIST) + return -EINVAL; + + storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, + key, false); + if (!storage) + return -ENOENT; + + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!new) + return -ENOMEM; + + memcpy(&new->data[0], value, map->value_size); + + new = xchg(&storage->buf, new); + kfree_rcu(new, rcu); + + return 0; +} + +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, + void *_next_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage_key *next = _next_key; + struct bpf_cgroup_storage *storage; + + spin_lock_bh(&map->lock); + + if (list_empty(&map->list)) + goto enoent; + + if (key) { + storage = cgroup_storage_lookup(map, key, true); + if (!storage) + goto enoent; + + storage = list_next_entry(storage, list); + if (!storage) + goto enoent; + } else { + storage = list_first_entry(&map->list, + struct bpf_cgroup_storage, list); + } + + spin_unlock_bh(&map->lock); + next->attach_type = storage->key.attach_type; + next->cgroup_inode_id = storage->key.cgroup_inode_id; + return 0; + +enoent: + spin_unlock_bh(&map->lock); + return -ENOENT; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_cgroup_storage_map *map; + + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + return ERR_PTR(-EINVAL); + + if (attr->value_size > PAGE_SIZE) + return ERR_PTR(-E2BIG); + + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + + if (attr->max_entries) + /* max_entries is not used and enforced to be 0 */ + return ERR_PTR(-EINVAL); + + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), + __GFP_ZERO | GFP_USER, numa_node); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), + PAGE_SIZE) >> PAGE_SHIFT; + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&map->map, attr); + + spin_lock_init(&map->lock); + map->root = RB_ROOT; + INIT_LIST_HEAD(&map->list); + + return &map->map; +} + +static void cgroup_storage_map_free(struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + WARN_ON(!RB_EMPTY_ROOT(&map->root)); + WARN_ON(!list_empty(&map->list)); + + kfree(map); +} + +static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +const struct bpf_map_ops cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, +}; + +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + int ret = -EBUSY; + + spin_lock_bh(&map->lock); + + if (map->prog && map->prog != prog) + goto unlock; + if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + goto unlock; + + map->prog = prog; + prog->aux->cgroup_storage = _map; + ret = 0; +unlock: + spin_unlock_bh(&map->lock); + + return ret; +} + +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + spin_lock_bh(&map->lock); + if (map->prog == prog) { + WARN_ON(prog->aux->cgroup_storage != _map); + map->prog = NULL; + prog->aux->cgroup_storage = NULL; + } + spin_unlock_bh(&map->lock); +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +{ + struct bpf_cgroup_storage *storage; + struct bpf_map *map; + u32 pages; + + map = prog->aux->cgroup_storage; + if (!map) + return NULL; + + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + if (bpf_map_charge_memlock(map, pages)) + return ERR_PTR(-EPERM); + + storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), + __GFP_ZERO | GFP_USER, map->numa_node); + if (!storage) { + bpf_map_uncharge_memlock(map, pages); + return ERR_PTR(-ENOMEM); + } + + storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!storage->buf) { + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); + } + + storage->map = (struct bpf_cgroup_storage_map *)map; + + return storage; +} + +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +{ + u32 pages; + struct bpf_map *map; + + if (!storage) + return; + + map = &storage->map->map; + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_uncharge_memlock(map, pages); + + kfree_rcu(storage->buf, rcu); + kfree_rcu(storage, rcu); +} + +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type) +{ + struct bpf_cgroup_storage_map *map; + + if (!storage) + return; + + storage->key.attach_type = type; + storage->key.cgroup_inode_id = cgroup->kn->id.id; + + map = storage->map; + + spin_lock_bh(&map->lock); + WARN_ON(cgroup_storage_insert(map, storage)); + list_add(&storage->list, &map->list); + spin_unlock_bh(&map->lock); +} + +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +{ + struct bpf_cgroup_storage_map *map; + struct rb_root *root; + + if (!storage) + return; + + map = storage->map; + + spin_lock_bh(&map->lock); + root = &map->root; + rb_erase(&storage->node, root); + + list_del(&storage->list); + spin_unlock_bh(&map->lock); +} + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7958252a4d29..5af4e9e2722d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux) { int i; + if (aux->cgroup_storage) + bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e948303a0ea8..7e75434a9e54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + bpf_cgroup_storage_assign(env->prog, map)) { + verbose(env, + "only one cgroup storage is allowed\n"); + fdput(f); + return -EBUSY; + } + fdput(f); next_insn: insn++; @@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env) { int i; + if (env->prog->aux->cgroup_storage) + bpf_cgroup_storage_release(env->prog, + env->prog->aux->cgroup_storage); + for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } -- cgit From aa0ad5b0391e268bdecf6cda31268388844f8afd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:19 -0700 Subject: bpf: pass a pointer to a cgroup storage using pcpu variable This commit introduces the bpf_cgroup_storage_set() helper, which will be used to pass a pointer to a cgroup storage to the bpf helper. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 15 +++++++++++++++ kernel/bpf/local_storage.c | 2 ++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7d00d58869ed..9a144ddbbc8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -21,6 +22,8 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + struct bpf_cgroup_storage_map; struct bpf_storage_buffer { @@ -97,6 +100,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) +{ + struct bpf_storage_buffer *buf; + + if (!storage) + return; + + buf = READ_ONCE(storage->buf); + this_cpu_write(bpf_cgroup_storage, &buf->data[0]); +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, @@ -250,6 +264,7 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index f23d3fdeba23..fc4e37f68f2a 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,6 +7,8 @@ #include #include +DEFINE_PER_CPU(void*, bpf_cgroup_storage); + #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ -- cgit From d7bf2c10af053191e931b58704cd862fccb7f9de Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:20 -0700 Subject: bpf: allocate cgroup storage entries on attaching bpf programs If a bpf program is using cgroup local storage, allocate a bpf_cgroup_storage structure automatically on attaching the program to a cgroup and save the pointer into the corresponding bpf_prog_list entry. Analogically, release the cgroup local storage on detaching of the bpf program. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + kernel/bpf/cgroup.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9a144ddbbc8f..f91b0f8ff3a9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -43,6 +43,7 @@ struct bpf_cgroup_storage { struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_storage *storage; }; struct bpf_prog_array; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index badabb0b435c..935274c86bfe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -188,6 +190,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; + struct bpf_cgroup_storage *storage, *old_storage = NULL; struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; @@ -210,31 +213,47 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return -ENOMEM; + if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) - if (pl->prog == prog) + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) { /* disallow attaching the same prog twice */ + bpf_cgroup_storage_free(storage); return -EINVAL; + } + } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } + pl_was_allocated = true; pl->prog = prog; + pl->storage = storage; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; + old_storage = pl->storage; + bpf_cgroup_storage_unlink(old_storage); pl_was_allocated = false; } pl->prog = prog; + pl->storage = storage; } cgrp->bpf.flags[type] = flags; @@ -257,10 +276,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } static_branch_inc(&cgroup_bpf_enabled_key); + if (old_storage) + bpf_cgroup_storage_free(old_storage); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_cgroup_storage_link(storage, cgrp, type); return 0; cleanup: @@ -276,6 +298,9 @@ cleanup: /* and cleanup the prog list */ pl->prog = old_prog; + bpf_cgroup_storage_free(pl->storage); + pl->storage = old_storage; + bpf_cgroup_storage_link(old_storage, cgrp, type); if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -356,6 +381,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ -- cgit From 394e40a29788820c9c0526b1c3497c9e0ec2a126 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:21 -0700 Subject: bpf: extend bpf_prog_array to store pointers to the cgroup storage This patch converts bpf_prog_array from an array of prog pointers to the array of struct bpf_prog_array_item elements. This allows to save a cgroup storage pointer for each bpf program efficiently attached to a cgroup. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 10 +++--- include/linux/bpf.h | 19 ++++++++---- kernel/bpf/cgroup.c | 21 +++++++------ kernel/bpf/core.c | 76 +++++++++++++++++++++++---------------------- 4 files changed, 70 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index fcfab6635f9c..8c26df9b96c1 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -195,14 +195,16 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) */ void lirc_bpf_free(struct rc_dev *rcdev) { - struct bpf_prog **progs; + struct bpf_prog_array_item *item; if (!rcdev->raw->progs) return; - progs = rcu_dereference(rcdev->raw->progs)->progs; - while (*progs) - bpf_prog_put(*progs++); + item = rcu_dereference(rcdev->raw->progs)->items; + while (item->prog) { + bpf_prog_put(item->prog); + item++; + } bpf_prog_array_free(rcdev->raw->progs); } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d1e4727495e..16be67888c30 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -349,9 +349,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ +struct bpf_prog_array_item { + struct bpf_prog *prog; + struct bpf_cgroup_storage *cgroup_storage; +}; + struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog *progs[0]; + struct bpf_prog_array_item items[0]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); @@ -372,7 +377,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ preempt_disable(); \ @@ -380,10 +386,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ goto _out; \ - _prog = _array->progs; \ - while ((__prog = READ_ONCE(*_prog))) { \ - _ret &= func(__prog, ctx); \ - _prog++; \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + _ret &= func(_prog, ctx); \ + _item++; \ } \ _out: \ rcu_read_unlock(); \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 935274c86bfe..ddfa6cc13e57 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -117,15 +117,18 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - progs->progs[cnt++] = pl->prog; - } - p = cgroup_parent(p); - } while (p); + if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + list_for_each_entry(pl, &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + + progs->items[cnt].prog = pl->prog; + progs->items[cnt].cgroup_storage = pl->storage; + cnt++; + } + } while ((p = cgroup_parent(p))); rcu_assign_pointer(*array, progs); return 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 253aa8e79c7b..9abcf25ebf9f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1542,7 +1542,8 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog *) * (prog_cnt + 1), + sizeof(struct bpf_prog_array_item) * + (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1556,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +int bpf_prog_array_length(struct bpf_prog_array __rcu *array) { - struct bpf_prog **prog; + struct bpf_prog_array_item *item; u32 cnt = 0; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) - if (*prog != &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) cnt++; rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + +static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt) { + struct bpf_prog_array_item *item; int i = 0; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) continue; - prog_ids[i] = (*prog)->aux->id; + prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { - prog++; + item++; break; } } - return !!(*prog); + return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, __u32 __user *prog_ids, u32 cnt) { - struct bpf_prog **prog; unsigned long err = 0; bool nospc; u32 *ids; @@ -1611,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, if (!ids) return -ENOMEM; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - nospc = bpf_prog_array_copy_core(prog, ids, cnt); + nospc = bpf_prog_array_copy_core(array, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1623,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, struct bpf_prog *old_prog) { - struct bpf_prog **prog = progs->progs; + struct bpf_prog_array_item *item = array->items; - for (; *prog; prog++) - if (*prog == old_prog) { - WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + for (; item->prog; item++) + if (item->prog == old_prog) { + WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } @@ -1641,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog **existing_prog; + struct bpf_prog_array_item *existing; struct bpf_prog_array *array; bool found_exclude = false; int new_prog_idx = 0; @@ -1650,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, * the new array. */ if (old_array) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) { - if (*existing_prog == exclude_prog) { + existing = old_array->items; + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog) { found_exclude = true; continue; } - if (*existing_prog != &dummy_bpf_prog.prog) + if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (*existing_prog == include_prog) + if (existing->prog == include_prog) return -EEXIST; } } @@ -1684,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) - array->progs[new_prog_idx++] = *existing_prog; + existing = old_array->items; + for (; existing->prog; existing++) + if (existing->prog != exclude_prog && + existing->prog != &dummy_bpf_prog.prog) { + array->items[new_prog_idx++].prog = + existing->prog; + } } if (include_prog) - array->progs[new_prog_idx++] = include_prog; - array->progs[new_prog_idx] = NULL; + array->items[new_prog_idx++].prog = include_prog; + array->items[new_prog_idx].prog = NULL; *new_array = array; return 0; } @@ -1701,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { - struct bpf_prog **prog; u32 cnt = 0; if (array) @@ -1714,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - prog = rcu_dereference_check(array, 1)->progs; - return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } -- cgit From 3e6a4b3e0289dc9540a2c1d8a20657f4707fbabb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:22 -0700 Subject: bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way that the access from the bpf program side is lookup-free. That means the result is guaranteed to be a valid pointer to the cgroup storage; no NULL-check is required. This patch introduces BPF_PTR_TO_MAP_VALUE return type, which is required to cause the verifier accept programs, which are not checking the map value pointer for being NULL. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16be67888c30..ca4ac2a39def 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_arg_type { enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7e75434a9e54..1ede16c8bb40 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2545,8 +2545,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || + fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + else + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; -- cgit From 7b5dd2bde72cd33313b63cf3ba1de6a9e443a65d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:23 -0700 Subject: bpf: don't allow create maps of cgroup local storages As there is one-to-one relation between a bpf program and cgroup local storage map, there is no sense in creating a map of cgroup local storage maps. Forbid it explicitly to avoid possible side effects. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/map_in_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da574612bea..3bfbf4464416 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -23,7 +23,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * is a runtime binding. Doing static check alone * in the verifier is not enough. */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } -- cgit From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. Further it can be extended to support other types of local storage: e.g. thread local storage etc. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 20 ++++++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 23 ++++++++++++++++++++++- 7 files changed, 85 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca4ac2a39def..cd8790d2c6ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_local_storage_proto; + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b10118ee5afe..dd5758dc35d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2095,6 +2095,24 @@ union bpf_attr { * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. + * + * void* get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the bpf program type, a local storage area + * can be shared between multiple instances of the bpf program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the BPF_STX_XADD instruction to alter + * the shared data. + * Return + * Pointer to the local storage area. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2177,7 +2195,8 @@ union bpf_attr { FN(rc_repeat), \ FN(rc_keydown), \ FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), + FN(get_current_cgroup_id), \ + FN(get_local_storage), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddfa6cc13e57..0a4fe5a7dc91 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9abcf25ebf9f..4d09e610777f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73065e2d23c2..1991466b8327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ede16c8bb40..587468a9c37d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; default: break; } @@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/net/core/filter.c b/net/core/filter.c index 9bb9a4488e25..9f73aae2f089 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4820,6 +4820,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4844,6 +4846,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4866,6 +4870,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + default: + return sk_filter_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4988,6 +5003,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5007,6 +5024,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5034,6 +5053,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -6838,7 +6859,7 @@ const struct bpf_prog_ops xdp_prog_ops = { }; const struct bpf_verifier_ops cg_skb_verifier_ops = { - .get_func_proto = sk_filter_func_proto, + .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -- cgit From 4f7799d96e6621ce584df60739e1480a6fd89f0a Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 22 Jun 2018 10:01:26 -0700 Subject: genirq/irqchip: Remove MULTI_IRQ_HANDLER as it's now obselete Now that every user of MULTI_IRQ_HANDLER has been convereted over to use GENERIC_IRQ_MULTI_HANDLER remove the references to MULTI_IRQ_HANDLER. Signed-off-by: Palmer Dabbelt Signed-off-by: Thomas Gleixner Cc: linux@armlinux.org.uk Cc: catalin.marinas@arm.com Cc: Will Deacon Cc: jonas@southpole.se Cc: stefan.kristiansson@saunalahti.fi Cc: shorne@gmail.com Cc: jason@lakedaemon.net Cc: marc.zyngier@arm.com Cc: Arnd Bergmann Cc: nicolas.pitre@linaro.org Cc: vladimir.murzin@arm.com Cc: keescook@chromium.org Cc: jinb.park7@gmail.com Cc: yamada.masahiro@socionext.com Cc: alexandre.belloni@bootlin.com Cc: pombredanne@nexb.com Cc: Greg KH Cc: kstewart@linuxfoundation.org Cc: jhogan@kernel.org Cc: mark.rutland@arm.com Cc: ard.biesheuvel@linaro.org Cc: james.morse@arm.com Cc: linux-arm-kernel@lists.infradead.org Cc: openrisc@lists.librecores.org Link: https://lkml.kernel.org/r/20180622170126.6308-6-palmer@sifive.com --- drivers/irqchip/Kconfig | 24 ++++++++---------------- kernel/irq/Kconfig | 1 - 2 files changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 7527f6a9adae..d564d21245c5 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -8,8 +8,7 @@ config ARM_GIC bool select IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER select GENERIC_IRQ_EFFECTIVE_AFF_MASK config ARM_GIC_PM @@ -35,8 +34,7 @@ config GIC_NON_BANKED config ARM_GIC_V3 bool select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER select IRQ_DOMAIN_HIERARCHY select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK @@ -68,8 +66,7 @@ config ARM_NVIC config ARM_VIC bool select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER config ARM_VIC_NR int @@ -96,16 +93,14 @@ config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config ATMEL_AIC5_IRQ bool select GENERIC_IRQ_CHIP select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config I8259 @@ -142,8 +137,7 @@ config DW_APB_ICTL config FARADAY_FTINTC010 bool select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ config HISILICON_IRQ_MBIGEN @@ -168,8 +162,7 @@ config CLPS711X_IRQCHIP bool depends on ARCH_CLPS711X select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER select SPARSE_IRQ default y @@ -188,8 +181,7 @@ config OMAP_IRQCHIP config ORION_IRQCHIP bool select IRQ_DOMAIN - select MULTI_IRQ_HANDLER - select GENERIC_IRQ_MULTI_HANDLER if !MULTI_IRQ_HANDLER + select GENERIC_IRQ_MULTI_HANDLER config PIC32_EVIC bool diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index c6766f326072..5f3e2baefca9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -134,7 +134,6 @@ config GENERIC_IRQ_DEBUGFS endmenu config GENERIC_IRQ_MULTI_HANDLER - depends on !MULTI_IRQ_HANDLER bool help Allow to specify the low level IRQ handler at run time. -- cgit From 1b6266ebe3da8198e9a02fbad77bbb56e2f7ce2e Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Thu, 2 Aug 2018 23:09:41 -0700 Subject: watchdog: Reduce message verbosity Code is emitting the following error message during boot on systems without PMU hardware support while probing NMI capability. NMI watchdog: Perf event create on CPU 0 failed with -2 This error is emitted as the perf subsystem returns -ENOENT due to lack of PMUs in the system. It is followed by the warning that NMI watchdog is disabled: NMI watchdog: Perf NMI watchdog permanently disabled While NMI disabled information is useful for ordinary users, seeing a PERF event create failed with error code -2 is not. Reduce the message severity to debug so that if debugging is still possible in case the error code returned by perf is required for analysis. Signed-off-by: Sinan Kaya Signed-off-by: Thomas Gleixner Acked-by: Don Zickus Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Colin Ian King Cc: Peter Zijlstra Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=599368 Link: https://lkml.kernel.org/r/20180803060943.2643-1-okaya@kernel.org --- kernel/watchdog_hld.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index e449a23e9d59..1f7020d65d0a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void) evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (IS_ERR(evt)) { - pr_info("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return PTR_ERR(evt); } this_cpu_write(watchdog_ev, evt); -- cgit From d1f0301b3333eef5efbfa1fe0f0edbea01863d5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Aug 2018 14:44:59 +0200 Subject: genirq: Make force irq threading setup more robust The support of force threading interrupts which are set up with both a primary and a threaded handler wreckaged the setup of regular requested threaded interrupts (primary handler == NULL). The reason is that it does not check whether the primary handler is set to the default handler which wakes the handler thread. Instead it replaces the thread handler with the primary handler as it would do with force threaded interrupts which have been requested via request_irq(). So both the primary and the thread handler become the same which then triggers the warnon that the thread handler tries to wakeup a not configured secondary thread. Fortunately this only happens when the driver omits the IRQF_ONESHOT flag when requesting the threaded interrupt, which is normaly caught by the sanity checks when force irq threading is disabled. Fix it by skipping the force threading setup when a regular threaded interrupt is requested. As a consequence the interrupt request which lacks the IRQ_ONESHOT flag is rejected correctly instead of silently wreckaging it. Fixes: 2a1d3ab8986d ("genirq: Handle force threading of irqs with primary and thread handler") Reported-by: Kurt Kanzenbach Signed-off-by: Thomas Gleixner Tested-by: Kurt Kanzenbach Cc: stable@vger.kernel.org --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index daeabd791d58..9a8b7ba9aa88 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1068,6 +1068,13 @@ static int irq_setup_forced_threading(struct irqaction *new) if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; + /* + * No further action required for interrupts which are requested as + * threaded interrupts already + */ + if (new->handler == irq_default_primary_handler) + return 0; + new->flags |= IRQF_ONESHOT; /* @@ -1075,7 +1082,7 @@ static int irq_setup_forced_threading(struct irqaction *new) * thread handler. We force thread them as well by creating a * secondary action. */ - if (new->handler != irq_default_primary_handler && new->thread_fn) { + if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) -- cgit From da25a672cf0e2c143bffb40acb507a342e25b4f4 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 2 Aug 2018 19:34:07 -0700 Subject: trace: Use rcu_dereference_raw for hooks from trace-event subsystem Since we switched to using SRCU for tracepoints used in the idle path, we can no longer use rcu_dereference_sched for dereferencing points in trace-event hooks. Since tracepoints can now use either SRCU or sched-RCU, just use rcu_dereference_raw for traceevents just like we're doing when dereferencing the tracepoint table. This prevents an RCU warning reported by Masami: [ 282.060593] WARNING: can't dereference registers at 00000000f3c7f62b [ 282.063200] ============================= [ 282.064082] WARNING: suspicious RCU usage [ 282.064963] 4.18.0-rc6+ #15 Tainted: G W [ 282.066048] ----------------------------- [ 282.066923] /home/mhiramat/ksrc/linux/kernel/trace/trace_events.c:242 suspicious rcu_dereference_check() usage! [ 282.068974] [ 282.068974] other info that might help us debug this: [ 282.068974] [ 282.070770] [ 282.070770] RCU used illegally from idle CPU! [ 282.070770] rcu_scheduler_active = 2, debug_locks = 1 [ 282.072938] RCU used illegally from extended quiescent state! [ 282.074183] no locks held by swapper/0/0. [ 282.075071] [ 282.075071] stack backtrace: [ 282.076121] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W [ 282.077782] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) [ 282.079604] Call Trace: [ 282.080212] [ 282.080755] dump_stack+0x85/0xcb [ 282.081523] trace_event_ignore_this_pid+0x66/0x70 [ 282.082541] trace_event_raw_event_preemptirq_template+0xa2/0xb0 [ 282.083774] ? interrupt_entry+0xc4/0xe0 [ 282.084665] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 282.085669] trace_hardirqs_off_caller+0x90/0xd0 [ 282.086597] trace_hardirqs_off_thunk+0x1a/0x1c [ 282.087433] ? call_function_interrupt+0xa/0x20 [ 282.088201] interrupt_entry+0xc4/0xe0 [ 282.088848] ? call_function_interrupt+0xa/0x20 [ 282.089579] [ 282.090029] ? native_safe_halt+0x2/0x10 [ 282.090695] ? default_idle+0x1f/0x160 [ 282.091330] ? default_idle_call+0x24/0x40 [ 282.091997] ? do_idle+0x210/0x250 [ 282.092658] ? cpu_startup_entry+0x6f/0x80 [ 282.093338] ? start_kernel+0x49d/0x4bd [ 282.093987] ? secondary_startup_64+0xa5/0xb0 Link: http://lkml.kernel.org/r/20180803023407.225852-1-joel@joelfernandes.org Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Fixes: e6753f23d961 ("tracepoint: Make rcuidle tracepoint callers use SRCU") Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 14ff4ff3caab..7b508ce8ac44 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -239,7 +239,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) struct trace_array_cpu *data; struct trace_pid_list *pid_list; - pid_list = rcu_dereference_sched(tr->filtered_pids); + pid_list = rcu_dereference_raw(tr->filtered_pids); if (!pid_list) return false; @@ -512,7 +512,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; - pid_list = rcu_dereference_sched(tr->filtered_pids); + pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); } -- cgit From 0a0e0829f990120cef165bbb804237f400953ec2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Aug 2018 15:31:34 +0200 Subject: nohz: Fix missing tick reprogram when interrupting an inline softirq The full nohz tick is reprogrammed in irq_exit() only if the exit is not in a nesting interrupt. This stands as an optimization: whether a hardirq or a softirq is interrupted, the tick is going to be reprogrammed when necessary at the end of the inner interrupt, with even potential new updates on the timer queue. When soft interrupts are interrupted, it's assumed that they are executing on the tail of an interrupt return. In that case tick_nohz_irq_exit() is called after softirq processing to take care of the tick reprogramming. But the assumption is wrong: softirqs can be processed inline as well, ie: outside of an interrupt, like in a call to local_bh_enable() or from ksoftirqd. Inline softirqs don't reprogram the tick once they are done, as opposed to interrupt tail softirq processing. So if a tick interrupts an inline softirq processing, the next timer will neither be reprogrammed from the interrupting tick's irq_exit() nor after the interrupted softirq processing. This situation may leave the tick unprogrammed while timers are armed. To fix this, simply keep reprogramming the tick even if a softirq has been interrupted. That can be optimized further, but for now correctness is more important. Note that new timers enqueued in nohz_full mode after a softirq gets interrupted will still be handled just fine through self-IPIs triggered by the timer code. Reported-by: Anna-Maria Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Anna-Maria Gleixner Cc: stable@vger.kernel.org # 4.14+ Link: https://lkml.kernel.org/r/1533303094-15855-1-git-send-email-frederic@kernel.org --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 75ffc1d1a2e0..6f584861d329 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -390,7 +390,7 @@ static inline void tick_irq_exit(void) /* Make sure that timer wheel updates are propagated */ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_interrupt()) + if (!in_irq()) tick_nohz_irq_exit(); } #endif -- cgit From 088fe47ce952542389e604e83f533811750aaf7c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 17:26:49 -0500 Subject: signal: Add calculate_sigpending() Add a function calculate_sigpending to test to see if any signals are pending for a new task immediately following fork. Signals have to happen either before or after fork. Today our practice is to push all of the signals to before the fork, but that has the downside that frequent or periodic signals can make fork take much much longer than normal or prevent fork from completing entirely. So we need move signals that we can after the fork to prevent that. This updates the code to set TIF_SIGPENDING on a new task if there are signals or other activities that have moved so that they appear to happen after the fork. As the code today restarts if it sees any such activity this won't immediately have an effect, as there will be no reason for it to set TIF_SIGPENDING immediately after the fork. Adding calculate_sigpending means the code in fork can safely be changed to not always restart if a signal is pending. The new calculate_sigpending function sets sigpending if there are pending bits in jobctl, pending signals, the freezer needs to freeze the new task or the live kernel patching framework need the new thread to take the slow path to userspace. I have verified that setting TIF_SIGPENDING does make a new process take the slow path to userspace before it executes it's first userspace instruction. I have looked at the callers of signal_wake_up and the code paths setting TIF_SIGPENDING and I don't see anything else that needs to be handled. The code probably doesn't need to set TIF_SIGPENDING for the kernel live patching as it uses a separate thread flag as well. But at this point it seems safer reuse the recalc_sigpending logic and get the kernel live patching folks to sort out their story later. V2: I have moved the test into schedule_tail where siglock can be grabbed and recalc_sigpending can be reused directly. Further as the last action of setting up a new task this guarantees that TIF_SIGPENDING will be properly set in the new process. The helper calculate_sigpending takes the siglock and uncontitionally sets TIF_SIGPENDING and let's recalc_sigpending clear TIF_SIGPENDING if it is unnecessary. This allows reusing the existing code and keeps maintenance of the conditions simple. Oleg Nesterov suggested the movement and pointed out the need to take siglock if this code was going to be called while the new task is discoverable. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + kernel/sched/core.c | 2 ++ kernel/signal.c | 11 +++++++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 94558ffa82ab..b55fd293c1e5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -372,6 +372,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) */ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); +extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..3e4ed4b7aa2d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2813,6 +2813,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); + + calculate_sigpending(); } /* diff --git a/kernel/signal.c b/kernel/signal.c index dddbea558455..1e06f1eba363 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -172,6 +172,17 @@ void recalc_sigpending(void) } +void calculate_sigpending(void) +{ + /* Have any signals or users of TIF_SIGPENDING been delayed + * until after fork? + */ + spin_lock_irq(¤t->sighand->siglock); + set_tsk_thread_flag(current, TIF_SIGPENDING); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ -- cgit From 924de3b8c9410c404c6eda7abffd282b97b3ff7f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 13:38:00 -0500 Subject: fork: Have new threads join on-going signal group stops There are only two signals that are delivered to every member of a signal group: SIGSTOP and SIGKILL. Signal delivery requires every signal appear to be delivered either before or after a clone syscall. SIGKILL terminates the clone so does not need to be considered. Which leaves only SIGSTOP that needs to be considered when creating new threads. Today in the event of a group stop TIF_SIGPENDING will get set and the fork will restart ensuring the fork syscall participates in the group stop. A fork (especially of a process with a lot of memory) is one of the most expensive system so we really only want to restart a fork when necessary. It is easy so check to see if a SIGSTOP is ongoing and have the new thread join it immediate after the clone completes. Making it appear the clone completed happened just before the SIGSTOP. The calculate_sigpending function will see the bits set in jobctl and set TIF_SIGPENDING to ensure the new task takes the slow path to userspace. V2: The call to task_join_group_stop was moved before the new task is added to the thread group list. This should not matter as sighand->siglock is held over both the addition of the threads, the call to task_join_group_stop and do_signal_stop. But the change is trivial and it is one less thing to worry about when reading the code. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 ++ kernel/fork.c | 27 +++++++++++++++------------ kernel/signal.c | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b55fd293c1e5..ae2b0b81be25 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -385,6 +385,8 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) signal_wake_up_state(t, resume ? __TASK_TRACED : 0); } +void task_join_group_stop(struct task_struct *task); + #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on diff --git a/kernel/fork.c b/kernel/fork.c index 22d4cdb9a7ca..ab731e15a600 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1934,18 +1934,20 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). - */ - recalc_sigpending(); - if (signal_pending(current)) { - retval = -ERESTARTNOINTR; - goto bad_fork_cancel_cgroup; + if (!(clone_flags & CLONE_THREAD)) { + /* + * Process group and session signals need to be delivered to just the + * parent before the fork or both the parent and the child after the + * fork. Restart if a signal comes in before we add the new process to + * it's process group. + * A fatal signal pending means that current will exit, so the new + * thread can't slip out of an OOM kill (or normal SIGKILL). + */ + recalc_sigpending(); + if (signal_pending(current)) { + retval = -ERESTARTNOINTR; + goto bad_fork_cancel_cgroup; + } } @@ -1982,6 +1984,7 @@ static __latent_entropy struct task_struct *copy_process( current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); + task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, diff --git a/kernel/signal.c b/kernel/signal.c index 1e06f1eba363..9f0eafb6d474 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -373,6 +373,20 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +void task_join_group_stop(struct task_struct *task) +{ + /* Have the new thread join an on-going signal group stop */ + unsigned long jobctl = current->jobctl; + if (jobctl & JOBCTL_STOP_PENDING) { + struct signal_struct *sig = current->signal; + unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + if (task_set_jobctl_pending(task, signr | gstop)) { + sig->group_stop_count++; + } + } +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an -- cgit From cfd355145c32bb7ccb65fccbe2d67280dc2119e1 Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Fri, 3 Aug 2018 13:56:06 -0700 Subject: stop_machine: Atomically queue and wake stopper threads When cpu_stop_queue_work() releases the lock for the stopper thread that was queued into its wake queue, preemption is enabled, which leads to the following deadlock: CPU0 CPU1 sched_setaffinity(0, ...) __set_cpus_allowed_ptr() stop_one_cpu(0, ...) stop_two_cpus(0, 1, ...) cpu_stop_queue_work(0, ...) cpu_stop_queue_two_works(0, ..., 1, ...) -grabs lock for migration/0- -spins with preemption disabled, waiting for migration/0's lock to be released- -adds work items for migration/0 and queues migration/0 to its wake_q- -releases lock for migration/0 and preemption is enabled- -current thread is preempted, and __set_cpus_allowed_ptr has changed the thread's cpu allowed mask to CPU1 only- -acquires migration/0 and migration/1's locks- -adds work for migration/0 but does not add migration/0 to wake_q, since it is already in a wake_q- -adds work for migration/1 and adds migration/1 to its wake_q- -releases migration/0 and migration/1's locks, wakes migration/1, and enables preemption- -since migration/1 is requested to run, migration/1 begins to run and waits on migration/0, but migration/0 will never be able to run, since the thread that can wake it is affine to CPU1- Disable preemption in cpu_stop_queue_work() before queueing works for stopper threads, and queueing the stopper thread in the wake queue, to ensure that the operation of queueing the works and waking the stopper threads is atomic. Fixes: 0b26351b910f ("stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock") Signed-off-by: Prasad Sodagudi Signed-off-by: Isaac J. Manjarres Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: matt@codeblueprint.co.uk Cc: bigeasy@linutronix.de Cc: gregkh@linuxfoundation.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1533329766-4856-1-git-send-email-isaacm@codeaurora.org Co-Developed-by: Isaac J. Manjarres --- kernel/stop_machine.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..69eb76daed34 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -81,6 +81,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; bool enabled; + preempt_disable(); raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) @@ -90,6 +91,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_unlock_irqrestore(&stopper->lock, flags); wake_up_q(&wakeq); + preempt_enable(); return enabled; } -- cgit From 82837ad5bda79cd13d64b9abad92e7872a06bc65 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 21:12:00 -0500 Subject: PM / hibernate: Mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This addresses Coverity-ID: 114713 ("Missing break in switch"). Signed-off-by: Gustavo A. R. Silva Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9c85c7822383..38f3f96b4b12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -638,6 +638,7 @@ static void power_down(void) break; case HIBERNATION_PLATFORM: hibernation_platform_enter(); + /* Fall through */ case HIBERNATION_SHUTDOWN: if (pm_power_off) kernel_power_off(); -- cgit From 55f2503c3b69328735e88031ff8d6ba291bd952b Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Tue, 31 Jul 2018 16:51:32 +0800 Subject: PM / reboot: Eliminate race between reboot and suspend At present, "systemctl suspend" and "shutdown" can run in parrallel. A system can suspend after devices_shutdown(), and resume. Then the shutdown task goes on to power off. This causes many devices are not really shut off. Hence replacing reboot_mutex with system_transition_mutex (renamed from pm_mutex) to achieve the exclusion. The renaming of pm_mutex as system_transition_mutex can be better to reflect the purpose of the mutex. Signed-off-by: Pingfan Liu Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/power/freezing-of-tasks.txt | 12 ++++++------ Documentation/power/suspend-and-cpuhotplug.txt | 6 +++--- include/linux/suspend.h | 2 +- kernel/freezer.c | 4 +++- kernel/power/hibernate.c | 15 ++++++++------- kernel/power/main.c | 12 ++++++------ kernel/power/suspend.c | 4 ++-- kernel/power/user.c | 4 ++-- kernel/reboot.c | 6 +++--- mm/page_alloc.c | 11 ++++++----- 10 files changed, 40 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index af005770e767..cd283190855a 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt @@ -204,26 +204,26 @@ VI. Are there any precautions to be taken to prevent freezing failures? Yes, there are. -First of all, grabbing the 'pm_mutex' lock to mutually exclude a piece of code +First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code from system-wide sleep such as suspend/hibernation is not encouraged. If possible, that piece of code must instead hook onto the suspend/hibernation notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code (kernel/cpu.c) for an example. -However, if that is not feasible, and grabbing 'pm_mutex' is deemed necessary, -it is strongly discouraged to directly call mutex_[un]lock(&pm_mutex) since +However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary, +it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since that could lead to freezing failures, because if the suspend/hibernate code -successfully acquired the 'pm_mutex' lock, and hence that other entity failed +successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE state. As a consequence, the freezer would not be able to freeze that task, leading to freezing failure. However, the [un]lock_system_sleep() APIs are safe to use in this scenario, since they ask the freezer to skip freezing this task, since it is anyway -"frozen enough" as it is blocked on 'pm_mutex', which will be released +"frozen enough" as it is blocked on 'system_transition_mutex', which will be released only after the entire suspend/hibernation sequence is complete. So, to summarize, use [un]lock_system_sleep() instead of directly using -mutex_[un]lock(&pm_mutex). That would prevent freezing failures. +mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures. V. Miscellaneous /sys/power/pm_freeze_timeout controls how long it will cost at most to freeze diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt index 6f55eb960a6d..a8751b8df10e 100644 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ b/Documentation/power/suspend-and-cpuhotplug.txt @@ -32,7 +32,7 @@ More details follow: sysfs file | v - Acquire pm_mutex lock + Acquire system_transition_mutex lock | v Send PM_SUSPEND_PREPARE @@ -96,10 +96,10 @@ execution during resume): * thaw tasks * send PM_POST_SUSPEND notifications -* Release pm_mutex lock. +* Release system_transition_mutex lock. -It is to be noted here that the pm_mutex lock is acquired at the very +It is to be noted here that the system_transition_mutex lock is acquired at the very beginning, when we are just starting out to suspend, and then released only after the entire cycle is complete (i.e., suspend + resume). diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 440b62f7502e..5a28ac9284f0 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -414,7 +414,7 @@ static inline bool hibernation_available(void) { return false; } #define PM_RESTORE_PREPARE 0x0005 /* Going to restore a saved image */ #define PM_POST_RESTORE 0x0006 /* Restore failed */ -extern struct mutex pm_mutex; +extern struct mutex system_transition_mutex; #ifdef CONFIG_PM_SLEEP void save_processor_state(void); diff --git a/kernel/freezer.c b/kernel/freezer.c index 6f56a9e219fa..b162b74611e4 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -15,7 +15,9 @@ atomic_t system_freezing_cnt = ATOMIC_INIT(0); EXPORT_SYMBOL(system_freezing_cnt); -/* indicate whether PM freezing is in effect, protected by pm_mutex */ +/* indicate whether PM freezing is in effect, protected by + * system_transition_mutex + */ bool pm_freezing; bool pm_nosig_freezing; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 38f3f96b4b12..abef759de7c8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,7 +338,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. + * This routine must be called with system_transition_mutex held. */ int hibernation_snapshot(int platform_mode) { @@ -500,8 +500,9 @@ static int resume_target_kernel(bool platform_mode) * hibernation_restore - Quiesce devices and restore from a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snapshot(). + * This routine must be called with system_transition_mutex held. If it is + * successful, control reappears in the restored target kernel in + * hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { @@ -806,13 +807,13 @@ static int software_resume(void) * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock + * calling hibernate functions (which take system_transition_mutex) + * this can cause lockdep to complain about a possible ABBA deadlock * which cannot happen since we're in the boot code here and * sysfs can't be invoked yet. Therefore, we use a subclass * here to avoid lockdep complaining. */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); if (swsusp_resume_device) goto Check_image; @@ -900,7 +901,7 @@ static int software_resume(void) atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: diff --git a/kernel/power/main.c b/kernel/power/main.c index d9706da10930..35b50823d83b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,17 +15,16 @@ #include #include #include +#include #include "power.h" -DEFINE_MUTEX(pm_mutex); - #ifdef CONFIG_PM_SLEEP void lock_system_sleep(void) { current->flags |= PF_FREEZER_SKIP; - mutex_lock(&pm_mutex); + mutex_lock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(lock_system_sleep); @@ -37,8 +36,9 @@ void unlock_system_sleep(void) * * Reason: * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the pm_mutex lock, - * since the freezer always works with pm_mutex held. + * doesn't come into effect until we release the + * system_transition_mutex lock, since the freezer always works with + * system_transition_mutex held. * * More importantly, in the case of hibernation, * unlock_system_sleep() gets called in snapshot_read() and @@ -47,7 +47,7 @@ void unlock_system_sleep(void) * enter the refrigerator, thus causing hibernation to lockup. */ current->flags &= ~PF_FREEZER_SKIP; - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 87331565e505..9e13afe65a14 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -556,7 +556,7 @@ static int enter_state(suspend_state_t state) } else if (!valid_state(state)) { return -EINVAL; } - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; if (state == PM_SUSPEND_TO_IDLE) @@ -590,7 +590,7 @@ static int enter_state(suspend_state_t state) pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index abd225550271..2d8b60a3c86b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -216,7 +216,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; lock_device_hotplug(); @@ -394,7 +394,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } unlock_device_hotplug(); - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/reboot.c b/kernel/reboot.c index e4ced883d8de..8fb44dec9ad7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -294,7 +294,7 @@ void kernel_power_off(void) } EXPORT_SYMBOL_GPL(kernel_power_off); -static DEFINE_MUTEX(reboot_mutex); +DEFINE_MUTEX(system_transition_mutex); /* * Reboot system call: for obvious reasons only root may call it, @@ -338,7 +338,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = LINUX_REBOOT_CMD_HALT; - mutex_lock(&reboot_mutex); + mutex_lock(&system_transition_mutex); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: kernel_restart(NULL); @@ -389,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, ret = -EINVAL; break; } - mutex_unlock(&reboot_mutex); + mutex_unlock(&system_transition_mutex); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1521100f1e63..e47f9e467408 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with pm_mutex held (gfp_allowed_mask also should - * only be modified with pm_mutex held, unless the suspend/hibernate code is - * guaranteed not to run in parallel with that modification). + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; @@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void) void pm_restrict_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); -- cgit From da5b3ebb4527733299661229a8d035d64a4f0b1a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 5 Aug 2018 20:40:49 -0700 Subject: tracing: irqsoff: Account for additional preempt_disable Recently we tried to make the preemptirqsoff tracer to use irqsoff tracepoint probes. However this causes issues as reported by Masami: [2.271078] Testing tracer preemptirqsoff: .. no entries found ..FAILED! [2.381015] WARNING: CPU: 0 PID: 1 at /home/mhiramat/ksrc/linux/kernel/ trace/trace.c:1512 run_tracer_selftest+0xf3/0x154 This is due to the tracepoint code increasing the preempt nesting count by calling an additional preempt_disable before calling into the preemptoff tracer which messes up the preempt_count() check in tracer_hardirqs_off. To fix this, make the irqsoff tracer probes balance the additional outer preempt_disable with a preempt_enable_notrace. The other way to fix this is to just use SRCU for all tracepoints. However we can't do that because we can't use NMIs from RCU context. Link: http://lkml.kernel.org/r/20180806034049.67949-1-joel@joelfernandes.org Fixes: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") Fixes: e6753f23d961 ("tracepoint: Make rcuidle tracepoint callers use SRCU") Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_irqsoff.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 770cd30cda40..ffbf1505d5bc 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -603,14 +603,40 @@ static void irqsoff_tracer_stop(struct trace_array *tr) */ static void tracer_hardirqs_on(void *none, unsigned long a0, unsigned long a1) { + /* + * Tracepoint probes are expected to be called with preempt disabled, + * We don't care about being called with preempt disabled but we need + * to know in the future if that changes so we can remove the next + * preempt_enable. + */ + WARN_ON_ONCE(!preempt_count()); + + /* Tracepoint probes disable preemption atleast once, account for that */ + preempt_enable_notrace(); + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); + + preempt_disable_notrace(); } static void tracer_hardirqs_off(void *none, unsigned long a0, unsigned long a1) { + /* + * Tracepoint probes are expected to be called with preempt disabled, + * We don't care about being called with preempt disabled but we need + * to know in the future if that changes so we can remove the next + * preempt_enable. + */ + WARN_ON_ONCE(!preempt_count()); + + /* Tracepoint probes disable preemption atleast once, account for that */ + preempt_enable_notrace(); + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); + + preempt_disable_notrace(); } static int irqsoff_tracer_init(struct trace_array *tr) -- cgit From bc2d8d262cba5736332cbc866acb11b1c5748aa9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Aug 2018 08:19:57 +0200 Subject: cpu/hotplug: Fix SMT supported evaluation Josh reported that the late SMT evaluation in cpu_smt_state_init() sets cpu_smt_control to CPU_SMT_NOT_SUPPORTED in case that 'nosmt' was supplied on the kernel command line as it cannot differentiate between SMT disabled by BIOS and SMT soft disable via 'nosmt'. That wreckages the state and makes the sysfs interface unusable. Rework this so that during bringup of the non boot CPUs the availability of SMT is determined in cpu_smt_allowed(). If a newly booted CPU is not a 'primary' thread then set the local cpu_smt_available marker and evaluate this explicitely right after the initial SMP bringup has finished. SMT evaulation on x86 is a trainwreck as the firmware has all the information _before_ booting the kernel, but there is no interface to query it. Fixes: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") Reported-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 2 +- include/linux/cpu.h | 2 ++ kernel/cpu.c | 41 ++++++++++++++++++++++++++++------------- kernel/smp.c | 2 ++ 4 files changed, 33 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 86f4549b7c9f..be79b7599dda 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,7 +63,7 @@ void __init check_bugs(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_check_topology(); + cpu_smt_check_topology_early(); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ccdf3a67ce56..b216bd5bfd20 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -178,10 +178,12 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); +extern void cpu_smt_check_topology_early(void); extern void cpu_smt_check_topology(void); #else # define cpu_smt_control (CPU_SMT_ENABLED) static inline void cpu_smt_disable(bool force) { } +static inline void cpu_smt_check_topology_early(void) { } static inline void cpu_smt_check_topology(void) { } #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 2bc3d2f5b2a5..9d2512dd263c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -347,6 +347,8 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable); enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); +static bool cpu_smt_available __read_mostly; + void __init cpu_smt_disable(bool force) { if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || @@ -363,14 +365,28 @@ void __init cpu_smt_disable(bool force) /* * The decision whether SMT is supported can only be done after the full - * CPU identification. Called from architecture code. + * CPU identification. Called from architecture code before non boot CPUs + * are brought up. */ -void __init cpu_smt_check_topology(void) +void __init cpu_smt_check_topology_early(void) { if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; } +/* + * If SMT was disabled by BIOS, detect it here, after the CPUs have been + * brought online. This ensures the smt/l1tf sysfs entries are consistent + * with reality. cpu_smt_available is set to true during the bringup of non + * boot CPUs when a SMT sibling is detected. Note, this may overwrite + * cpu_smt_control's previous setting. + */ +void __init cpu_smt_check_topology(void) +{ + if (!cpu_smt_available) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; +} + static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); @@ -380,10 +396,18 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (topology_is_primary_thread(cpu)) return true; - if (topology_is_primary_thread(cpu)) + /* + * If the CPU is not a 'primary' thread and the booted_once bit is + * set then the processor has SMT support. Store this information + * for the late check of SMT support in cpu_smt_check_topology(). + */ + if (per_cpu(cpuhp_state, cpu).booted_once) + cpu_smt_available = true; + + if (cpu_smt_control == CPU_SMT_ENABLED) return true; /* @@ -2125,15 +2149,6 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_state_init(void) { - /* - * If SMT was disabled by BIOS, detect it here, after the CPUs have - * been brought online. This ensures the smt/l1tf sysfs entries are - * consistent with reality. Note this may overwrite cpu_smt_control's - * previous setting. - */ - if (topology_max_smt_threads() == 1) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; - return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } diff --git a/kernel/smp.c b/kernel/smp.c index 084c8b3a2681..d86eec5f51c1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -584,6 +584,8 @@ void __init smp_init(void) num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? "s" : "")); + /* Final decision about SMT support */ + cpu_smt_check_topology(); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } -- cgit From 85fc4b16aaf05fc8978d242c556a1711dce15cf8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Aug 2018 14:27:28 -0700 Subject: bpf: introduce update_effective_progs() __cgroup_bpf_attach() and __cgroup_bpf_detach() functions have a good amount of duplicated code, which is possible to eliminate by introducing the update_effective_progs() helper function. The update_effective_progs() calls compute_effective_progs() and then in case of success it calls activate_effective_progs() for each descendant cgroup. In case of failure (OOM), it releases allocated prog arrays and return the error code. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 99 ++++++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a4fe5a7dc91..6a7d931bbc55 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -177,6 +177,45 @@ cleanup: return -ENOMEM; } +static int update_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type) +{ + struct cgroup_subsys_state *css; + int err; + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return err; +} + #define BPF_CGROUP_MAX_PROGS 64 /** @@ -194,7 +233,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage, *old_storage = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -261,22 +299,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = flags; - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); if (old_storage) @@ -289,16 +314,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and cleanup the prog list */ pl->prog = old_prog; bpf_cgroup_storage_free(pl->storage); @@ -326,7 +341,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; int err; @@ -365,22 +379,9 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; } - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; /* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -396,16 +397,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and restore back old_prog */ pl->prog = old_prog; return err; -- cgit From 5121700b346b6160ccc9411194e3f1f417c340d1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 8 Aug 2018 19:23:13 +0200 Subject: bpf, sockmap: fix bpf_tcp_sendmsg sock error handling While working on bpf_tcp_sendmsg() code, I noticed that when a sk->sk_err is set we error out with err = sk->sk_err. However this is problematic since sk->sk_err is a positive error value and therefore we will neither go into sk_stream_error() nor will we report an error back to user space. I had this case with EPIPE and user space was thinking sendmsg() succeeded since EPIPE is a positive value, thinking we submitted 32 bytes. Fix it by negating the sk->sk_err value. Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 98fb7938beea..f7360c4d7250 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1053,7 +1053,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int copy; if (sk->sk_err) { - err = sk->sk_err; + err = -sk->sk_err; goto out_err; } -- cgit From 7c81c71730456845e6212dccbf00098faa66740f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 8 Aug 2018 19:23:14 +0200 Subject: bpf, sockmap: fix leak in bpf_tcp_sendmsg wait for mem path In bpf_tcp_sendmsg() the sk_alloc_sg() may fail. In the case of ENOMEM, it may also mean that we've partially filled the scatterlist entries with pages. Later jumping to sk_stream_wait_memory() we could further fail with an error for several reasons, however we miss to call free_start_sg() if the local sk_msg_buff was used. Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f7360c4d7250..c4d75c52b4fc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1048,7 +1048,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { - struct sk_msg_buff *m; + struct sk_msg_buff *m = NULL; bool enospc = false; int copy; @@ -1116,8 +1116,11 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); - if (err) + if (err) { + if (m && m != psock->cork) + free_start_sg(sk, m); goto out_err; + } } out_err: if (err < 0) -- cgit From c3ad2c3b02e953ead2b8d52a0c9e70312930c3d0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 15:20:37 -0500 Subject: signal: Don't restart fork when signals come in. Wen Yang and majiang report that a periodic signal received during fork can cause fork to continually restart preventing an application from making progress. The code was being overly pessimistic. Fork needs to guarantee that a signal sent to multiple processes is logically delivered before the fork and just to the forking process or logically delivered after the fork to both the forking process and it's newly spawned child. For signals like periodic timers that are always delivered to a single process fork can safely complete and let them appear to logically delivered after the fork(). While examining this issue I also discovered that fork today will miss signals delivered to multiple processes during the fork and handled by another thread. Similarly the current code will also miss blocked signals that are delivered to multiple process, as those signals will not appear pending during fork. Add a list of each thread that is currently forking, and keep on that list a signal set that records all of the signals sent to multiple processes. When fork completes initialize the new processes shared_pending signal set with it. The calculate_sigpending function will see those signals and set TIF_SIGPENDING causing the new task to take the slow path to userspace to handle those signals. Making it appear as if those signals were received immediately after the fork. It is not possible to send real time signals to multiple processes and exceptions don't go to multiple processes, which means that that are no signals sent to multiple processes that require siginfo. This means it is safe to not bother collecting siginfo on signals sent during fork. The sigaction of a child of fork is initially the same as the sigaction of the parent process. So a signal the parent ignores the child will also initially ignore. Therefore it is safe to ignore signals sent to multiple processes and ignored by the forking process. Signals sent to only a single process or only a single thread and delivered during fork are treated as if they are received after the fork, and generally not dealt with. They won't cause any problems. V2: Added removal from the multiprocess list on failure. V3: Use -ERESTARTNOINTR directly V4: - Don't queue both SIGCONT and SIGSTOP - Initialize signal_struct.multiprocess in init_task - Move setting of shared_pending to before the new task is visible to signals. This prevents signals from comming in before shared_pending.signal is set to delayed.signal and being lost. V5: - rework list add and delete to account for idle threads v6: - Use sigdelsetmask when removing stop signals Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200447 Reported-by: Wen Yang and Reported-by: majiang Fixes: 4a2c7a7837da ("[PATCH] make fork() atomic wrt pgrp/session signals") Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 8 ++++++++ init/init_task.c | 1 + kernel/fork.c | 43 +++++++++++++++++++++++++------------------ kernel/signal.c | 15 +++++++++++++++ 4 files changed, 49 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ae2b0b81be25..4e9b77fb702d 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -69,6 +69,11 @@ struct thread_group_cputimer { bool checking_timer; }; +struct multiprocess_signals { + sigset_t signal; + struct hlist_node node; +}; + /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always @@ -90,6 +95,9 @@ struct signal_struct { /* shared signal handling: */ struct sigpending shared_pending; + /* For collecting multiprocess signals during fork */ + struct hlist_head multiprocess; + /* thread group exit support */ int group_exit_code; /* overloaded: diff --git a/init/init_task.c b/init/init_task.c index 4f97846256d7..5aebe3be4d7c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,6 +22,7 @@ static struct signal_struct init_signals = { .list = LIST_HEAD_INIT(init_signals.shared_pending.list), .signal = {{0}} }, + .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), #ifdef CONFIG_POSIX_TIMERS diff --git a/kernel/fork.c b/kernel/fork.c index ab731e15a600..411e34acace7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1456,6 +1456,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); + INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); @@ -1602,6 +1603,7 @@ static __latent_entropy struct task_struct *copy_process( { int retval; struct task_struct *p; + struct multiprocess_signals delayed; /* * Don't allow sharing the root directory with processes in a different @@ -1649,6 +1651,24 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + /* + * Force any signals received before this point to be delivered + * before the fork happens. Collect up signals sent to multiple + * processes that happen during the fork and delay them so that + * they appear to happen after the fork. + */ + sigemptyset(&delayed.signal); + INIT_HLIST_NODE(&delayed.node); + + spin_lock_irq(¤t->sighand->siglock); + if (!(clone_flags & CLONE_THREAD)) + hlist_add_head(&delayed.node, ¤t->signal->multiprocess); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + retval = -ERESTARTNOINTR; + if (signal_pending(current)) + goto fork_out; + retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) @@ -1934,22 +1954,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - if (!(clone_flags & CLONE_THREAD)) { - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). - */ - recalc_sigpending(); - if (signal_pending(current)) { - retval = -ERESTARTNOINTR; - goto bad_fork_cancel_cgroup; - } - } - init_task_pid_links(p); if (likely(p->pid)) { @@ -1965,7 +1969,7 @@ static __latent_entropy struct task_struct *copy_process( ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } - + p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same @@ -1993,8 +1997,8 @@ static __latent_entropy struct task_struct *copy_process( attach_pid(p, PIDTYPE_PID); nr_threads++; } - total_forks++; + hlist_del_init(&delayed.node); spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); @@ -2059,6 +2063,9 @@ bad_fork_free: put_task_stack(p); free_task(p); fork_out: + spin_lock_irq(¤t->sighand->siglock); + hlist_del_init(&delayed.node); + spin_unlock_irq(¤t->sighand->siglock); return ERR_PTR(retval); } diff --git a/kernel/signal.c b/kernel/signal.c index 9f0eafb6d474..cfa9d10e731a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1121,6 +1121,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); + + /* Let multiprocess signals appear after on-going forks */ + if (type > PIDTYPE_TGID) { + struct multiprocess_signals *delayed; + hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { + sigset_t *signal = &delayed->signal; + /* Can't queue both a stop and a continue signal */ + if (sig == SIGCONT) + sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); + else if (sig_kernel_stop(sig)) + sigdelset(signal, SIGCONT); + sigaddset(signal, sig); + } + } + complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); -- cgit From ad0ab027fc6da08cbd34070d816ff3b7986c64ae Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 8 Aug 2018 23:00:34 +0200 Subject: xdp: fix bug in cpumap teardown code path When removing a cpumap entry, a number of syncronization steps happen. Eventually the teardown code __cpu_map_entry_free is invoked from/via call_rcu. The teardown code __cpu_map_entry_free() flushes remaining xdp_frames, by invoking bq_flush_to_queue, which calls xdp_return_frame_rx_napi(). The issues is that the teardown code is not running in the RX NAPI code path. Thus, it is not allowed to invoke the NAPI variant of xdp_return_frame. This bug was found and triggered by using the --stress-mode option to the samples/bpf program xdp_redirect_cpu. It is hard to trigger, because the ptr_ring have to be full and cpumap bulk queue max contains 8 packets, and a remote CPU is racing to empty the ptr_ring queue. Fixes: 389ab7f01af9 ("xdp: introduce xdp_return_frame_rx_napi") Tested-by: Jean-Tsung Hsiao Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0918d180f08..46f5f29605d4 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -69,7 +69,7 @@ struct bpf_cpu_map { }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq); + struct xdp_bulk_queue *bq, bool in_napi_ctx); static u64 cpu_map_bitmap_size(const union bpf_attr *attr) { @@ -375,7 +375,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -558,7 +558,7 @@ const struct bpf_map_ops cpu_map_ops = { }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq) + struct xdp_bulk_queue *bq, bool in_napi_ctx) { unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; @@ -578,7 +578,10 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame_rx_napi(xdpf); + if (likely(in_napi_ctx)) + xdp_return_frame_rx_napi(xdpf); + else + xdp_return_frame(xdpf); } processed++; } @@ -598,7 +601,7 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -661,7 +664,7 @@ void __cpu_map_flush(struct bpf_map *map) /* Flush all frames in bulkq to real queue */ bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(rcpu->kthread); -- cgit From 1bf9116d0866a649104a5dfa008c302ad54d1e02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 8 Aug 2018 23:00:45 +0200 Subject: xdp: fix bug in devmap teardown code path Like cpumap teardown, the devmap teardown code also flush remaining xdp_frames, via bq_xmit_all() in case map entry is removed. The code can call xdp_return_frame_rx_napi, from the the wrong context, in-case ndo_xdp_xmit() fails. Fixes: 389ab7f01af9 ("xdp: introduce xdp_return_frame_rx_napi") Fixes: 735fc4054b3a ("xdp: change ndo_xdp_xmit API to support bulking") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d361fc1e3bf3..750d45edae79 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -217,7 +217,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) } static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags) + struct xdp_bulk_queue *bq, u32 flags, + bool in_napi_ctx) { struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; @@ -254,7 +255,10 @@ error: struct xdp_frame *xdpf = bq->q[i]; /* RX path under NAPI protection, can return frames faster */ - xdp_return_frame_rx_napi(xdpf); + if (likely(in_napi_ctx)) + xdp_return_frame_rx_napi(xdpf); + else + xdp_return_frame(xdpf); drops++; } goto out; @@ -286,7 +290,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); } } @@ -316,7 +320,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0); + bq_xmit_all(obj, bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -385,7 +389,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) __clear_bit(dev->bit, bitmap); bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } } } -- cgit From dc1508a579e682a1e5f1ed0753390e0aa7c23a97 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Aug 2018 08:55:19 -0700 Subject: bpf: fix bpffs non-array map seq_show issue In function map_seq_next() of kernel/bpf/inode.c, the first key will be the "0" regardless of the map type. This works for array. But for hash type, if it happens key "0" is in the map, the bpffs map show will miss some items if the key "0" is not the first element of the first bucket. This patch fixed the issue by guaranteeing to get the first element, if the seq_show is just started, by passing NULL pointer key to map_get_next_key() callback. This way, no missing elements will occur for bpffs hash table show even if key "0" is in the map. Fixes: a26ca7c982cb5 ("bpf: btf: Add pretty print support to the basic arraymap") Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 76efe9a183f5..fc5b103512e7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; + void *prev_key; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) - goto done; + prev_key = NULL; + else + prev_key = key; - if (map->ops->map_get_next_key(map, key, key)) { + if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; return NULL; } -done: ++(*pos); return key; } -- cgit From 699c86d6ec21d0f885d12800249d138659de8489 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Aug 2018 08:55:20 -0700 Subject: bpf: btf: add pretty print for hash/lru_hash maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") added pretty print support to array map. This patch adds pretty print for hash and lru_hash maps. The following example shows the pretty-print result of a pinned hashmap: struct map_value { int count_a; int count_b; }; cat /sys/fs/bpf/pinned_hash_map: 87907: {87907,87908} 57354: {37354,57355} 76625: {76625,76626} ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 513d9dfcf4ee..d6110042e0d9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -11,9 +11,11 @@ * General Public License for more details. */ #include +#include #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -1162,6 +1164,44 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } +static void htab_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = htab_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1171,6 +1211,8 @@ const struct bpf_map_ops htab_map_ops = { .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, + .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1182,6 +1224,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, + .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ -- cgit From bff1b208a5d1dbb2355822ef859edcb9be0379e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Aug 2018 15:50:58 -0400 Subject: tracing: Partial revert of "tracing: Centralize preemptirq tracepoints and unify their usage" Joel Fernandes created a nice patch that cleaned up the duplicate hooks used by lockdep and irqsoff latency tracer. It made both use tracepoints. But it caused lockdep to trigger several false positives. We have not figured out why yet, but removing lockdep from using the trace event hooks and just call its helper functions directly (like it use to), makes the problem go away. This is a partial revert of the clean up patch c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") that adds direct calls for lockdep, but also keeps most of the clean up done to get rid of the horrible preprocessor if statements. Link: http://lkml.kernel.org/r/20180806155058.5ee875f4@gandalf.local.home Cc: Peter Zijlstra Reviewed-by: Joel Fernandes (Google) Fixes: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") Signed-off-by: Steven Rostedt (VMware) --- include/linux/irqflags.h | 8 ++++++-- include/linux/lockdep.h | 2 -- init/main.c | 2 -- kernel/locking/lockdep.c | 14 ++------------ kernel/trace/trace_preemptirq.c | 36 ++++++++++++++++++++---------------- 5 files changed, 28 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 50edb9cbbd26..21619c92c377 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -19,9 +19,13 @@ #ifdef CONFIG_PROVE_LOCKING extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void lockdep_hardirqs_on(unsigned long ip); + extern void lockdep_hardirqs_off(unsigned long ip); #else -# define trace_softirqs_on(ip) do { } while (0) -# define trace_softirqs_off(ip) do { } while (0) + static inline void trace_softirqs_on(unsigned long ip) { } + static inline void trace_softirqs_off(unsigned long ip) { } + static inline void lockdep_hardirqs_on(unsigned long ip) { } + static inline void lockdep_hardirqs_off(unsigned long ip) { } #endif #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a8113357ceeb..b0d0b51c4d85 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -267,7 +267,6 @@ struct held_lock { * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); -extern void lockdep_init_early(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); @@ -408,7 +407,6 @@ static inline void lockdep_on(void) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) -# define lockdep_init_early() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) diff --git a/init/main.c b/init/main.c index 44fe43be84c1..5d42e577643a 100644 --- a/init/main.c +++ b/init/main.c @@ -649,8 +649,6 @@ asmlinkage __visible void __init start_kernel(void) call_function_init(); WARN(!irqs_disabled(), "Interrupts were enabled early\n"); - lockdep_init_early(); - early_boot_irqs_disabled = false; local_irq_enable(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 03bfaeb9f4e6..e406c5fdb41e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2840,8 +2840,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -static void lockdep_hardirqs_on(void *none, unsigned long ignore, - unsigned long ip) +void lockdep_hardirqs_on(unsigned long ip) { if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2885,8 +2884,7 @@ static void lockdep_hardirqs_on(void *none, unsigned long ignore, /* * Hardirqs were disabled: */ -static void lockdep_hardirqs_off(void *none, unsigned long ignore, - unsigned long ip) +void lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; @@ -4315,14 +4313,6 @@ out_restore: raw_local_irq_restore(flags); } -void __init lockdep_init_early(void) -{ -#ifdef CONFIG_PROVE_LOCKING - register_trace_prio_irq_disable(lockdep_hardirqs_off, NULL, INT_MAX); - register_trace_prio_irq_enable(lockdep_hardirqs_on, NULL, INT_MIN); -#endif -} - void __init lockdep_init(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index e76b78bf258e..fa656b25f427 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,41 +19,45 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on(void) { - if (!this_cpu_read(tracing_irq_cpu)) - return; + if (this_cpu_read(tracing_irq_cpu)) { + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } - trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - this_cpu_write(tracing_irq_cpu, 0); + lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { - if (this_cpu_read(tracing_irq_cpu)) - return; + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + } - this_cpu_write(tracing_irq_cpu, 1); - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { - if (!this_cpu_read(tracing_irq_cpu)) - return; + if (this_cpu_read(tracing_irq_cpu)) { + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + this_cpu_write(tracing_irq_cpu, 0); + } - trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); - this_cpu_write(tracing_irq_cpu, 0); + lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - if (this_cpu_read(tracing_irq_cpu)) - return; + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + } - this_cpu_write(tracing_irq_cpu, 1); - trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ -- cgit From f27107fa20ad531ace5fd580473ff8dd0c6b9ca9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 7 Aug 2018 17:03:12 -0400 Subject: tracing/irqsoff: Handle preempt_count for different configs I was hitting the following warning: WARNING: CPU: 0 PID: 1 at kernel/trace/trace_irqsoff.c:631 tracer_hardirqs_off+0x15/0x2a Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.18.0-rc6-test+ #13 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 EIP: tracer_hardirqs_off+0x15/0x2a Code: ff 85 c0 74 0e 8b 45 00 8b 50 04 8b 45 04 e8 35 ff ff ff 5d c3 55 64 a1 cc 37 51 c1 a9 ff ff ff 7f 89 e5 53 89 d3 89 ca 75 02 <0f> 0b e8 90 fc ff ff 85 c0 74 07 89 d8 e8 0c ff ff ff 5b 5d c3 55 EAX: 80000000 EBX: c04337f0 ECX: c04338e3 EDX: c04338e3 ESI: c04337f0 EDI: c04338e3 EBP: f2aa1d68 ESP: f2aa1d64 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210046 CR0: 80050033 CR2: 00000000 CR3: 01668000 CR4: 001406f0 Call Trace: trace_irq_disable_rcuidle+0x63/0x6c trace_hardirqs_off+0x26/0x30 default_send_IPI_mask_allbutself_logical+0x31/0x93 default_send_IPI_allbutself+0x37/0x48 native_send_call_func_ipi+0x4d/0x6a smp_call_function_many+0x165/0x19d ? add_nops+0x34/0x34 ? trace_hardirqs_on_caller+0x2d/0x2d ? add_nops+0x34/0x34 smp_call_function+0x1f/0x23 on_each_cpu+0x15/0x43 ? trace_hardirqs_on_caller+0x2d/0x2d ? trace_hardirqs_on_caller+0x2d/0x2d ? trace_irq_disable_rcuidle+0x1/0x6c text_poke_bp+0xa0/0xc2 ? trace_hardirqs_on_caller+0x2d/0x2d arch_jump_label_transform+0xa7/0xcb ? trace_irq_disable_rcuidle+0x5/0x6c __jump_label_update+0x3e/0x6d jump_label_update+0x7d/0x81 static_key_slow_inc_cpuslocked+0x58/0x6d static_key_slow_inc+0x19/0x20 tracepoint_probe_register_prio+0x19e/0x1d1 ? start_critical_timings+0x1c/0x1c tracepoint_probe_register+0xf/0x11 irqsoff_tracer_init+0x21/0xf2 tracer_init+0x16/0x1a trace_selftest_startup_irqsoff+0x25/0xc4 run_tracer_selftest+0xca/0x131 register_tracer+0xd5/0x172 ? trace_event_define_fields_preemptirq_template+0x45/0x45 init_irqsoff_tracer+0xd/0x11 do_one_initcall+0xab/0x1e8 ? rcu_read_lock_sched_held+0x3d/0x44 ? trace_initcall_level+0x52/0x86 kernel_init_freeable+0x195/0x21a ? rest_init+0xb4/0xb4 kernel_init+0xd/0xe4 ret_from_fork+0x2e/0x38 It is due to running a CONFIG_PREEMPT_VOLUNTARY kernel, which would trigger this warning every time: WARN_ON_ONCE(preempt_count()); Because on CONFIG_PREEMPT_VOLUNTARY, preempt_count() is always zero. This warning is to make sure preempt_count is set because tracer_hardirqs_on() does a preempt_enable_notrace() to make the preempt_trace() work properly, as being called by a trace event, the trace event code disables preemption, and the tracer wants to know what the preemption was before it was called. Instead of enabling preemption like this, just record the preempt_count, subtract PREEMPT_DISABLE_OFFSET from it (which is zero with !CONFIG_PREEMPT set), and pass that value to the necessary functions, which should use the passed in parameter instead of calling preempt_count() directly. Fixes: da5b3ebb45277 ("tracing: irqsoff: Account for additional preempt_disable") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_irqsoff.c | 66 +++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ffbf1505d5bc..4af990e9c594 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -40,12 +40,12 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph); #ifdef CONFIG_PREEMPT_TRACER static inline int -preempt_trace(void) +preempt_trace(int pc) { - return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); + return ((trace_type & TRACER_PREEMPT_OFF) && pc); } #else -# define preempt_trace() (0) +# define preempt_trace(pc) (0) #endif #ifdef CONFIG_IRQSOFF_TRACER @@ -366,7 +366,7 @@ out: } static inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip) +start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; struct trace_array *tr = irqsoff_trace; @@ -394,7 +394,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); per_cpu(tracing_cpu, cpu) = 1; @@ -402,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) } static inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip) +stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; struct trace_array *tr = irqsoff_trace; @@ -428,7 +428,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -437,15 +437,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) /* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) { - if (preempt_trace() || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + int pc = preempt_count(); + + if (preempt_trace(pc) || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { - if (preempt_trace() || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + int pc = preempt_count(); + + if (preempt_trace(pc) || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(stop_critical_timings); @@ -603,40 +607,40 @@ static void irqsoff_tracer_stop(struct trace_array *tr) */ static void tracer_hardirqs_on(void *none, unsigned long a0, unsigned long a1) { + unsigned int pc = preempt_count(); + /* * Tracepoint probes are expected to be called with preempt disabled, * We don't care about being called with preempt disabled but we need * to know in the future if that changes so we can remove the next * preempt_enable. */ - WARN_ON_ONCE(!preempt_count()); - - /* Tracepoint probes disable preemption atleast once, account for that */ - preempt_enable_notrace(); + WARN_ON_ONCE(pc < PREEMPT_DISABLE_OFFSET); - if (!preempt_trace() && irq_trace()) - stop_critical_timing(a0, a1); + /* Use PREEMPT_DISABLE_OFFSET to handle !CONFIG_PREEMPT cases */ + pc -= PREEMPT_DISABLE_OFFSET; - preempt_disable_notrace(); + if (!preempt_trace(pc) && irq_trace()) + stop_critical_timing(a0, a1, pc); } static void tracer_hardirqs_off(void *none, unsigned long a0, unsigned long a1) { + unsigned int pc = preempt_count(); + /* * Tracepoint probes are expected to be called with preempt disabled, * We don't care about being called with preempt disabled but we need * to know in the future if that changes so we can remove the next * preempt_enable. */ - WARN_ON_ONCE(!preempt_count()); - - /* Tracepoint probes disable preemption atleast once, account for that */ - preempt_enable_notrace(); + WARN_ON_ONCE(pc < PREEMPT_DISABLE_OFFSET); - if (!preempt_trace() && irq_trace()) - start_critical_timing(a0, a1); + /* Use PREEMPT_DISABLE_OFFSET to handle !CONFIG_PREEMPT cases */ + pc -= PREEMPT_DISABLE_OFFSET; - preempt_disable_notrace(); + if (!preempt_trace(pc) && irq_trace()) + start_critical_timing(a0, a1, pc); } static int irqsoff_tracer_init(struct trace_array *tr) @@ -679,14 +683,18 @@ static struct tracer irqsoff_tracer __read_mostly = #ifdef CONFIG_PREEMPT_TRACER static void tracer_preempt_on(void *none, unsigned long a0, unsigned long a1) { - if (preempt_trace() && !irq_trace()) - stop_critical_timing(a0, a1); + int pc = preempt_count(); + + if (preempt_trace(pc) && !irq_trace()) + stop_critical_timing(a0, a1, pc); } static void tracer_preempt_off(void *none, unsigned long a0, unsigned long a1) { - if (preempt_trace() && !irq_trace()) - start_critical_timing(a0, a1); + int pc = preempt_count(); + + if (preempt_trace(pc) && !irq_trace()) + start_critical_timing(a0, a1, pc); } static int preemptoff_tracer_init(struct trace_array *tr) -- cgit From 3f1756dc210e5abb37121da3e7c10d65920f6ec0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 8 Aug 2018 21:28:05 -0400 Subject: tracing: More reverting of "tracing: Centralize preemptirq tracepoints and unify their usage" Joel Fernandes created a nice patch that cleaned up the duplicate hooks used by lockdep and irqsoff latency tracer. It made both use tracepoints. But the latency tracer is triggering warnings when using tracepoints to call into the latency tracer's routines. Mainly, they can be called from NMI context. If that happens, then the SRCU may not work properly because on some architectures, SRCU is not safe to be called in both NMI and non-NMI context. This is a partial revert of the clean up patch c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") that adds back the direct calls into the latency tracer. It also only calls the trace events when not in NMI. Link: http://lkml.kernel.org/r/20180809210654.622445925@goodmis.org Reviewed-by: Joel Fernandes (Google) Fixes: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 15 +++++++++++++ kernel/trace/trace_irqsoff.c | 48 ++++------------------------------------- kernel/trace/trace_preemptirq.c | 25 +++++++++++++++------ 3 files changed, 38 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d88cd9bb72f4..a62b678731e3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1827,6 +1827,21 @@ static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) } #endif +#ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1); +void tracer_preempt_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif +#ifdef CONFIG_IRQSOFF_TRACER +void tracer_hardirqs_on(unsigned long a0, unsigned long a1); +void tracer_hardirqs_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } +#endif + extern struct trace_iterator *tracepoint_print_iter; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 4af990e9c594..94c1ba139b3b 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -605,40 +605,18 @@ static void irqsoff_tracer_stop(struct trace_array *tr) /* * We are only interested in hardirq on/off events: */ -static void tracer_hardirqs_on(void *none, unsigned long a0, unsigned long a1) +void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { unsigned int pc = preempt_count(); - /* - * Tracepoint probes are expected to be called with preempt disabled, - * We don't care about being called with preempt disabled but we need - * to know in the future if that changes so we can remove the next - * preempt_enable. - */ - WARN_ON_ONCE(pc < PREEMPT_DISABLE_OFFSET); - - /* Use PREEMPT_DISABLE_OFFSET to handle !CONFIG_PREEMPT cases */ - pc -= PREEMPT_DISABLE_OFFSET; - if (!preempt_trace(pc) && irq_trace()) stop_critical_timing(a0, a1, pc); } -static void tracer_hardirqs_off(void *none, unsigned long a0, unsigned long a1) +void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { unsigned int pc = preempt_count(); - /* - * Tracepoint probes are expected to be called with preempt disabled, - * We don't care about being called with preempt disabled but we need - * to know in the future if that changes so we can remove the next - * preempt_enable. - */ - WARN_ON_ONCE(pc < PREEMPT_DISABLE_OFFSET); - - /* Use PREEMPT_DISABLE_OFFSET to handle !CONFIG_PREEMPT cases */ - pc -= PREEMPT_DISABLE_OFFSET; - if (!preempt_trace(pc) && irq_trace()) start_critical_timing(a0, a1, pc); } @@ -647,15 +625,11 @@ static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; - register_trace_irq_disable(tracer_hardirqs_off, NULL); - register_trace_irq_enable(tracer_hardirqs_on, NULL); return __irqsoff_tracer_init(tr); } static void irqsoff_tracer_reset(struct trace_array *tr) { - unregister_trace_irq_disable(tracer_hardirqs_off, NULL); - unregister_trace_irq_enable(tracer_hardirqs_on, NULL); __irqsoff_tracer_reset(tr); } @@ -681,7 +655,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -static void tracer_preempt_on(void *none, unsigned long a0, unsigned long a1) +void tracer_preempt_on(unsigned long a0, unsigned long a1) { int pc = preempt_count(); @@ -689,7 +663,7 @@ static void tracer_preempt_on(void *none, unsigned long a0, unsigned long a1) stop_critical_timing(a0, a1, pc); } -static void tracer_preempt_off(void *none, unsigned long a0, unsigned long a1) +void tracer_preempt_off(unsigned long a0, unsigned long a1) { int pc = preempt_count(); @@ -701,15 +675,11 @@ static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; - register_trace_preempt_disable(tracer_preempt_off, NULL); - register_trace_preempt_enable(tracer_preempt_on, NULL); return __irqsoff_tracer_init(tr); } static void preemptoff_tracer_reset(struct trace_array *tr) { - unregister_trace_preempt_disable(tracer_preempt_off, NULL); - unregister_trace_preempt_enable(tracer_preempt_on, NULL); __irqsoff_tracer_reset(tr); } @@ -740,21 +710,11 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - register_trace_irq_disable(tracer_hardirqs_off, NULL); - register_trace_irq_enable(tracer_hardirqs_on, NULL); - register_trace_preempt_disable(tracer_preempt_off, NULL); - register_trace_preempt_enable(tracer_preempt_on, NULL); - return __irqsoff_tracer_init(tr); } static void preemptirqsoff_tracer_reset(struct trace_array *tr) { - unregister_trace_irq_disable(tracer_hardirqs_off, NULL); - unregister_trace_irq_enable(tracer_hardirqs_on, NULL); - unregister_trace_preempt_disable(tracer_preempt_off, NULL); - unregister_trace_preempt_enable(tracer_preempt_on, NULL); - __irqsoff_tracer_reset(tr); } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index fa656b25f427..71f553cceb3c 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@ #include #include #include +#include "trace.h" #define CREATE_TRACE_POINTS #include @@ -20,7 +21,9 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -32,7 +35,9 @@ void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); } lockdep_hardirqs_off(CALLER_ADDR0); @@ -42,7 +47,9 @@ EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (this_cpu_read(tracing_irq_cpu)) { - trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + if (!in_nmi()) + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + tracer_hardirqs_on(CALLER_ADDR0, caller_addr); this_cpu_write(tracing_irq_cpu, 0); } @@ -54,7 +61,9 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); - trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + tracer_hardirqs_off(CALLER_ADDR0, caller_addr); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } lockdep_hardirqs_off(CALLER_ADDR0); @@ -66,11 +75,15 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace_preempt_enable_rcuidle(a0, a1); + if (!in_nmi()) + trace_preempt_enable_rcuidle(a0, a1); + tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace_preempt_disable_rcuidle(a0, a1); + if (!in_nmi()) + trace_preempt_disable_rcuidle(a0, a1); + tracer_preempt_off(a0, a1); } #endif -- cgit From b207de3ec531ff364843708c9fea968700aae8fe Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 9 Aug 2018 13:56:09 +0100 Subject: ftrace: Remove unused pointer ftrace_swapper_pid Pointer ftrace_swapper_pid is defined but is never used hence it is redundant and can be removed. The use of this variable was removed in commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do"). Cleans up clang warning: warning: 'ftrace_swapper_pid' defined but not used [-Wunused-const-variable=] Link: http://lkml.kernel.org/r/20180809125609.13142-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d795193024b..48b5b466ec7a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,8 +1020,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; #else -- cgit From e0a568dcd18bdaa77877d558700ce4d3bbbb12b8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Aug 2018 15:31:48 -0400 Subject: tracing: Fix synchronizing to event changes with tracepoint_synchronize_unregister() Now that some trace events can be protected by srcu_read_lock(tracepoint_srcu), we need to make sure all locations that depend on this are also protected. There were many places that did a synchronize_sched() thinking that it was enough to protect againts access to trace events. This use to be the case, but now that we use SRCU for _rcuidle() trace events, they may not be protected by synchronize_sched(), as they may be called in paths that RCU is not watching for preempt disable. Fixes: e6753f23d961d ("tracepoint: Make rcuidle tracepoint callers use SRCU") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 8 ++++---- kernel/trace/trace_events_filter.c | 15 ++++++++------- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_events_trigger.c | 6 ++++-- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7b508ce8ac44..808cf29febe2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -636,7 +636,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) rcu_assign_pointer(tr->filtered_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + tracepoint_synchronize_unregister(); trace_free_pid_list(pid_list); } @@ -1622,7 +1622,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, } if (filtered_pids) { - synchronize_sched(); + tracepoint_synchronize_unregister(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* @@ -3036,8 +3036,8 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); - /* Access to events are within rcu_read_lock_sched() */ - synchronize_sched(); + /* Make sure no more events are being executed */ + tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 893a206bcba4..184c7685d5ea 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -899,7 +899,8 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!filter) return 1; - prog = rcu_dereference_sched(filter->prog); + /* Protected by either SRCU(tracepoint_srcu) or preempt_disable */ + prog = rcu_dereference_raw(filter->prog); if (!prog) return 1; @@ -1626,10 +1627,10 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() to ensure all calls are + * Do a synchronize_sched() and to ensure all calls are * done with them before we free them. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); @@ -1648,7 +1649,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, kfree(filter); /* If any call succeeded, we still need to sync */ if (!fail) - synchronize_sched(); + tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); @@ -1790,7 +1791,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); /* Make sure the filter is not being used */ - synchronize_sched(); + tracepoint_synchronize_unregister(); __free_filter(filter); return 0; @@ -1817,7 +1818,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) if (tmp) { /* Make sure the call is done with the filter */ - synchronize_sched(); + tracepoint_synchronize_unregister(); __free_filter(tmp); } } @@ -1847,7 +1848,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - synchronize_sched(); + tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index aae18af94c94..c522b51d9909 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5141,7 +5141,7 @@ static void hist_clear(struct event_trigger_data *data) if (data->name) pause_named_trigger(data); - synchronize_sched(); + tracepoint_synchronize_unregister(); tracing_map_clear(hist_data->map); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 58d21fd52932..750044ef15e8 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -34,7 +34,9 @@ void trigger_data_free(struct event_trigger_data *data) if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - synchronize_sched(); /* make sure current triggers exit before free */ + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + kfree(data); } @@ -752,7 +754,7 @@ int set_trigger_filter(char *filter_str, if (tmp) { /* Make sure the call is done with the filter */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_event_filter(tmp); } -- cgit From 016f8ffc48cb01d1e7701649c728c5d2e737d295 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Aug 2018 15:37:59 -0400 Subject: uprobes: Use synchronize_rcu() not synchronize_sched() While debugging another bug, I was looking at all the synchronize*() functions being used in kernel/trace, and noticed that trace_uprobes was using synchronize_sched(), with a comment to synchronize with {u,ret}_probe_trace_func(). When looking at those functions, the data is protected with "rcu_read_lock()" and not with "rcu_read_lock_sched()". This is using the wrong synchronize_*() function. Link: http://lkml.kernel.org/r/20180809160553.469e1e32@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 70ed91c6ec7f8 ("tracing/uprobes: Support ftrace_event_file base multibuffer") Acked-by: Oleg Nesterov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index bf89a51e740d..ac02fafc9f1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -952,7 +952,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) list_del_rcu(&link->list); /* synchronize with u{,ret}probe_trace_func */ - synchronize_sched(); + synchronize_rcu(); kfree(link); if (!list_empty(&tu->tp.files)) -- cgit From f8a79d5c7ef47c62d97a30e16064caf2ef91f648 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 10 Aug 2018 12:17:50 -0400 Subject: tracepoints: Free early tracepoints after RCU is initialized When enabling trace events via the kernel command line, I hit this warning: WARNING: CPU: 0 PID: 13 at kernel/rcu/srcutree.c:236 check_init_srcu_struct+0xe/0x61 Modules linked in: CPU: 0 PID: 13 Comm: watchdog/0 Not tainted 4.18.0-rc6-test+ #6 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 RIP: 0010:check_init_srcu_struct+0xe/0x61 Code: 48 c7 c6 ec 8a 65 b4 e8 ff 79 fe ff 48 89 df 31 f6 e8 f2 fa ff ff 5a 5b 41 5c 5d c3 0f 1f 44 00 00 83 3d 68 94 b8 01 01 75 02 <0f> 0b 48 8b 87 f0 0a 00 00 a8 03 74 45 55 48 89 e5 41 55 41 54 4c RSP: 0000:ffff96eb9ea03e68 EFLAGS: 00010246 RAX: ffff96eb962b5b01 RBX: ffffffffb4a87420 RCX: 0000000000000001 RDX: ffffffffb3107969 RSI: ffff96eb962b5b40 RDI: ffffffffb4a87420 RBP: ffff96eb9ea03eb0 R08: ffffabbd00cd7f48 R09: 0000000000000000 R10: ffff96eb9ea03e68 R11: ffffffffb4a6eec0 R12: ffff96eb962b5b40 R13: ffff96eb9ea03ef8 R14: ffffffffb3107969 R15: ffffffffb3107948 FS: 0000000000000000(0000) GS:ffff96eb9ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff96eb13ab2000 CR3: 0000000192a1e001 CR4: 00000000001606f0 Call Trace: ? __call_srcu+0x2d/0x290 ? rcu_process_callbacks+0x26e/0x448 ? allocate_probes+0x2b/0x2b call_srcu+0x13/0x15 rcu_free_old_probes+0x1f/0x21 rcu_process_callbacks+0x2ed/0x448 __do_softirq+0x172/0x336 irq_exit+0x62/0xb2 smp_apic_timer_interrupt+0x161/0x19e apic_timer_interrupt+0xf/0x20 The problem is that the enabling of trace events before RCU is set up will cause SRCU to give this warning. To avoid this, add a list to store probes that need to be freed till after RCU is initialized, and then free them then. Link: http://lkml.kernel.org/r/20180810113554.1df28050@gandalf.local.home Link: http://lkml.kernel.org/r/20180810123517.5e9714ad@gandalf.local.home Acked-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Fixes: e6753f23d961d ("tracepoint: Make rcuidle tracepoint callers use SRCU") Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 955148d91b74..96db841bf0fc 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -53,6 +53,9 @@ static LIST_HEAD(tracepoint_module_list); */ static DEFINE_MUTEX(tracepoints_mutex); +static struct rcu_head *early_probes; +static bool ok_to_free_tracepoints; + /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent @@ -80,11 +83,40 @@ static void rcu_free_old_probes(struct rcu_head *head) call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); } +static __init int release_early_probes(void) +{ + struct rcu_head *tmp; + + ok_to_free_tracepoints = true; + + while (early_probes) { + tmp = early_probes; + early_probes = tmp->next; + call_rcu_sched(tmp, rcu_free_old_probes); + } + + return 0; +} + +/* SRCU is initialized at core_initcall */ +postcore_initcall(release_early_probes); + static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); + + /* + * We can't free probes if SRCU is not initialized yet. + * Postpone the freeing till after SRCU is initialized. + */ + if (unlikely(!ok_to_free_tracepoints)) { + tp_probes->rcu.next = early_probes; + early_probes = &tp_probes->rcu; + return; + } + /* * Tracepoint probes are protected by both sched RCU and SRCU, * by calling the SRCU callback in the sched RCU callback we -- cgit From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:24 -0700 Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decision. In this case, decide which SO_REUSEPORT sk to serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control and visibility on where a SO_REUSEPORT sk should be located in a bpf map. The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sk serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection info contact/API stays within the userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. The old process can finish draining the pending requests (e.g. by "accept()") before closing the old-fds. [Note that deleting a fd from a bpf map does not necessary mean the fd is closed] During map_update_elem(), Only SO_REUSEPORT sk (i.e. which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that is "bind()" for UDP or "bind()+listen()" for TCP. These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage in the BPF side. When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"), it will notify the bpf map to remove it from the map also. It is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->sock[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in-sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the adding sk passes some strict tests. and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64bits but it goes against the logical userspace expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also). It may catch user in surprise if we enforce value_size=8 while userspace still pass a 32bits fd during update. Supporting different value_size between lookup and update seems unintuitive also. We also need to consider what if other existing fd based maps want to return 64bits value from syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, and assuming user will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will will only return 64bits value from sock_gen_cookie() when user consciously choose value_size=8 (as a signal that lookup is desired) which then requires a 64bits value in both lookup and update. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 28 ++++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/arraymap.c | 2 +- kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + net/core/sock_reuseport.c | 8 + 8 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/reuseport_array.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..db11662faea6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..14fd6c02d258 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd5758dc35d3..40f584bc7da0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e8906cbad81f..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030c77..f6ca3e712831 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include +#include +#include +#include + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. + */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. + */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. + */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5af4e9e2722d..57f4d076141b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index cf2e4d305af9..8235f2439816 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -186,6 +186,14 @@ void reuseport_detach_sock(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* At least one of the sk in this reuseport group is added to + * a bpf map. Notify the bpf side. The bpf map logic will + * remove the sk if it is indeed added to a bpf map. + */ + if (reuse->reuseport_id) + bpf_sk_reuseport_detach(sk); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); for (i = 0; i < reuse->num_socks; i++) { -- cgit From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not clear if the lower layer is only ipv4 and ipv6 in the future and will it not touch the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above reason, if sk->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 bytes cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and setting the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also inline with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantic similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bind-ed to a local INANY address which cannot be learned from skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is adding to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked in run time in "sk_select_reuseport()". Doing the check in verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. reuseport_id not match). The bpf prog can decide if it wants to do SK_DROP as its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it does , it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 3 + include/linux/filter.h | 15 +++ include/net/addrconf.h | 1 + include/net/sock_reuseport.h | 6 +- include/uapi/linux/bpf.h | 36 +++++- kernel/bpf/verifier.c | 9 ++ net/core/filter.c | 269 +++++++++++++++++++++++++++++++++++++++- net/core/sock_reuseport.c | 20 ++- net/ipv4/inet_connection_sock.c | 9 ++ net/ipv4/inet_hashtables.c | 5 +- net/ipv4/udp.c | 5 +- 11 files changed, 365 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 14fd6c02d258..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b072dab32c0..70e9d57677fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5f43f7a70fe6..6def0351bcc3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index e1a7681856f7..73b569556be6 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -21,12 +21,14 @@ struct sock_reuseport { unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40f584bc7da0..3102a2a23c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -151,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2114,6 +2115,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2197,7 +2206,8 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2414,6 +2424,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 2de7dd9f2a57..142595b4e0d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -7013,3 +7013,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. + */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. + */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 8235f2439816..d260167f5f77 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. + */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -136,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3647167c8fa3..370e24463fb7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -567,10 +567,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 060e841dde40..038dd7909051 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** -- cgit From 5820f140edef111a9ea2ef414ab2428b8cb805b1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 25 Jun 2018 18:34:19 +0200 Subject: userns: move user access out of the mutex The old code would hold the userns_state_mutex indefinitely if memdup_user_nul stalled due to e.g. a userfault region. Prevent that by moving the memdup_user_nul in front of the mutex_lock(). Note: This changes the error precedence of invalid buf/count/*ppos vs map already written / capabilities missing. Fixes: 22d917d80e84 ("userns: Rework the user_namespace adding uid/gid...") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Christian Brauner Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c3d7583fcd21..e5222b5fb4fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -859,7 +859,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; - ssize_t ret = -EINVAL; + ssize_t ret; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. @@ -895,19 +904,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Only allow < page size writes at the beginning of the file */ - ret = -EINVAL; - if ((*ppos != 0) || (count >= PAGE_SIZE)) - goto out; - - /* Slurp in the user data */ - kbuf = memdup_user_nul(buf, count); - if (IS_ERR(kbuf)) { - ret = PTR_ERR(kbuf); - kbuf = NULL; - goto out; - } - /* Parse the user data */ ret = -EINVAL; pos = kbuf; -- cgit From 42a0cc3478584d4d63f68f2f5af021ddbea771fa Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 25 Jun 2018 18:34:10 +0200 Subject: sys: don't hold uts_sem while accessing userspace memory Holding uts_sem as a writer while accessing userspace memory allows a namespace admin to stall all processes that attempt to take uts_sem. Instead, move data through stack buffers and don't access userspace memory while uts_sem is held. Cc: stable@vger.kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jann Horn Signed-off-by: Eric W. Biederman --- arch/alpha/kernel/osf_sys.c | 51 ++++++++++----------- arch/sparc/kernel/sys_sparc_32.c | 22 ++++++---- arch/sparc/kernel/sys_sparc_64.c | 20 +++++---- kernel/sys.c | 95 +++++++++++++++++++--------------------- kernel/utsname_sysctl.c | 41 ++++++++++------- 5 files changed, 119 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 6e921754c8fc..2a5c768df335 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -530,24 +530,19 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path, SYSCALL_DEFINE1(osf_utsname, char __user *, name) { int error; + char tmp[5 * 32]; down_read(&uts_sem); - error = -EFAULT; - if (copy_to_user(name + 0, utsname()->sysname, 32)) - goto out; - if (copy_to_user(name + 32, utsname()->nodename, 32)) - goto out; - if (copy_to_user(name + 64, utsname()->release, 32)) - goto out; - if (copy_to_user(name + 96, utsname()->version, 32)) - goto out; - if (copy_to_user(name + 128, utsname()->machine, 32)) - goto out; + memcpy(tmp + 0 * 32, utsname()->sysname, 32); + memcpy(tmp + 1 * 32, utsname()->nodename, 32); + memcpy(tmp + 2 * 32, utsname()->release, 32); + memcpy(tmp + 3 * 32, utsname()->version, 32); + memcpy(tmp + 4 * 32, utsname()->machine, 32); + up_read(&uts_sem); - error = 0; - out: - up_read(&uts_sem); - return error; + if (copy_to_user(name, tmp, sizeof(tmp))) + return -EFAULT; + return 0; } SYSCALL_DEFINE0(getpagesize) @@ -567,18 +562,21 @@ SYSCALL_DEFINE2(osf_getdomainname, char __user *, name, int, namelen) { int len, err = 0; char *kname; + char tmp[32]; - if (namelen > 32) + if (namelen < 0 || namelen > 32) namelen = 32; down_read(&uts_sem); kname = utsname()->domainname; len = strnlen(kname, namelen); - if (copy_to_user(name, kname, min(len + 1, namelen))) - err = -EFAULT; + len = min(len + 1, namelen); + memcpy(tmp, kname, len); up_read(&uts_sem); - return err; + if (copy_to_user(name, tmp, len)) + return -EFAULT; + return 0; } /* @@ -739,13 +737,14 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count) }; unsigned long offset; const char *res; - long len, err = -EINVAL; + long len; + char tmp[__NEW_UTS_LEN + 1]; offset = command-1; if (offset >= ARRAY_SIZE(sysinfo_table)) { /* Digital UNIX has a few unpublished interfaces here */ printk("sysinfo(%d)", command); - goto out; + return -EINVAL; } down_read(&uts_sem); @@ -753,13 +752,11 @@ SYSCALL_DEFINE3(osf_sysinfo, int, command, char __user *, buf, long, count) len = strlen(res)+1; if ((unsigned long)len > (unsigned long)count) len = count; - if (copy_to_user(buf, res, len)) - err = -EFAULT; - else - err = 0; + memcpy(tmp, res, len); up_read(&uts_sem); - out: - return err; + if (copy_to_user(buf, tmp, len)) + return -EFAULT; + return 0; } SYSCALL_DEFINE5(osf_getsysinfo, unsigned long, op, void __user *, buffer, diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 7f3d9c59719a..452e4d080855 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -197,23 +197,27 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, SYSCALL_DEFINE2(getdomainname, char __user *, name, int, len) { - int nlen, err; - + int nlen, err; + char tmp[__NEW_UTS_LEN + 1]; + if (len < 0) return -EINVAL; - down_read(&uts_sem); - + down_read(&uts_sem); + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) - goto out; + goto out_unlock; + memcpy(tmp, utsname()->domainname, nlen); - err = -EFAULT; - if (!copy_to_user(name, utsname()->domainname, nlen)) - err = 0; + up_read(&uts_sem); -out: + if (copy_to_user(name, tmp, nlen)) + return -EFAULT; + return 0; + +out_unlock: up_read(&uts_sem); return err; } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 63baa8aa9414..274ed0b9b3e0 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -519,23 +519,27 @@ asmlinkage void sparc_breakpoint(struct pt_regs *regs) SYSCALL_DEFINE2(getdomainname, char __user *, name, int, len) { - int nlen, err; + int nlen, err; + char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; - down_read(&uts_sem); - + down_read(&uts_sem); + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) - goto out; + goto out_unlock; + memcpy(tmp, utsname()->domainname, nlen); + + up_read(&uts_sem); - err = -EFAULT; - if (!copy_to_user(name, utsname()->domainname, nlen)) - err = 0; + if (copy_to_user(name, tmp, nlen)) + return -EFAULT; + return 0; -out: +out_unlock: up_read(&uts_sem); return err; } diff --git a/kernel/sys.c b/kernel/sys.c index 38509dc1f77b..69b9a37ecf0d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1237,18 +1237,19 @@ static int override_release(char __user *release, size_t len) SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { - int errno = 0; + struct new_utsname tmp; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!errno && override_release(name->release, sizeof(name->release))) - errno = -EFAULT; - if (!errno && override_architecture(name)) - errno = -EFAULT; - return errno; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME @@ -1257,55 +1258,46 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { - int error = 0; + struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - error = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - return error; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - int error; + struct oldold_utsname tmp = {}; if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); + memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); + memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); + memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); + memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); + memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - return error ? -EFAULT : 0; + if (override_architecture(name)) + return -EFAULT; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + return 0; } #endif @@ -1319,17 +1311,18 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } @@ -1337,8 +1330,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { - int i, errno; + int i; struct new_utsname *u; + char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; @@ -1347,11 +1341,11 @@ SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) i = 1 + strlen(u->nodename); if (i > len) i = len; - errno = 0; - if (copy_to_user(name, u->nodename, i)) - errno = -EFAULT; + memcpy(tmp, u->nodename, i); up_read(&uts_sem); - return errno; + if (copy_to_user(name, tmp, i)) + return -EFAULT; + return 0; } #endif @@ -1370,17 +1364,18 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 233cd8fc6910..258033d62cb3 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -18,7 +18,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(struct ctl_table *table, int write) +static void *get_uts(struct ctl_table *table) { char *which = table->data; struct uts_namespace *uts_ns; @@ -26,21 +26,9 @@ static void *get_uts(struct ctl_table *table, int write) uts_ns = current->nsproxy->uts_ns; which = (which - (char *)&init_uts_ns) + (char *)uts_ns; - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); return which; } -static void put_uts(struct ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? @@ -50,13 +38,34 @@ static int proc_do_uts_string(struct ctl_table *table, int write, { struct ctl_table uts_table; int r; + char tmp_data[__NEW_UTS_LEN + 1]; + memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); + uts_table.data = tmp_data; + + /* + * Buffer the value in tmp_data so that proc_dostring() can be called + * without holding any locks. + * We also need to read the original value in the write==1 case to + * support partial writes. + */ + down_read(&uts_sem); + memcpy(tmp_data, get_uts(table), sizeof(tmp_data)); + up_read(&uts_sem); r = proc_dostring(&uts_table, write, buffer, lenp, ppos); - put_uts(table, write, uts_table.data); - if (write) + if (write) { + /* + * Write back the new value. + * Note that, since we dropped uts_sem, the result can + * theoretically be incorrect if there are two parallel writes + * at non-zero offsets to the same sysctl. + */ + down_write(&uts_sem); + memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); + up_write(&uts_sem); proc_sys_poll_notify(table->poll); + } return r; } -- cgit From b5b1404d0815894de0690de8a1ab58269e56eae6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Aug 2018 12:19:42 -0700 Subject: init: rename and re-order boot_cpu_state_init() This is purely a preparatory patch for upcoming changes during the 4.19 merge window. We have a function called "boot_cpu_state_init()" that isn't really about the bootup cpu state: that is done much earlier by the similarly named "boot_cpu_init()" (note lack of "state" in name). This function initializes some hotplug CPU state, and needs to run after the percpu data has been properly initialized. It even has a comment to that effect. Except it _doesn't_ actually run after the percpu data has been properly initialized. On x86 it happens to do that, but on at least arm and arm64, the percpu base pointers are initialized by the arch-specific 'smp_prepare_boot_cpu()' hook, which ran _after_ boot_cpu_state_init(). This had some unexpected results, and in particular we have a patch pending for the merge window that did the obvious cleanup of using 'this_cpu_write()' in the cpu hotplug init code: - per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); which is obviously the right thing to do. Except because of the ordering issue, it actually failed miserably and unexpectedly on arm64. So this just fixes the ordering, and changes the name of the function to be 'boot_cpu_hotplug_init()' to make it obvious that it's about cpu hotplug state, because the core CPU state was supposed to have already been done earlier. Marked for stable, since the (not yet merged) patch that will show this problem is marked for stable. Reported-by: Vlastimil Babka Reported-by: Mian Yousaf Kaukab Suggested-by: Catalin Marinas Acked-by: Thomas Gleixner Cc: Will Deacon Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 2 +- init/main.c | 2 +- kernel/cpu.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a97a63eef59f..3233fbe23594 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -30,7 +30,7 @@ struct cpu { }; extern void boot_cpu_init(void); -extern void boot_cpu_state_init(void); +extern void boot_cpu_hotplug_init(void); extern void cpu_init(void); extern void trap_init(void); diff --git a/init/main.c b/init/main.c index 3b4ada11ed52..5e13c544bbf4 100644 --- a/init/main.c +++ b/init/main.c @@ -561,8 +561,8 @@ asmlinkage __visible void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); - boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + boot_cpu_hotplug_init(); build_all_zonelists(NULL); page_alloc_init(); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..2f8f338e77cf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2010,7 +2010,7 @@ void __init boot_cpu_init(void) /* * Must be called _AFTER_ setting up the per_cpu areas */ -void __init boot_cpu_state_init(void) +void __init boot_cpu_hotplug_init(void) { per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; } -- cgit From e8d2bec0457962e8f348a9a3627b398f7fe5c5fc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 12 Aug 2018 01:59:17 +0200 Subject: bpf: decouple btf from seq bpf fs dump and enable more maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for array and hash/lru map. However, both can be decoupled from each other such that regular BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The basic sanity check which is a prerequisite for all maps is that key/value size has to match in any case, and some maps can have extra checks via map_check_btf() callback, e.g. probing certain types or indicating no support in general. With that we can also enable retrieving BTF info for per-cpu map types and lpm. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song --- include/linux/bpf.h | 13 +++++++++---- kernel/bpf/arraymap.c | 26 ++++++++++++-------------- kernel/bpf/cpumap.c | 1 + kernel/bpf/devmap.c | 1 + kernel/bpf/hashtab.c | 20 +------------------- kernel/bpf/inode.c | 3 ++- kernel/bpf/local_storage.c | 1 + kernel/bpf/lpm_trie.c | 12 ++++++++++++ kernel/bpf/sockmap.c | 2 ++ kernel/bpf/stackmap.c | 1 + kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++++++---- kernel/bpf/xskmap.c | 3 +-- 12 files changed, 75 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db11662faea6..523481a3471b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,7 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; -struct btf; +struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -48,8 +48,9 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); + int (*map_check_btf)(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); }; struct bpf_map { @@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f6ca3e712831..0c17aab3ce5f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. */ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0918d180f08..3b494941a44a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d361fc1e3bf3..a7c6620d8389 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -484,6 +484,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d6110042e0d9..04b8eda94e7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1185,23 +1185,6 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) - return -EINVAL; - - return 0; -} - const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1212,7 +1195,6 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1225,7 +1207,6 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ @@ -1452,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fc5b103512e7..2ada5e21dfa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -334,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? + &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc4e37f68f2a..22ad967d1e5f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1603492c9cc7..9058317ba9de 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include +#include #include #include #include #include #include +#include /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -686,6 +688,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? + -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0b38be5a955c..e376ffffebce 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2495,6 +2495,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2505,6 +2506,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b675a3f3d141..8061a439ef18 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57f4d076141b..43727ed0d94a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -455,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -489,8 +518,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -504,8 +532,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..4ddf61e158f6 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - -- cgit From 93cb8e20d56be40c541475f77b5f565fbb385a4b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 1 Jul 2018 18:18:15 +0200 Subject: parisc: Drop architecture-specific ENOTSUP define parisc is the only Linux architecture which has defined a value for ENOTSUP. All other architectures #define ENOTSUP as EOPNOTSUPP in their libc headers. Having an own value for ENOTSUP which is different than EOPNOTSUPP often gives problems with userspace programs which expect both to be the same. One such example is a build error in the libuv package, as can be seen in https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=900237. Since we dropped HP-UX support, there is no real benefit in keeping an own value for ENOTSUP. This patch drops the parisc value for ENOTSUP from the kernel sources. glibc needs no patch, it reuses the exported headers. Signed-off-by: Helge Deller --- arch/parisc/include/uapi/asm/errno.h | 1 - kernel/time/posix-timers.c | 13 ++----------- tools/arch/parisc/include/uapi/asm/errno.h | 1 - 3 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index fc0df353ff0d..87245c584784 100644 --- a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -113,7 +113,6 @@ #define ELOOP 249 /* Too many symbolic links encountered */ #define ENOSYS 251 /* Function not implemented */ -#define ENOTSUP 252 /* Function not implemented (POSIX.4 / HPUX) */ #define ECANCELLED 253 /* aio request was canceled before complete (POSIX.4 / HPUX) */ #define ECANCELED ECANCELLED /* SuSv3 and Solaris wants one 'L' */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e08ce3f27447..66321cbdf90c 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -85,15 +85,6 @@ static const struct k_clock clock_realtime, clock_monotonic; #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * parisc wants ENOTSUP instead of EOPNOTSUPP - */ -#ifndef ENOTSUP -# define ENANOSLEEP_NOTSUP EOPNOTSUPP -#else -# define ENANOSLEEP_NOTSUP ENOTSUP -#endif - /* * The timer ID is turned into a timer address by idr_find(). * Verifying a valid ID consists of: @@ -1220,7 +1211,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!kc) return -EINVAL; if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; + return -EOPNOTSUPP; if (get_timespec64(&t, rqtp)) return -EFAULT; @@ -1247,7 +1238,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc) return -EINVAL; if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; + return -EOPNOTSUPP; if (compat_get_timespec64(&t, rqtp)) return -EFAULT; diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h index fc0df353ff0d..87245c584784 100644 --- a/tools/arch/parisc/include/uapi/asm/errno.h +++ b/tools/arch/parisc/include/uapi/asm/errno.h @@ -113,7 +113,6 @@ #define ELOOP 249 /* Too many symbolic links encountered */ #define ENOSYS 251 /* Function not implemented */ -#define ENOTSUP 252 /* Function not implemented (POSIX.4 / HPUX) */ #define ECANCELLED 253 /* aio request was canceled before complete (POSIX.4 / HPUX) */ #define ECANCELED ECANCELLED /* SuSv3 and Solaris wants one 'L' */ -- cgit From 38e967ae1e6022557e579f673fba544cf19cf1c6 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 9 Aug 2018 09:48:51 +0530 Subject: Uprobes: Simplify uprobe_register() body Simplify uprobe_register() function body and let __uprobe_register() handle everything. Also move dependency functions around to avoid build failures. Link: http://lkml.kernel.org/r/20180809041856.1547-2-ravi.bangoria@linux.ibm.com Reviewed-by: Song Liu Acked-by: Srikar Dronamraju Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/events/uprobes.c | 69 ++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ccc579a7d32e..471eac896635 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -840,13 +840,8 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) -{ - consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, uc); -} - -static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void +__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; @@ -860,24 +855,46 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u } /* - * uprobe_register - register a probe + * uprobe_unregister - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + + uprobe = find_uprobe(inode, offset); + if (WARN_ON(!uprobe)) + return; + + down_write(&uprobe->register_rwsem); + __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister); + +/* + * __uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @uc: information on howto handle the probe.. * - * Apart from the access refcount, uprobe_register() takes a creation + * Apart from the access refcount, __uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of uprobe_register() is required to keep @inode + * unregisters. Caller of __uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) */ -int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +static int __uprobe_register(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -904,7 +921,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * down_write(&uprobe->register_rwsem); ret = -EAGAIN; if (likely(uprobe_is_active(uprobe))) { - ret = __uprobe_register(uprobe, uc); + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); } @@ -915,6 +933,12 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * goto retry; return ret; } + +int uprobe_register(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, uc); +} EXPORT_SYMBOL_GPL(uprobe_register); /* @@ -946,27 +970,6 @@ int uprobe_apply(struct inode *inode, loff_t offset, return ret; } -/* - * uprobe_unregister - unregister a already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return; - - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); -} -EXPORT_SYMBOL_GPL(uprobe_unregister); - static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { struct vm_area_struct *vma; -- cgit From 6d43743e9079ac0531b60cde7eadd0f042873344 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 9 Aug 2018 09:48:52 +0530 Subject: Uprobe: Additional argument arch_uprobe to uprobe_write_opcode() Add addition argument 'arch_uprobe' to uprobe_write_opcode(). We need this in later set of patches. Link: http://lkml.kernel.org/r/20180809041856.1547-3-ravi.bangoria@linux.ibm.com Reviewed-by: Song Liu Acked-by: Srikar Dronamraju Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- arch/arm/probes/uprobes/core.c | 2 +- arch/mips/kernel/uprobes.c | 2 +- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 9 +++++---- 4 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index d1329f1ba4e4..bf992264060e 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -32,7 +32,7 @@ bool is_swbp_insn(uprobe_opcode_t *insn) int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, + return uprobe_write_opcode(auprobe, mm, vaddr, __opcode_to_mem_arm(auprobe->bpinsn)); } diff --git a/arch/mips/kernel/uprobes.c b/arch/mips/kernel/uprobes.c index f7a0645ccb82..4aaff3b3175c 100644 --- a/arch/mips/kernel/uprobes.c +++ b/arch/mips/kernel/uprobes.c @@ -224,7 +224,7 @@ unsigned long arch_uretprobe_hijack_return_addr( int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0a294e950df8..bb9d2084af03 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -121,7 +121,7 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 471eac896635..c0418ba52ba8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,8 +299,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, - uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; struct vm_area_struct *vma; @@ -351,7 +351,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -366,7 +366,8 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); + return uprobe_write_opcode(auprobe, mm, vaddr, + *(uprobe_opcode_t *)&auprobe->insn); } static struct uprobe *get_uprobe(struct uprobe *uprobe) -- cgit From 269777aa530f3438ec1781586cdac0b5fe47b061 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Wed, 15 Aug 2018 00:26:00 +0300 Subject: cpu/hotplug: Non-SMP machines do not make use of booted_once Commit 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") breaks non-SMP builds. [ I suspect the 'bool' fields should just be made to be bitfields and be exposed regardless of configuration, but that's a separate cleanup that I'll leave to the owners of this file for later. - Linus ] Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") Cc: Dave Hansen Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Abel Vesa Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 90ec528a6e32..ed44d7d34c2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2278,6 +2278,8 @@ void __init boot_cpu_init(void) */ void __init boot_cpu_hotplug_init(void) { +#ifdef CONFIG_SMP this_cpu_write(cpuhp_state.booted_once, true); +#endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit From 3df6f61fff49632492490fb6e42646b803a9958a Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Tue, 14 Aug 2018 10:34:42 +0800 Subject: PM / sleep: wakeup: Fix build error caused by missing SRCU support Commit ea0212f40c6 (power: auto select CONFIG_SRCU) made the code in drivers/base/power/wakeup.c use SRCU instead of RCU, but it forgot to select CONFIG_SRCU in Kconfig, which leads to the following build error if CONFIG_SRCU is not selected somewhere else: drivers/built-in.o: In function `wakeup_source_remove': (.text+0x3c6fc): undefined reference to `synchronize_srcu' drivers/built-in.o: In function `pm_print_active_wakeup_sources': (.text+0x3c7a8): undefined reference to `__srcu_read_lock' drivers/built-in.o: In function `pm_print_active_wakeup_sources': (.text+0x3c84c): undefined reference to `__srcu_read_unlock' drivers/built-in.o: In function `device_wakeup_arm_wake_irqs': (.text+0x3d1d8): undefined reference to `__srcu_read_lock' drivers/built-in.o: In function `device_wakeup_arm_wake_irqs': (.text+0x3d228): undefined reference to `__srcu_read_unlock' drivers/built-in.o: In function `device_wakeup_disarm_wake_irqs': (.text+0x3d24c): undefined reference to `__srcu_read_lock' drivers/built-in.o: In function `device_wakeup_disarm_wake_irqs': (.text+0x3d29c): undefined reference to `__srcu_read_unlock' drivers/built-in.o:(.data+0x4158): undefined reference to `process_srcu' Fix this error by selecting CONFIG_SRCU when PM_SLEEP is enabled. Fixes: ea0212f40c6 (power: auto select CONFIG_SRCU) Cc: 4.2+ # 4.2+ Signed-off-by: zhangyi (F) [ rjw: Minor subject/changelog fixups ] Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e880ca22c5a5..3a6c2f87699e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -105,6 +105,7 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM + select SRCU config PM_SLEEP_SMP def_bool y -- cgit From b96679422007c3fa04625be14977904c27c722eb Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 6 Jun 2018 15:54:09 +0300 Subject: kallsyms: Simplify update_iter_mod() The logic in update_iter_mod() is overcomplicated and gets worse every time another get_ksymbol_* function is added. In preparation for adding another get_ksymbol_* function, simplify logic in update_iter_mod(). Signed-off-by: Adrian Hunter Tested-by: (ftrace changes only) Steven Rostedt (VMware) Acked-by: Andi Kleen Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Dave Hansen Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Joerg Roedel Cc: Thomas Gleixner Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1528289651-4113-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/kallsyms.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a23e21ada81b..aa31aa07f2ef 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -506,27 +506,24 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) } } +/* + * The end position (last + 1) of each additional kallsyms section is recorded + * in iter->pos_..._end as each section is added, and so can be used to + * determine which get_ksymbol_...() function to call next. + */ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if (iter->pos_ftrace_mod_end > 0 && - iter->pos_ftrace_mod_end < iter->pos) - return get_ksymbol_bpf(iter); - - if (iter->pos_mod_end > 0 && - iter->pos_mod_end < iter->pos) { - if (!get_ksymbol_ftrace_mod(iter)) - return get_ksymbol_bpf(iter); + if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && + get_ksymbol_mod(iter)) return 1; - } - if (!get_ksymbol_mod(iter)) { - if (!get_ksymbol_ftrace_mod(iter)) - return get_ksymbol_bpf(iter); - } + if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && + get_ksymbol_ftrace_mod(iter)) + return 1; - return 1; + return get_ksymbol_bpf(iter); } /* Returns false if pos at or past end of file. */ -- cgit From d83212d5dd6761625fe87cc23016bbaa47303271 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Jun 2018 15:54:10 +0300 Subject: kallsyms, x86: Export addresses of PTI entry trampolines Currently, the addresses of PTI entry trampolines are not exported to user space. Kernel profiling tools need these addresses to identify the kernel code, so add a symbol and address for each CPU's PTI entry trampoline. Signed-off-by: Alexander Shishkin Acked-by: Andi Kleen Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Dave Hansen Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Joerg Roedel Cc: Thomas Gleixner Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1528289651-4113-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/mm/cpu_entry_area.c | 23 +++++++++++++++++++++++ kernel/kallsyms.c | 28 +++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index b45f5aaefd74..fab49fd5190f 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -150,6 +151,28 @@ static void __init setup_cpu_entry_area(int cpu) percpu_setup_debug_store(cpu); } +#ifdef CONFIG_X86_64 +int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name) +{ + unsigned int cpu, ncpu = 0; + + if (symnum >= num_possible_cpus()) + return -EINVAL; + + for_each_possible_cpu(cpu) { + if (ncpu++ >= symnum) + break; + } + + *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline; + *type = 't'; + strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN); + + return 0; +} +#endif + static __init void setup_cpu_entry_area_ptes(void) { #ifdef CONFIG_X86_32 diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index aa31aa07f2ef..02a0b01380d8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -432,6 +432,7 @@ int sprint_backtrace(char *buffer, unsigned long address) /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; unsigned long value; @@ -443,9 +444,29 @@ struct kallsym_iter { int show_value; }; +int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name) +{ + return -EINVAL; +} + +static int get_ksymbol_arch(struct kallsym_iter *iter) +{ + int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name); + + if (ret < 0) { + iter->pos_arch_end = iter->pos; + return 0; + } + + return 1; +} + static int get_ksymbol_mod(struct kallsym_iter *iter) { - int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + int ret = module_get_kallsym(iter->pos - iter->pos_arch_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); @@ -501,6 +522,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { + iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; } @@ -515,6 +537,10 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; + if ((!iter->pos_arch_end || iter->pos_arch_end > pos) && + get_ksymbol_arch(iter)) + return 1; + if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; -- cgit From 2f4df0017baedd535254d987c6b10f855f1fb11f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 6 Aug 2018 15:17:46 +0200 Subject: tracing: Add -mcount-nop option support -mcount-nop gcc option generates the calls to the profiling functions as nops which allows to avoid patching mcount jump with NOP instructions initially. -mcount-nop gcc option will be activated if platform selects HAVE_NOP_MCOUNT and gcc actually supports it. In addition to that CC_USING_NOP_MCOUNT is defined and could be used by architectures to adapt ftrace patching behavior. Link: http://lkml.kernel.org/r/patch-3.thread-aa7b8d.git-e02ed2dc082b.your-ad-here.call-01533557518-ext-9465@work.hours Signed-off-by: Vasily Gorbik Signed-off-by: Steven Rostedt (VMware) --- Makefile | 6 ++++++ kernel/trace/Kconfig | 5 +++++ kernel/trace/ftrace.c | 2 ++ 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/Makefile b/Makefile index 2c92ebf59a34..c5b0cd7f3c2a 100644 --- a/Makefile +++ b/Makefile @@ -749,6 +749,12 @@ ifdef CONFIG_FTRACE_MCOUNT_RECORD CC_FLAGS_FTRACE += -mrecord-mcount export CC_USING_RECORD_MCOUNT := 1 endif + ifdef CONFIG_HAVE_NOP_MCOUNT + ifeq ($(call cc-option-yn, -mnop-mcount),y) + CC_FLAGS_FTRACE += -mnop-mcount + CC_FLAGS_USING += -DCC_USING_NOP_MCOUNT + endif + endif endif ifdef CONFIG_HAVE_FENTRY ifeq ($(call cc-option-yn, -mfentry),y) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 036cec1fcd24..fd6754b88f87 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -47,6 +47,11 @@ config HAVE_FENTRY help Arch supports the gcc options -pg with -mfentry +config HAVE_NOP_MCOUNT + bool + help + Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount + config HAVE_C_RECORDMCOUNT bool help diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 48b5b466ec7a..468e8527e979 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2950,12 +2950,14 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; +#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. */ if (!ftrace_code_disable(mod, p)) break; +#endif update_cnt++; } -- cgit From 91c1e6ba39cf7f20d4b1c8510d51fff8d91f207c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Aug 2018 11:19:04 -0400 Subject: blktrace: Add SPDX License format header Add the SPDX License header to ease license compliance management. Acked-by: Jens Axboe Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/blktrace.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae283..b82e546083e1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Jens Axboe * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include #include -- cgit From 965931e3a803a506482616f89239eff6901c17b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Aug 2018 11:01:12 -0700 Subject: bpf: fix a rcu usage warning in bpf_prog_array_copy_core() Commit 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") refactored the bpf_prog_array_copy_core() to accommodate new structure bpf_prog_array_item which contains bpf_prog array itself. In the old code, we had perf_event_query_prog_array(): mutex_lock(...) bpf_prog_array_copy_call(): prog = rcu_dereference_check(array, 1)->progs bpf_prog_array_copy_core(prog, ...) mutex_unlock(...) With the above commit, we had perf_event_query_prog_array(): mutex_lock(...) bpf_prog_array_copy_call(): bpf_prog_array_copy_core(array, ...): item = rcu_dereference(array)->items; ... mutex_unlock(...) The new code will trigger a lockdep rcu checking warning. The fix is to change rcu_dereference() to rcu_dereference_check() to prevent such a warning. Reported-by: syzbot+6e72317008eef84a216b@syzkaller.appspotmail.com Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") Cc: Roman Gushchin Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4d09e610777f..3f5bf1af0826 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1579,7 +1579,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference(array)->items; + item = rcu_dereference_check(array, 1)->items; for (; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; -- cgit From 757d9140072054528b13bbe291583d9823cde195 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Aug 2018 16:08:37 -0400 Subject: tracing/blktrace: Fix to allow setting same value Masami Hiramatsu reported: Current trace-enable attribute in sysfs returns an error if user writes the same setting value as current one, e.g. # cat /sys/block/sda/trace/enable 0 # echo 0 > /sys/block/sda/trace/enable bash: echo: write error: Invalid argument # echo 1 > /sys/block/sda/trace/enable # echo 1 > /sys/block/sda/trace/enable bash: echo: write error: Device or resource busy But this is not a preferred behavior, it should ignore if new setting is same as current one. This fixes the problem as below. # cat /sys/block/sda/trace/enable 0 # echo 0 > /sys/block/sda/trace/enable # echo 1 > /sys/block/sda/trace/enable # echo 1 > /sys/block/sda/trace/enable Link: http://lkml.kernel.org/r/20180816103802.08678002@gandalf.local.home Cc: Ingo Molnar Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: stable@vger.kernel.org Fixes: cd649b8bb830d ("blktrace: remove sysfs_blk_trace_enable_show/store()") Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b951aa1fac61..96457ad8d720 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1841,6 +1841,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { + if (!!value == !!q->blk_trace) { + ret = 0; + goto out_unlock_bdev; + } if (value) ret = blk_trace_setup_queue(q, bdev); else -- cgit From d40b0116c94bd8fc2b63aae35ce8e66bb53bba42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:08 +0200 Subject: bpf, sockmap: fix leakage of smap_psock_map_entry While working on sockmap I noticed that we do not always kfree the struct smap_psock_map_entry list elements which track psocks attached to maps. In the case of sock_hash_ctx_update_elem(), these map entries are allocated outside of __sock_map_ctx_update_elem() with their linkage to the socket hash table filled. In the case of sock array, the map entries are allocated inside of __sock_map_ctx_update_elem() and added with their linkage to the psock->maps. Both additions are under psock->maps_lock each. Now, we drop these elements from their psock->maps list in a few occasions: i) in sock array via smap_list_map_remove() when an entry is either deleted from the map from user space, or updated via user space or BPF program where we drop the old socket at that map slot, or the sock array is freed via sock_map_free() and drops all its elements; ii) for sock hash via smap_list_hash_remove() in exactly the same occasions as just described for sock array; iii) in the bpf_tcp_close() where we remove the elements from the list via psock_map_pop() and iterate over them dropping themselves from either sock array or sock hash; and last but not least iv) once again in smap_gc_work() which is a callback for deferring the work once the psock refcount hit zero and thus the socket is being destroyed. Problem is that the only case where we kfree() the list entry is in case iv), which at that point should have an empty list in normal cases. So in cases from i) to iii) we unlink the elements without freeing where they go out of reach from us. Hence fix is to properly kfree() them as well to stop the leakage. Given these are all handled under psock->maps_lock there is no need for deferred RCU freeing. I later also ran with kmemleak detector and it confirmed the finding as well where in the state before the fix the object goes unreferenced while after the patch no kmemleak report related to BPF showed up. [...] unreferenced object 0xffff880378eadae0 (size 64): comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 50 4d 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 PMu]............ backtrace: [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210 [<0000000045dd6d3c>] bpf_sock_map_update+0x29/0x60 [<00000000877723aa>] ___bpf_prog_run+0x1e1f/0x4960 [<000000002ef89e83>] 0xffffffffffffffff unreferenced object 0xffff880378ead240 (size 64): comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 00 44 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 .Du]............ backtrace: [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210 [<0000000030e37a3a>] sock_map_update_elem+0x125/0x240 [<000000002e5ce36e>] map_update_elem+0x4eb/0x7b0 [<00000000db453cc9>] __x64_sys_bpf+0x1f9/0x360 [<0000000000763660>] do_syscall_64+0x9a/0x300 [<00000000422a2bb2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<000000002ef89e83>] 0xffffffffffffffff [...] Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Fixes: 54fedb42c653 ("bpf: sockmap, fix smap_list_map_remove when psock is in many maps") Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0c1a696b041b..94a324bc6afc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -370,6 +370,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } raw_spin_unlock_bh(&b->lock); } + kfree(e); e = psock_map_pop(sk, psock); } rcu_read_unlock(); @@ -1675,8 +1676,10 @@ static void smap_list_map_remove(struct smap_psock *psock, spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) + if (e->entry == entry) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } @@ -1690,8 +1693,10 @@ static void smap_list_hash_remove(struct smap_psock *psock, list_for_each_entry_safe(e, tmp, &psock->maps, list) { struct htab_elem *c = rcu_dereference(e->hash_link); - if (c == hash_link) + if (c == hash_link) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } -- cgit From 166ab6f0a0702fdd4d865ad5090bf3094ed83428 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:09 +0200 Subject: bpf, sockmap: fix map elem deletion race with smap_stop_sock The smap_start_sock() and smap_stop_sock() are each protected under the sock->sk_callback_lock from their call-sites except in the case of sock_map_delete_elem() where we drop the old socket from the map slot. This is racy because the same sock could be part of multiple sock maps, so we run smap_stop_sock() in parallel, and given at that point psock->strp_enabled might be true on both CPUs, we might for example wrongly restore the sk->sk_data_ready / sk->sk_write_space. Therefore, hold the sock->sk_callback_lock as well on delete. Looks like 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") had this right, but later on e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") removed it again from delete leaving this smap_stop_sock() instance unprotected. Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 94a324bc6afc..921cb6b8c862 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1786,8 +1786,11 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!psock) goto out; - if (psock->bpf_parse) + if (psock->bpf_parse) { + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); + } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: -- cgit From 585f5a6252ee43ec8feeee07387e3fcc7e8bb292 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:10 +0200 Subject: bpf, sockmap: fix sock_map_ctx_update_elem race with exist/noexist The current code in sock_map_ctx_update_elem() allows for BPF_EXIST and BPF_NOEXIST map update flags. While on array-like maps this approach is rather uncommon, e.g. bpf_fd_array_map_update_elem() and others enforce map update flags to be BPF_ANY such that xchg() can be used directly, the current implementation in sock map does not guarantee that such operation with BPF_EXIST / BPF_NOEXIST is atomic. The initial test does a READ_ONCE(stab->sock_map[i]) to fetch the socket from the slot which is then tested for NULL / non-NULL. However later after __sock_map_ctx_update_elem(), the actual update is done through osock = xchg(&stab->sock_map[i], sock). Problem is that in the meantime a different CPU could have updated / deleted a socket on that specific slot and thus flag contraints won't hold anymore. I've been thinking whether best would be to just break UAPI and do an enforcement of BPF_ANY to check if someone actually complains, however trouble is that already in BPF kselftest we use BPF_NOEXIST for the map update, and therefore it might have been copied into applications already. The fix to keep the current behavior intact would be to add a map lock similar to the sock hash bucket lock only for covering the whole map. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 106 +++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 921cb6b8c862..98e621a29e8e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -58,6 +58,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; struct bpf_sock_progs progs; + raw_spinlock_t lock; }; struct bucket { @@ -89,9 +90,9 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; + struct bpf_map *map; struct sock **entry; struct htab_elem __rcu *hash_link; - struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); while (e) { if (e->entry) { - osk = cmpxchg(e->entry, sk, NULL); + struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); + + raw_spin_lock_bh(&stab->lock); + osk = *e->entry; if (osk == sk) { + *e->entry = NULL; smap_release_sock(psock, sk); } + raw_spin_unlock_bh(&stab->lock); } else { struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = rcu_dereference(e->htab); + struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); struct hlist_head *head; struct htab_elem *l; struct bucket *b; @@ -1642,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -1716,14 +1723,15 @@ static void sock_map_free(struct bpf_map *map) * and a grace period expire to ensure psock is really safe to remove. */ rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct smap_psock *psock; struct sock *sock; - sock = xchg(&stab->sock_map[i], NULL); + sock = stab->sock_map[i]; if (!sock) continue; - + stab->sock_map[i] = NULL; psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1735,6 +1743,7 @@ static void sock_map_free(struct bpf_map *map) smap_release_sock(psock, sock); } } + raw_spin_unlock_bh(&stab->lock); rcu_read_unlock(); sock_map_remove_complete(stab); @@ -1778,14 +1787,16 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - sock = xchg(&stab->sock_map[k], NULL); + raw_spin_lock_bh(&stab->lock); + sock = stab->sock_map[k]; + stab->sock_map[k] = NULL; + raw_spin_unlock_bh(&stab->lock); if (!sock) return -EINVAL; psock = smap_psock_sk(sock); if (!psock) - goto out; - + return 0; if (psock->bpf_parse) { write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); @@ -1793,7 +1804,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); -out: return 0; } @@ -1829,11 +1839,9 @@ out: static int __sock_map_ctx_update_elem(struct bpf_map *map, struct bpf_sock_progs *progs, struct sock *sock, - struct sock **map_link, void *key) { struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; int err = 0; @@ -1906,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - if (map_link) { - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_free; - } - } - /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ @@ -1935,17 +1935,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); } - /* 4. Place psock in sockmap for use and stop any programs on - * the old sock assuming its not the same sock we are replacing - * it with. Because we can only have a single set of programs if - * old_sock has a strp we can stop it. - */ - if (map_link) { - e->entry = map_link; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - } return err; out_free: smap_release_sock(psock, sock); @@ -1956,7 +1945,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - kfree(e); return err; } @@ -1966,36 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock; + struct sock *osock, *sock = skops->sk; + struct smap_psock_map_entry *e; + struct smap_psock *psock; u32 i = *(u32 *)key; int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; - sock = skops->sk; - err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], - key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto out; - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); + /* psock guaranteed to be present. */ + psock = smap_psock_sk(sock); + raw_spin_lock_bh(&stab->lock); + osock = stab->sock_map[i]; + if (osock && flags == BPF_NOEXIST) { + err = -EEXIST; + goto out_unlock; + } + if (!osock && flags == BPF_EXIST) { + err = -ENOENT; + goto out_unlock; + } + + e->entry = &stab->sock_map[i]; + e->map = map; + spin_lock_bh(&psock->maps_lock); + list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); - smap_list_map_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); + stab->sock_map[i] = sock; + if (osock) { + psock = smap_psock_sk(osock); + smap_list_map_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, osock); } + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + smap_release_sock(psock, sock); + raw_spin_unlock_bh(&stab->lock); out: + kfree(e); return err; } @@ -2358,7 +2367,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, b = __select_bucket(htab, hash); head = &b->head; - err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto err; @@ -2384,8 +2393,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, } rcu_assign_pointer(e->hash_link, l_new); - rcu_assign_pointer(e->htab, - container_of(map, struct bpf_htab, map)); + e->map = map; spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); spin_unlock_bh(&psock->maps_lock); -- cgit From 179a0cc4e0e14216b3c46b5538436532cb9ed39a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Aug 2018 11:20:54 -0400 Subject: tracing: Add SPDX License format to bpf_trace.c Add the SPDX License header to ease license compliance management. Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/bpf_trace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ae6829804bc..08fcfe440c63 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include -- cgit From bcea3f96e11cf2f0232d851e0fdb854f5ada425a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Aug 2018 11:23:53 -0400 Subject: tracing: Add SPDX License format tags to tracing files Add the SPDX License header to ease license compliance management. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + kernel/trace/ring_buffer.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_clock.c | 1 + kernel/trace/trace_event_perf.c | 1 + kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 15 +-------------- kernel/trace/trace_events_hist.c | 11 +---------- kernel/trace/trace_events_trigger.c | 15 +-------------- kernel/trace/trace_hwlat.c | 4 +--- kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_kprobe.c | 13 +------------ kernel/trace/trace_output.c | 1 + kernel/trace/trace_printk.c | 1 + kernel/trace/trace_probe.c | 14 +------------- kernel/trace/trace_probe.h | 14 +------------- kernel/trace/trace_seq.c | 1 + kernel/trace/trace_uprobe.c | 14 +------------- kernel/trace/tracing_map.c | 11 +---------- 20 files changed, 20 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 468e8527e979..f536f601bd46 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Infrastructure for profiling code inserted by 'gcc -pg'. * diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b386830b4d51..1d92d4a982fd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 68ee79afe31c..ffba6789c0e2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer tester and benchmark * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2dad27809794..4ecd8950b513 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index d8a188e0418a..aaf6793ededa 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index c79193e598f5..69a3fe926e8c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace event based perf event profiling/tracing * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 808cf29febe2..f94be0c2827b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * event tracer * diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 184c7685d5ea..84a65173b1e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_filter - generic event filtering * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2009 Tom Zanussi */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c522b51d9909..85f6b01431c7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_hist - trace event hist triggers * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * Copyright (C) 2015 Tom Zanussi */ diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 750044ef15e8..2152d1e530cb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_trigger - trace event triggers * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2013 Tom Zanussi */ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 688e48be7bb5..1e6db9cbe4dc 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_hwlatdetect.c - A simple Hardware Latency detector. * @@ -35,9 +36,6 @@ * * Includes useful feedback from Clark Williams * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. */ #include #include diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 94c1ba139b3b..b7357f9f82a3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace irqs off critical timings * diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2e13f77b9a37..65a4157af851 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Kprobes-based tracing events * * Created by Masami Hiramatsu * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "trace_kprobe: " fmt diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1c8e30fda46a..6e6cc64faa38 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_output.c * diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 50f44b7b2b32..b0875b327f5c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace binary printk * diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daf54bda4dc8..e99c3ce7aa65 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common code for probe-based Dynamic events. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * This code was copied from kernel/trace/trace_kprobe.c written by * Masami Hiramatsu * diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 75daff22ccea..5f52668e165d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common header file for probe-based Dynamic events. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * This code was copied from kernel/trace/trace_kprobe.h written by * Masami Hiramatsu * diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e694c9f9efa4..6b1c562ffdaf 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_seq.c * diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ac02fafc9f1b..e696667da29a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uprobes-based tracing events * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju */ diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 752d8042bad4..9a1c22310323 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * tracing_map - lock-free map for tracing * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * Copyright (C) 2015 Tom Zanussi * * tracing_map implementation inspired by lock-free map algorithms -- cgit From bb730b5833b5bddf5cb226865e5f4496770d00b0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Aug 2018 11:26:55 -0400 Subject: tracing: Fix SPDX format headers to use C++ style comments The Linux kernel adopted the SPDX License format headers to ease license compliance management, and uses the C++ '//' style comments for the SPDX header tags. Some files in the tracing directory used the C style /* */ comments for them. To be consistent across all files, replace the /* */ C style SPDX tags with the C++ // SPDX tags. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- kernel/trace/trace_benchmark.h | 2 +- kernel/trace/trace_entries.h | 2 +- kernel/trace/trace_events_filter_test.h | 2 +- kernel/trace/trace_kprobe_selftest.h | 2 +- kernel/trace/trace_output.h | 2 +- kernel/trace/trace_stat.h | 2 +- kernel/trace/tracing_map.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a62b678731e3..3b8c0e24ab30 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index be1d86ff753d..79e6fbe5b365 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #undef TRACE_SYSTEM #define TRACE_SYSTEM benchmark diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1d67464ed95e..06bb2fd9a56c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * This file defines the trace event structures that go into the ring * buffer directly. They are created via macros so that changes for them diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h index 39d7ef4f57cb..e651dfbd345e 100644 --- a/kernel/trace/trace_events_filter_test.h +++ b/kernel/trace/trace_events_filter_test.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #undef TRACE_SYSTEM #define TRACE_SYSTEM test diff --git a/kernel/trace/trace_kprobe_selftest.h b/kernel/trace/trace_kprobe_selftest.h index 4e10ec41c013..c4fc7268ba7c 100644 --- a/kernel/trace/trace_kprobe_selftest.h +++ b/kernel/trace/trace_kprobe_selftest.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * Function used during the kprobe self test. This function is in a separate * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dbba03ed96de..2f742b74e7e6 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 76d30b4ebe83..8786d17caf49 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACE_STAT_H #define __TRACE_STAT_H diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 053eb92b2d31..a6de61fc22de 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACING_MAP_H #define __TRACING_MAP_H -- cgit From f6069b9aa9934ede26f41ac0781fce241279ad43 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 17 Aug 2018 23:26:14 +0200 Subject: bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use it's map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +- include/trace/events/xdp.h | 5 ++-- kernel/bpf/cpumap.c | 2 ++ kernel/bpf/devmap.c | 1 + kernel/bpf/verifier.c | 21 -------------- kernel/bpf/xskmap.c | 1 + net/core/filter.c | 68 ++++++++++++++++++++-------------------------- 7 files changed, 38 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 5d565c50bcb2..6791a0ac0139 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,7 +543,6 @@ struct bpf_redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - unsigned long map_owner; u32 kern_flags; }; @@ -781,6 +780,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +void bpf_clear_redirect_map(struct bpf_map *map); + static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 1ecf4c67fcf7..e95cb86b65cf 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -147,9 +147,8 @@ struct _bpf_dtab_netdev { #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ - (!map ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 620bc5024d7d..24aac0d0f412 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map) * It does __not__ ensure pending flush operations (if any) are * complete. */ + + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ac1df79f3788..141710b82a6c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca90679a7fe5..92246117d2b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5844,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_redirect_map) { - /* Note, we cannot use prog directly as imm as subsequent - * rewrites would still change the prog pointer. The only - * stable address we can use is aux, which also works with - * prog clones during blinding. - */ - u64 addr = (unsigned long)prog->aux; - struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), - *insn, - }; - cnt = ARRAY_SIZE(r4_ld); - - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4ddf61e158f6..9f8463afda9c 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map) struct xsk_map *m = container_of(map, struct xsk_map, map); int i; + bpf_clear_redirect_map(map); synchronize_net(); for (i = 0; i < map->max_entries; i++) { diff --git a/net/core/filter.c b/net/core/filter.c index fd423ce3da34..c25eb36f1320 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3246,31 +3246,33 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) } } -static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, - unsigned long aux) +void bpf_clear_redirect_map(struct bpf_map *map) { - return (unsigned long)xdp_prog->aux != aux; + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; - - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } + WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { @@ -3296,12 +3298,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); struct net_device *fwd; u32 index = ri->ifindex; int err; - if (ri->map) - return xdp_do_redirect_map(dev, xdp, xdp_prog); + if (map) + return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; @@ -3325,24 +3328,17 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, + struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err = 0; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3379,13 +3375,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; - if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); - + if (map) + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, + map); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { @@ -3416,8 +3413,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; } @@ -3430,8 +3426,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - unsigned long, map_owner) +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, + u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -3440,15 +3436,11 @@ BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; - ri->map = map; - ri->map_owner = map_owner; + WRITE_ONCE(ri->map, map); return XDP_REDIRECT; } -/* Note, arg4 is hidden from users and populated by the verifier - * with the right pointer. - */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, -- cgit From d46eb14b735b11927d4bdc2d1854c311af19de6d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:39 -0700 Subject: fs: fsnotify: account fsnotify metadata to kmemcg Patch series "Directed kmem charging", v8. The Linux kernel's memory cgroup allows limiting the memory usage of the jobs running on the system to provide isolation between the jobs. All the kernel memory allocated in the context of the job and marked with __GFP_ACCOUNT will also be included in the memory usage and be limited by the job's limit. The kernel memory can only be charged to the memcg of the process in whose context kernel memory was allocated. However there are cases where the allocated kernel memory should be charged to the memcg different from the current processes's memcg. This patch series contains two such concrete use-cases i.e. fsnotify and buffer_head. The fsnotify event objects can consume a lot of system memory for large or unlimited queues if there is either no or slow listener. The events are allocated in the context of the event producer. However they should be charged to the event consumer. Similarly the buffer_head objects can be allocated in a memcg different from the memcg of the page for which buffer_head objects are being allocated. To solve this issue, this patch series introduces mechanism to charge kernel memory to a given memcg. In case of fsnotify events, the memcg of the consumer can be used for charging and for buffer_head, the memcg of the page can be charged. For directed charging, the caller can use the scope API memalloc_[un]use_memcg() to specify the memcg to charge for all the __GFP_ACCOUNT allocations within the scope. This patch (of 2): A lot of memory can be consumed by the events generated for the huge or unlimited queues if there is either no or slow listener. This can cause system level memory pressure or OOMs. So, it's better to account the fsnotify kmem caches to the memcg of the listener. However the listener can be in a different memcg than the memcg of the producer and these allocations happen in the context of the event producer. This patch introduces remote memcg charging API which the producer can use to charge the allocations to the memcg of the listener. There are seven fsnotify kmem caches and among them allocations from dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and inotify_inode_mark_cachep happens in the context of syscall from the listener. So, SLAB_ACCOUNT is enough for these caches. The objects from fsnotify_mark_connector_cachep are not accounted as they are small compared to the notification mark or events and it is unclear whom to account connector to since it is shared by all events attached to the inode. The allocations from the event caches happen in the context of the event producer. For such caches we will need to remote charge the allocations to the listener's memcg. Thus we save the memcg reference in the fsnotify_group structure of the listener. This patch has also moved the members of fsnotify_group to keep the size same, at least for 64 bit build, even with additional member by filling the holes. [shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it] Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 5 +++-- fs/notify/fanotify/fanotify.c | 14 ++++++++++---- fs/notify/fanotify/fanotify_user.c | 5 ++++- fs/notify/group.c | 3 +++ fs/notify/inotify/inotify_fsnotify.c | 7 ++++++- fs/notify/inotify/inotify_user.c | 5 ++++- include/linux/fsnotify_backend.h | 12 ++++++++---- include/linux/memcontrol.h | 10 +++++++++- include/linux/sched.h | 3 +++ include/linux/sched/mm.h | 37 ++++++++++++++++++++++++++++++++++++ kernel/fork.c | 3 +++ mm/memcontrol.c | 37 ++++++++++++++++++++++++++++++++---- 12 files changed, 123 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e2bea2ac5dfb..a6365e6bc047 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -384,8 +384,9 @@ out_err: static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, + SLAB_PANIC|SLAB_ACCOUNT); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f90842efea13..eb4e75175cfb 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "fanotify.h" @@ -140,8 +141,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const struct path *path) { - struct fanotify_event_info *event; - gfp_t gfp = GFP_KERNEL; + struct fanotify_event_info *event = NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT; /* * For queues with unlimited length lost events are not expected and @@ -151,19 +152,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, if (group->max_events == UINT_MAX) gfp |= __GFP_NOFAIL; + /* Whoever is interested in the event, pays for the allocation. */ + memalloc_use_memcg(group->memcg); + if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) - return NULL; + goto out; event = &pevent->fae; pevent->response = 0; goto init; } event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) - return NULL; + goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); event->tgid = get_pid(task_tgid(current)); @@ -174,6 +178,8 @@ init: __maybe_unused event->path.mnt = NULL; event->path.dentry = NULL; } +out: + memalloc_unuse_memcg(); return event; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ec4d8c59d0e3..0cf45041dc32 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { @@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = diff --git a/fs/notify/group.c b/fs/notify/group.c index aa5468f23e45..c03b83662876 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "fsnotify.h" @@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) if (group->ops->free_group_priv) group->ops->free_group_priv(group); + mem_cgroup_put(group->memcg); + kfree(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 9ab6dde38a14..f4184b4f3815 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "inotify.h" @@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - event = kmalloc(alloc_len, GFP_KERNEL); + /* Whoever is interested in the event, pays for the allocation. */ + memalloc_use_memcg(group->memcg); + event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT); + memalloc_unuse_memcg(); + if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1cf5b779d862..749c46ababa0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "inotify.h" #include "../fdinfo.h" @@ -636,6 +637,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) oevent->name_len = 0; group->max_events = max_events; + group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); @@ -808,7 +810,8 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, + SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b38964a7a521..a0c4790c5302 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -84,6 +84,8 @@ struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; +struct mem_cgroup; + /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. @@ -127,6 +129,8 @@ struct fsnotify_event { * everything will be cleaned up. */ struct fsnotify_group { + const struct fsnotify_ops *ops; /* how this group handles things */ + /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. @@ -137,8 +141,6 @@ struct fsnotify_group { */ refcount_t refcnt; /* things with interest in this group */ - const struct fsnotify_ops *ops; /* how this group handles things */ - /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ @@ -160,6 +162,8 @@ struct fsnotify_group { atomic_t num_marks; /* 1 for each mark and 1 for not being * past the point of no return when freeing * a group */ + atomic_t user_waits; /* Number of tasks waiting for user + * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ @@ -167,8 +171,8 @@ struct fsnotify_group { struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ - atomic_t user_waits; /* Number of tasks waiting for user - * response */ + + struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 42f4719def32..121e218d2a21 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -373,6 +373,8 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -380,7 +382,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { - css_put(&memcg->css); + if (memcg) + css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ @@ -855,6 +858,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, return true; } +static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +{ + return NULL; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 95a5018c338e..1827f4a7a6de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1152,6 +1152,9 @@ struct task_struct { /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; + + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_BLK_CGROUP diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 44d356f5e47c..aebb370a0006 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -248,6 +248,43 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_MEMCG +/** + * memalloc_use_memcg - Starts the remote memcg charging scope. + * @memcg: memcg to charge. + * + * This function marks the beginning of the remote memcg charging scope. All the + * __GFP_ACCOUNT allocations till the end of the scope will be charged to the + * given memcg. + * + * NOTE: This function is not nesting safe. + */ +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ + WARN_ON_ONCE(current->active_memcg); + current->active_memcg = memcg; +} + +/** + * memalloc_unuse_memcg - Ends the remote memcg charging scope. + * + * This function marks the end of the remote memcg charging scope started by + * memalloc_use_memcg(). + */ +static inline void memalloc_unuse_memcg(void) +{ + current->active_memcg = NULL; +} +#else +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void memalloc_unuse_memcg(void) +{ +} +#endif + #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), diff --git a/kernel/fork.c b/kernel/fork.c index 33112315b5c0..5ee74c113381 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -871,6 +871,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_MEMCG + tsk->active_memcg = NULL; +#endif return tsk; free_stack: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b836e7f00309..bf9cf738c836 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -678,9 +678,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is + * returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *memcg = NULL; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; rcu_read_lock(); do { @@ -700,6 +711,24 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_unlock(); return memcg; } +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +/** + * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. + */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + if (unlikely(current->active_memcg)) { + struct mem_cgroup *memcg = root_mem_cgroup; + + rcu_read_lock(); + if (css_tryget_online(¤t->active_memcg->css)) + memcg = current->active_memcg; + rcu_read_unlock(); + return memcg; + } + return get_mem_cgroup_from_mm(current->mm); +} /** * mem_cgroup_iter - iterate over memory cgroup hierarchy @@ -2261,7 +2290,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2345,7 +2374,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) -- cgit From 0207df4fa1a869281ddbf72db6203dbf036b3e1a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Aug 2018 15:47:04 -0700 Subject: kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN KASAN learns about hotadded memory via the memory hotplug notifier. devm_memremap_pages() intentionally skips calling memory hotplug notifiers. So KASAN doesn't know anything about new memory added by devm_memremap_pages(). This causes a crash when KASAN tries to access non-existent shadow memory: BUG: unable to handle kernel paging request at ffffed0078000000 RIP: 0010:check_memory_region+0x82/0x1e0 Call Trace: memcpy+0x1f/0x50 pmem_do_bvec+0x163/0x720 pmem_make_request+0x305/0xac0 generic_make_request+0x54f/0xcf0 submit_bio+0x9c/0x370 submit_bh_wbc+0x4c7/0x700 block_read_full_page+0x5ef/0x870 do_read_cache_page+0x2b8/0xb30 read_dev_sector+0xbd/0x3f0 read_lba.isra.0+0x277/0x670 efi_partition+0x41a/0x18f0 check_partition+0x30d/0x5e9 rescan_partitions+0x18c/0x840 __blkdev_get+0x859/0x1060 blkdev_get+0x23f/0x810 __device_add_disk+0x9c8/0xde0 pmem_attach_disk+0x9a8/0xf50 nvdimm_bus_probe+0xf3/0x3c0 driver_probe_device+0x493/0xbd0 bus_for_each_drv+0x118/0x1b0 __device_attach+0x1cd/0x2b0 bus_probe_device+0x1ac/0x260 device_add+0x90d/0x1380 nd_async_device_register+0xe/0x50 async_run_entry_fn+0xc3/0x5d0 process_one_work+0xa0a/0x1810 worker_thread+0x87/0xe80 kthread+0x2d7/0x390 ret_from_fork+0x3a/0x50 Add kasan_add_zero_shadow()/kasan_remove_zero_shadow() - post mm_init() interface to map/unmap kasan_zero_page at requested virtual addresses. And use it to add/remove the shadow memory for hotplugged/unplugged device memory. Link: http://lkml.kernel.org/r/20180629164932.740-1-aryabinin@virtuozzo.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Andrey Ryabinin Reported-by: Dave Chinner Reviewed-by: Dan Williams Tested-by: Dan Williams Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 ++- kernel/memremap.c | 10 ++ mm/kasan/kasan_init.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 325 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index de784fd11d12..46aae129917c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -20,7 +20,7 @@ extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D]; -void kasan_populate_zero_shadow(const void *shadow_start, +int kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); static inline void *kasan_mem_to_shadow(const void *addr) @@ -71,6 +71,9 @@ struct kasan_cache { int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +int kasan_add_zero_shadow(void *start, unsigned long size); +void kasan_remove_zero_shadow(void *start, unsigned long size); + size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -124,6 +127,14 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_add_zero_shadow(void *start, unsigned long size) +{ + return 0; +} +static inline void kasan_remove_zero_shadow(void *start, + unsigned long size) +{} + static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 38283363da06..1f87ea6b6545 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); + kasan_remove_zero_shadow(__va(align_start), align_size); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); @@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_pfn_remap; mem_hotplug_begin(); + error = kasan_add_zero_shadow(__va(align_start), align_size); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], @@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: + kasan_remove_zero_shadow(__va(align_start), align_size); + err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index f436246ccc79..7a2a2f13f86f 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -17,10 +17,13 @@ #include #include #include +#include #include #include +#include "kasan.h" + /* * This page serves two purposes: * - It used as early shadow memory. The entire shadow region populated @@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 2 pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return 0; +} #endif pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); +} + +static inline bool kasan_zero_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); +} + static __init void *early_alloc(size_t size, int node) { return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, addr); @@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, } } -static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd = pmd_offset(pud, addr); @@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, } if (pmd_none(*pmd)) { - pmd_populate_kernel(&init_mm, pmd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm, addr); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, pmd, p); } zero_pte_populate(pmd, addr, next); } while (pmd++, addr = next, addr != end); + + return 0; } -static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud = pud_offset(p4d, addr); @@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, } if (pud_none(*pud)) { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pmd_populate(pud, addr, next); } while (pud++, addr = next, addr != end); + + return 0; } -static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, } if (p4d_none(*p4d)) { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pud_populate(p4d, addr, next); } while (p4d++, addr = next, addr != end); + + return 0; } /** @@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, * @shadow_start - start of the memory range to populate * @shadow_end - end of the memory range to populate */ -void __init kasan_populate_zero_shadow(const void *shadow_start, +int __ref kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end) { unsigned long addr = (unsigned long)shadow_start; @@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - pgd_populate(&init_mm, pgd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_zero_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) + pmd_clear(pmd); + continue; + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) + pud_clear(pud); + continue; + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) + p4d_clear(p4d); + continue; + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) + pgd_clear(pgd); + continue; + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return -EINVAL; + + ret = kasan_populate_zero_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(shadow_start, + size >> KASAN_SHADOW_SCALE_SHIFT); + return ret; } -- cgit From 6518202970c1052148daaef9a8096711775e43a2 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 17 Aug 2018 15:48:57 -0700 Subject: mm/cma: remove unsupported gfp_mask parameter from cma_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cma_alloc() doesn't really support gfp flags other than __GFP_NOWARN, so convert gfp_mask parameter to boolean no_warn parameter. This will help to avoid giving false feeling that this function supports standard gfp flags and callers can pass __GFP_ZERO to get zeroed buffer, what has already been an issue: see commit dd65a941f6ba ("arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag"). Link: http://lkml.kernel.org/r/20180709122019eucas1p2340da484acfcc932537e6014f4fd2c29~-sqTPJKij2939229392eucas1p2j@eucas1p2.samsung.com Signed-off-by: Marek Szyprowski Acked-by: Michal Hocko Acked-by: Michał Nazarewicz Acked-by: Laura Abbott Acked-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_builtin.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/staging/android/ion/ion_cma_heap.c | 2 +- include/linux/cma.h | 2 +- kernel/dma/contiguous.c | 3 ++- mm/cma.c | 8 ++++---- mm/cma_debug.c | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index d4a3f4da409b..fc6bb9630a9c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -77,7 +77,7 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES), - GFP_KERNEL); + false); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 948ce82a7725..0fa1b6b1491a 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -68,7 +68,7 @@ static void vmcp_response_alloc(struct vmcp_session *session) * anymore the system won't work anyway. */ if (order > 2) - page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL); + page = cma_alloc(vmcp_cma, nr_pages, 0, false); if (page) { session->response = (char *)page_to_phys(page); session->cma_alloc = 1; diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c index 49718c96bf9e..3fafd013d80a 100644 --- a/drivers/staging/android/ion/ion_cma_heap.c +++ b/drivers/staging/android/ion/ion_cma_heap.c @@ -39,7 +39,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL); + pages = cma_alloc(cma_heap->cma, nr_pages, align, false); if (!pages) return -ENOMEM; diff --git a/include/linux/cma.h b/include/linux/cma.h index bf90f0bb42bd..190184b5ff32 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -33,7 +33,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask); + bool no_warn); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d987dcd1bd56..19ea5d70150c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -191,7 +191,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); + return cma_alloc(dev_get_cma_area(dev), count, align, + gfp_mask & __GFP_NOWARN); } /** diff --git a/mm/cma.c b/mm/cma.c index 5809bbe360d7..4cb76121a3ab 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP mask to use during compaction + * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask) + bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; @@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - gfp_mask); + GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); - if (ret && !(gfp_mask & __GFP_NOWARN)) { + if (ret && !no_warn) { pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f23467291cfb..ad6723e9d110 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0, GFP_KERNEL); + p = cma_alloc(cma, count, 0, false); if (!p) { kfree(mem); return -ENOMEM; -- cgit From d834c5ab83febf9624ad3b16c3c348aa1e02014c Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 17 Aug 2018 15:49:00 -0700 Subject: kernel/dma: remove unsupported gfp_mask parameter from dma_alloc_from_contiguous() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CMA memory allocator doesn't support standard gfp flags for memory allocation, so there is no point having it as a parameter for dma_alloc_from_contiguous() function. Replace it by a boolean no_warn argument, which covers all the underlaying cma_alloc() function supports. This will help to avoid giving false feeling that this function supports standard gfp flags and callers can pass __GFP_ZERO to get zeroed buffer, what has already been an issue: see commit dd65a941f6ba ("arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag"). Link: http://lkml.kernel.org/r/20180709122020eucas1p21a71b092975cb4a3b9954ffc63f699d1~-sqUFoa-h2939329393eucas1p2Y@eucas1p2.samsung.com Signed-off-by: Marek Szyprowski Acked-by: Michał Nazarewicz Acked-by: Vlastimil Babka Reviewed-by: Christoph Hellwig Cc: Laura Abbott Cc: Michal Hocko Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 5 +++-- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/xtensa/kernel/pci-dma.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel-iommu.c | 3 ++- include/linux/dma-contiguous.h | 4 ++-- kernel/dma/contiguous.c | 7 +++---- kernel/dma/direct.c | 3 ++- 8 files changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ba0e786c952e..66566472c153 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -594,7 +594,7 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size, struct page *page; void *ptr = NULL; - page = dma_alloc_from_contiguous(dev, count, order, gfp); + page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN); if (!page) return NULL; @@ -1299,7 +1299,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, unsigned long order = get_order(size); struct page *page; - page = dma_alloc_from_contiguous(dev, count, order, gfp); + page = dma_alloc_from_contiguous(dev, count, order, + gfp & __GFP_NOWARN); if (!page) goto error; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 61e93f0b5482..072c51fb07d7 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -355,7 +355,7 @@ static int __init atomic_pool_init(void) if (dev_get_cma_area(NULL)) page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order, GFP_KERNEL); + pool_size_order, false); else page = alloc_pages(GFP_DMA32, pool_size_order); @@ -573,7 +573,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, struct page *page; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), gfp); + get_order(size), gfp & __GFP_NOWARN); if (!page) return NULL; diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 392b4a80ebc2..a02dc563d290 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -137,7 +137,7 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, if (gfpflags_allow_blocking(flag)) page = dma_alloc_from_contiguous(dev, count, get_order(size), - flag); + flag & __GFP_NOWARN); if (!page) page = alloc_pages(flag, get_order(size)); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 596b95c50051..60b2eab29cd8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2620,7 +2620,7 @@ static void *alloc_coherent(struct device *dev, size_t size, return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size), flag); + get_order(size), flag & __GFP_NOWARN); if (!page) return NULL; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 115ff26e9ced..6a237d18fabf 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3758,7 +3758,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; - page = dma_alloc_from_contiguous(dev, count, order, flags); + page = dma_alloc_from_contiguous(dev, count, order, + flags & __GFP_NOWARN); if (page && iommu_no_mapping(dev) && page_to_phys(page) + size > dev->coherent_dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 3c5a4cb3eb95..f247e8aa5e3d 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, } struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order, gfp_t gfp_mask); + unsigned int order, bool no_warn); bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count); @@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size, static inline struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order, gfp_t gfp_mask) + unsigned int order, bool no_warn) { return NULL; } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 19ea5d70150c..286d82329eb0 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. + * @no_warn: Avoid printing message about failed allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -186,13 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. */ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) + unsigned int align, bool no_warn) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, - gfp_mask & __GFP_NOWARN); + return cma_alloc(dev_get_cma_area(dev), count, align, no_warn); } /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c2860c5a9e96..1c35b7b945d0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + page = dma_alloc_from_contiguous(dev, count, page_order, + gfp & __GFP_NOWARN); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_release_from_contiguous(dev, page, count); page = NULL; -- cgit From 7059b36636beab57c3c43c62104483e5449bee95 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 9 Aug 2018 19:08:34 +0200 Subject: sched: idle: Avoid retaining the tick when it has been stopped If the tick has been stopped already, but the governor has not asked to stop it (which it can do sometimes), the idle loop should invoke tick_nohz_idle_stop_tick(), to let tick_nohz_stop_tick() take care of this case properly. Fixes: 554c8aa8ecad (sched: idle: Select idle state before stopping the tick) Cc: 4.17+ # 4.17+ Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1a3e9bddd17b..16f84142f2f4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -190,7 +190,7 @@ static void cpuidle_idle_call(void) */ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick) + if (stop_tick || tick_nohz_tick_stopped()) tick_nohz_idle_stop_tick(); else tick_nohz_idle_retain_tick(); -- cgit From b639186ffe9168fd1d2f95a1fff8571720739126 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 16 Aug 2018 12:21:24 -0500 Subject: futex: Mark expected switch fall-throughs In preparation of enabling -Wimplicit-fallthrough, mark switch cases which fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Link: https://lkml.kernel.org/r/20180816172124.GA2407@embeddedor.com --- kernel/futex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1f450e092c74..11fc3bb456d6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3523,10 +3523,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; + /* fall through */ case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; + /* fall through */ case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: -- cgit From 6b7dca401cb148603158119c89319c85228a2a61 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 21 Aug 2018 16:27:58 +0900 Subject: tracing: Allow gcov profiling on only ftrace subsystem Add GCOV_PROFILE_FTRACE to allow gcov profiling on only files in ftrace subsystem. This config option will be used for checking kselftest/ftrace coverage. Link: http://lkml.kernel.org/r/153483647755.32472.4746349899604275441.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 12 ++++++++++++ kernel/trace/Makefile | 5 +++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fd6754b88f87..aec44d838283 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,6 +774,18 @@ config TRACING_EVENTS_GPIO help Enable tracing events for gpio subsystem +config GCOV_PROFILE_FTRACE + bool "Enable GCOV profiling on ftrace subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on ftrace subsystem for checking + which functions/lines are tested. + + If unsure, say N. + + Note that on a kernel compiled with this config, ftrace will + run significantly slower. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98d53b39a8ee..f81dadbc7c4a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -23,6 +23,11 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# for GCOV coverage profiling +ifdef CONFIG_GCOV_PROFILE_FTRACE +GCOV_PROFILE := y +endif + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) -- cgit From 9161a864ff88e800de50494da095af19832e9583 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 21 Aug 2018 22:04:57 +0900 Subject: tracing/kprobes: Fix to check notrace function with correct range Fix within_notrace_func() to check notrace function correctly. Since the ftrace_location_range(start, end) function checks the range inclusively (start <= ftrace-loc <= end), the end address must not include the entry address of next function. However, within_notrace_func() uses kallsyms_lookup_size_offset() to get the function size and calculate the end address from adding the size to the entry address. This means the end address is the entry address of the next function. In the result, within_notrace_func() fails to find notrace function if the next function of the target function is ftraced. Let's subtract 1 from the end address so that ftrace_location_range() can check it correctly. Link: http://lkml.kernel.org/r/153485669706.16611.17726752296213785504.stgit@devbox Fixes: commit 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Reported-by: Michael Rodin Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 65a4157af851..ad384b31fe01 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -513,7 +513,14 @@ static bool within_notrace_func(struct trace_kprobe *tk) if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; - return !ftrace_location_range(addr - offset, addr - offset + size); + /* Get the entry address of the target function */ + addr -= offset; + + /* + * Since ftrace_location_range() does inclusive range check, we need + * to subtract 1 byte from the end address. + */ + return !ftrace_location_range(addr, addr + size - 1); } #else #define within_notrace_func(tk) (false) -- cgit From d6e89786bed977f37f55ffca11e563f6d2b1e3b5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Aug 2018 11:49:03 +0200 Subject: workqueue: skip lockdep wq dependency in cancel_work_sync() In cancel_work_sync(), we can only have one of two cases, even with an ordered workqueue: * the work isn't running, just cancelled before it started * the work is running, but then nothing else can be on the workqueue before it Thus, we need to skip the lockdep workqueue dependency handling, otherwise we get false positive reports from lockdep saying that we have a potential deadlock when the workqueue also has other work items with locking, e.g. work1_function() { mutex_lock(&mutex); ... } work2_function() { /* nothing */ } other_function() { queue_work(ordered_wq, &work1); queue_work(ordered_wq, &work2); mutex_lock(&mutex); cancel_work_sync(&work2); } As described above, this isn't a problem, but lockdep will currently flag it as if cancel_work_sync() was flush_work(), which *is* a problem. Signed-off-by: Johannes Berg Signed-off-by: Tejun Heo --- kernel/workqueue.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ea75529eabb..aa520e715bbc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2843,7 +2843,8 @@ reflush: } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, + bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; @@ -2885,7 +2886,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) { + if (!from_cancel && + (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } @@ -2896,6 +2898,22 @@ already_gone: return false; } +static bool __flush_work(struct work_struct *work, bool from_cancel) +{ + struct wq_barrier barr; + + if (WARN_ON(!wq_online)) + return false; + + if (start_flush_work(work, &barr, from_cancel)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } +} + /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2909,18 +2927,7 @@ already_gone: */ bool flush_work(struct work_struct *work) { - struct wq_barrier barr; - - if (WARN_ON(!wq_online)) - return false; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } + return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -2986,7 +2993,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) * isn't executing. */ if (wq_online) - flush_work(work); + __flush_work(work, true); clear_work_data(work); -- cgit From 87915adc3f0acdf03c776df42e308e5a155c19af Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Aug 2018 11:49:04 +0200 Subject: workqueue: re-add lockdep dependencies for flushing In flush_work(), we need to create a lockdep dependency so that the following scenario is appropriately tagged as a problem: work_function() { mutex_lock(&mutex); ... } other_function() { mutex_lock(&mutex); flush_work(&work); // or cancel_work_sync(&work); } This is a problem since the work might be running and be blocked on trying to acquire the mutex. Similarly, in flush_workqueue(). These were removed after cross-release partially caught these problems, but now cross-release was reverted anyway. IMHO the removal was erroneous anyway though, since lockdep should be able to catch potential problems, not just actual ones, and cross-release would only have caught the problem when actually invoking wait_for_completion(). Fixes: fd1a5b04dfb8 ("workqueue: Remove now redundant lock acquisitions wrt. workqueue flushes") Signed-off-by: Johannes Berg Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aa520e715bbc..661184fcd503 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2652,6 +2652,9 @@ void flush_workqueue(struct workqueue_struct *wq) if (WARN_ON(!wq_online)) return; + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); + mutex_lock(&wq->mutex); /* @@ -2905,6 +2908,11 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!wq_online)) return false; + if (!from_cancel) { + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + } + if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); -- cgit From a670468f5e0b5fad4db6e4d195f15915dc2a35c1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Aug 2018 21:53:06 -0700 Subject: mm: zero out the vma in vma_init() Rather than in vm_area_alloc(). To ensure that the various oddball stack-based vmas are in a good state. Some of the callers were zeroing them out, others were not. Acked-by: Kirill A. Shutemov Cc: Russell King Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 9 ++++----- fs/hugetlbfs/inode.c | 2 -- include/linux/mm.h | 1 + kernel/fork.c | 3 ++- mm/mempolicy.c | 1 - mm/shmem.c | 1 - 6 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d9c299133111..82ab015bf42b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * atomic helpers. Insert it into the gate_vma so that it is visible * through ptrace and /proc//mem. */ -static struct vm_area_struct gate_vma = { - .vm_start = 0xffff0000, - .vm_end = 0xffff0000 + PAGE_SIZE, - .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, -}; +static struct vm_area_struct gate_vma; static int __init gate_vma_init(void) { vma_init(&gate_vma, NULL); gate_vma.vm_page_prot = PAGE_READONLY_EXEC; + gate_vma.vm_start = 0xffff0000; + gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; + gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; return 0; } arch_initcall(gate_vma_init); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 346a146c7617..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, current->mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); @@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * allocation routines. If NUMA is configured, use page index * as input to create an allocation policy. */ - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; diff --git a/include/linux/mm.h b/include/linux/mm.h index a3cae495f9ce..3a4b87d1a59a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; + memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); diff --git a/kernel/fork.c b/kernel/fork.c index 5ee74c113381..8c760effa42e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) vma_init(vma, mm); return vma; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01f1a14facc4..4861ba738d6f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/shmem.c b/mm/shmem.c index c48c79018a7c..fb04baacc9fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - memset(vma, 0, sizeof(*vma)); vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; -- cgit From eff4345e7fba0aac8665f00e8599b36946d9aaec Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:17 -0700 Subject: crash_core: use VMCOREINFO_SYMBOL_ARRAY() for swapper_pg_dir This is preparation for allowing CRASH_CORE to be enabled for any architecture. swapper_pg_dir is always either an array or a macro expanding to NULL. In the latter case, VMCOREINFO_SYMBOL() won't work, as it tries to take the address of the given symbol: #define VMCOREINFO_SYMBOL(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) Instead, use VMCOREINFO_SYMBOL_ARRAY(), which uses the value: #define VMCOREINFO_SYMBOL_ARRAY(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) This is the same thing for the array case but isn't an error for the macro case. Link: http://lkml.kernel.org/r/c05f9781ec204f40fc96f95086e7b6de6a3eb2c3.1532563124.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b66aced5e8c2..e7b4025c7b24 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); -- cgit From 23c85094fe1895caefdd19ef624ee687ec5f4507 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:20 -0700 Subject: proc/kcore: add vmcoreinfo note to /proc/kcore The vmcoreinfo information is useful for runtime debugging tools, not just for crash dumps. A lot of this information can be determined by other means, but this is much more convenient, and it only adds a page at most to the file. Link: http://lkml.kernel.org/r/fddbcd08eed76344863303878b12de1c1e2a04b6.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 1 + fs/proc/kcore.c | 18 ++++++++++++++++-- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 4 ++-- 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 0eaeb41453f5..817c02b13b1d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -31,6 +31,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU + select CRASH_CORE help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 758c14e46a44..80464432dfe6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,6 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar */ +#include #include #include #include @@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, } *phdrs_len = *nphdr * sizeof(struct elf_phdr); - *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) + + *notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + - ALIGN(arch_task_struct_size, 4)); + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + *notes_len); return *data_offset + size; @@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) sizeof(prpsinfo)); append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, arch_task_struct_size); + /* + * vmcoreinfo_size is mostly constant after init time, but it + * can be changed by crash_save_vmcoreinfo(). Racing here with a + * panic on another CPU before the machine goes down is insanely + * unlikely, but it's better to not leave potential buffer + * overflows lying around, regardless. + */ + append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, + vmcoreinfo_data, + min(vmcoreinfo_size, notes_len - i)); tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index b511f6d24b42..525510a9f965 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index e7b4025c7b24..a683bfb0530f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,8 +14,8 @@ #include /* vmcoreinfo stuff */ -static unsigned char *vmcoreinfo_data; -static size_t vmcoreinfo_size; +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -- cgit From fc37191272a972d449bab3d8247cf52cadf5d4e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Aug 2018 21:55:38 -0700 Subject: userns: use refcount_t for reference counting instead atomic_t refcount_t type and corresponding API should be used instead of atomic_t wh en the variable is used as a reference counter. This avoids accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/20180703200141.28415-6-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Reviewed-by: Andrew Morton Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/user.h | 5 +++-- kernel/user.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 96fe289c4c6e..39ad98c09c58 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include struct key; @@ -12,7 +13,7 @@ struct key; * Some day this will be a full-fledged user tracking system.. */ struct user_struct { - atomic_t __count; /* reference count */ + refcount_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY @@ -59,7 +60,7 @@ extern struct user_struct root_user; extern struct user_struct * alloc_uid(kuid_t); static inline struct user_struct *get_uid(struct user_struct *u) { - atomic_inc(&u->__count); + refcount_inc(&u->__count); return u; } extern void free_uid(struct user_struct *); diff --git a/kernel/user.c b/kernel/user.c index 36288d840675..5f65ef195259 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = REFCOUNT_INIT(1), .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, @@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { - atomic_inc(&user->__count); + refcount_inc(&user->__count); return user; } } @@ -170,7 +170,7 @@ void free_uid(struct user_struct *up) return; local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + if (refcount_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else local_irq_restore(flags); @@ -191,7 +191,7 @@ struct user_struct *alloc_uid(kuid_t uid) goto out_unlock; new->uid = uid; - atomic_set(&new->__count, 1); + refcount_set(&new->__count, 1); ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); -- cgit From ce0a568d327968367f294ea2425fe2c6f4f33ed2 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 21 Aug 2018 21:55:42 -0700 Subject: userns: use irqsave variant of refcount_dec_and_lock() The irqsave variant of refcount_dec_and_lock handles irqsave/restore when taking/releasing the spin lock. With this variant the call of local_irq_save/restore is no longer required. [bigeasy@linutronix.de: s@atomic_dec_and_lock@refcount_dec_and_lock@g] Link: http://lkml.kernel.org/r/20180703200141.28415-7-bigeasy@linutronix.de Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 5f65ef195259..0df9b1640b2a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,11 +169,8 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); - if (refcount_dec_and_lock(&up->__count, &uidhash_lock)) + if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); - else - local_irq_restore(flags); } struct user_struct *alloc_uid(kuid_t uid) -- cgit From 91bc9aaf746ae41016bd6b61a48133e162542574 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:55:49 -0700 Subject: kernel/crash_core.c: print timestamp using time64_t The get_seconds() call returns a 32-bit timestamp on some architectures, and will overflow in the future. The newer ktime_get_real_seconds() always returns a 64-bit timestamp that does not suffer from this problem. Link: http://lkml.kernel.org/r/20180618150329.941903-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Morton Cc: Dave Young Cc: Baoquan He Cc: "Kirill A. Shutemov" Cc: Petr Tesarik Cc: Marc-Andr Lureau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a683bfb0530f..933cb3e45b98 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void) if (vmcoreinfo_data_safecopy) vmcoreinfo_data = vmcoreinfo_data_safecopy; - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); update_vmcoreinfo_note(); } -- cgit From a2e514453861dd39b53b7a50b6771bd3f9852078 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 21 Aug 2018 21:55:52 -0700 Subject: kernel/hung_task.c: allow to set checking interval separately from timeout Currently task hung checking interval is equal to timeout, as the result hung is detected anywhere between timeout and 2*timeout. This is fine for most interactive environments, but this hurts automated testing setups (syzbot). In an automated setup we need to strictly order CPU lockup < RCU stall < workqueue lockup < task hung < silent loss, so that RCU stall is not detected as task hung and task hung is not detected as silent machine loss. The large variance in task hung detection timeout requires setting silent machine loss timeout to a very large value (e.g. if task hung is 3 mins, then silent loss need to be set to ~7 mins). The additional 3 minutes significantly reduce testing efficiency because usually we crash kernel within a minute, and this can add hours to bug localization process as it needs to do dozens of tests. Allow setting checking interval separately from timeout. This allows to set timeout to, say, 3 minutes, but checking interval to 10 secs. The interval is controlled via a new hung_task_check_interval_secs sysctl, similar to the existing hung_task_timeout_secs sysctl. The default value of 0 results in the current behavior: checking interval is equal to timeout. [akpm@linux-foundation.org: update hung_task_timeout_max's comment] Link: http://lkml.kernel.org/r/20180611111004.203513-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Paul E. McKenney Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 15 ++++++++++++++- include/linux/sched.h | 1 + include/linux/sched/sysctl.h | 1 + kernel/fork.c | 1 + kernel/hung_task.c | 15 ++++++++++++++- kernel/sysctl.c | 13 ++++++++++++- 6 files changed, 43 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 59585030cbaf..9d38e75b19a0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -38,6 +38,7 @@ show up in /proc/sys/kernel: - hung_task_panic - hung_task_check_count - hung_task_timeout_secs +- hung_task_check_interval_secs - hung_task_warnings - hyperv_record_panic_msg - kexec_load_disabled @@ -355,7 +356,7 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. hung_task_timeout_secs: -Check interval. When a task in D state did not get scheduled +When a task in D state did not get scheduled for more than this value report a warning. This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. @@ -364,6 +365,18 @@ Possible values to set are in range {0..LONG_MAX/HZ}. ============================================================== +hung_task_check_interval_secs: + +Hung task check interval. If hung task checking is enabled +(see hung_task_timeout_secs), the check is done every +hung_task_check_interval_secs seconds. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +0 (default): means use hung_task_timeout_secs as checking interval. +Possible values to set are in range {0..LONG_MAX/HZ}. + +============================================================== + hung_task_warnings: The maximum number of warnings to report. During a check interval diff --git a/include/linux/sched.h b/include/linux/sched.h index 789923fbee3a..58eb3a2bc695 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -853,6 +853,7 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; + unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 913488d828cb..a9c32daeb9d8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -10,6 +10,7 @@ struct ctl_table; extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, diff --git a/kernel/fork.c b/kernel/fork.c index 8c760effa42e..8bcef2413f1b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1302,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + tsk->last_switch_time = 0; #endif tsk->mm = NULL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..b9132d1269ef 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +/* + * Zero (default value) means use sysctl_hung_task_timeout_secs: + */ +unsigned long __read_mostly sysctl_hung_task_check_interval_secs; + int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; + t->last_switch_time = jiffies; return; } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return; trace_sched_process_hang(t); @@ -245,8 +253,13 @@ static int watchdog(void *dummy) for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; - long t = hung_timeout_jiffies(hung_last_checked, timeout); + unsigned long interval = sysctl_hung_task_check_interval_secs; + long t; + if (interval == 0) + interval = timeout; + interval = min_t(unsigned long, interval, timeout); + t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0)) check_hung_uninterruptible_tasks(timeout); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f22f76b7a138..dbdcb6673b27 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -145,7 +145,10 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ #ifdef CONFIG_DETECT_HUNG_TASK static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #endif @@ -1090,6 +1093,14 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dohung_task_timeout_secs, .extra2 = &hung_task_timeout_max, }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, -- cgit From 7290d58095712a89f845e1bca05334796dd49ed2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:09 -0700 Subject: module: use relative references for __ksymtab entries An ordinary arm64 defconfig build has ~64 KB worth of __ksymtab entries, each consisting of two 64-bit fields containing absolute references, to the symbol itself and to a char array containing its name, respectively. When we build the same configuration with KASLR enabled, we end up with an additional ~192 KB of relocations in the .init section, i.e., one 24 byte entry for each absolute reference, which all need to be processed at boot time. Given how the struct kernel_symbol that describes each entry is completely local to module.c (except for the references emitted by EXPORT_SYMBOL() itself), we can easily modify it to contain two 32-bit relative references instead. This reduces the size of the __ksymtab section by 50% for all 64-bit architectures, and gets rid of the runtime relocations entirely for architectures implementing KASLR, either via standard PIE linking (arm64) or using custom host tools (x86). Note that the binary search involving __ksymtab contents relies on each section being sorted by symbol name. This is implemented based on the input section names, not the names in the ksymtab entries, so this patch does not interfere with that. Given that the use of place-relative relocations requires support both in the toolchain and in the module loader, we cannot enable this feature for all architectures. So make it dependent on whether CONFIG_HAVE_ARCH_PREL32_RELOCATIONS is defined. Link: http://lkml.kernel.org/r/20180704083651.24360-4-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Jessica Yu Acked-by: Michael Ellerman Reviewed-by: Will Deacon Acked-by: Ingo Molnar Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/export.h | 5 ----- include/asm-generic/export.h | 12 +++++++++-- include/linux/compiler.h | 19 ++++++++++++++++++ include/linux/export.h | 46 ++++++++++++++++++++++++++++++++----------- kernel/module.c | 32 ++++++++++++++++++++++++------ 6 files changed, 91 insertions(+), 24 deletions(-) delete mode 100644 arch/x86/include/asm/export.h (limited to 'kernel') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index de690c2d2e33..a0ab9ab61c75 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h generic-y += dma-contiguous.h generic-y += early_ioremap.h +generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h deleted file mode 100644 index 2a51d66689c5..000000000000 --- a/arch/x86/include/asm/export.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_64BIT -#define KSYM_ALIGN 16 -#endif -#include diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 68efb950a918..4d73e6e3c66c 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -5,12 +5,10 @@ #define KSYM_FUNC(x) x #endif #ifdef CONFIG_64BIT -#define __put .quad #ifndef KSYM_ALIGN #define KSYM_ALIGN 8 #endif #else -#define __put .long #ifndef KSYM_ALIGN #define KSYM_ALIGN 4 #endif @@ -19,6 +17,16 @@ #define KCRC_ALIGN 4 #endif +.macro __put, val, name +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + .long \val - ., \name - . +#elif defined(CONFIG_64BIT) + .quad \val, \name +#else + .long \val, \name +#endif +.endm + /* * note on .section use: @progbits vs %progbits nastiness doesn't matter, * since we immediately emit into those sections anyway. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c8eab637a2a7..681d866efb1e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -280,6 +280,25 @@ unsigned long read_word_at_a_time(const void *addr) #endif /* __KERNEL__ */ +/* + * Force the compiler to emit 'sym' as a symbol, so that we can reference + * it from inline assembler. Necessary in case 'sym' could be inlined + * otherwise, or eliminated entirely due to lack of references that are + * visible to the compiler. + */ +#define __ADDRESSABLE(sym) \ + static void * __attribute__((section(".discard.addressable"), used)) \ + __PASTE(__addressable_##sym, __LINE__) = (void *)&sym; + +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + #endif /* __ASSEMBLY__ */ #ifndef __optimize diff --git a/include/linux/export.h b/include/linux/export.h index ea7df303d68d..ae072bc5aacf 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -18,12 +18,6 @@ #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) #ifndef __ASSEMBLY__ -struct kernel_symbol -{ - unsigned long value; - const char *name; -}; - #ifdef MODULE extern struct module __this_module; #define THIS_MODULE (&__this_module) @@ -54,17 +48,47 @@ extern struct module __this_module; #define __CRC_SYMBOL(sym, sec) #endif +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#include +/* + * Emit the ksymtab entry as a pair of relative references: this reduces + * the size by half on 64-bit architectures, and eliminates the need for + * absolute relocations that require runtime processing on relocatable + * kernels. + */ +#define __KSYMTAB_ENTRY(sym, sec) \ + __ADDRESSABLE(sym) \ + asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ + " .balign 8 \n" \ + "__ksymtab_" #sym ": \n" \ + " .long " #sym "- . \n" \ + " .long __kstrtab_" #sym "- . \n" \ + " .previous \n") + +struct kernel_symbol { + int value_offset; + int name_offset; +}; +#else +#define __KSYMTAB_ENTRY(sym, sec) \ + static const struct kernel_symbol __ksymtab_##sym \ + __attribute__((section("___ksymtab" sec "+" #sym), used)) \ + = { (unsigned long)&sym, __kstrtab_##sym } + +struct kernel_symbol { + unsigned long value; + const char *name; +}; +#endif + /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL(sym, sec) \ extern typeof(sym) sym; \ __CRC_SYMBOL(sym, sec) \ static const char __kstrtab_##sym[] \ - __attribute__((section("__ksymtab_strings"), aligned(1))) \ + __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #sym; \ - static const struct kernel_symbol __ksymtab_##sym \ - __used \ - __attribute__((section("___ksymtab" sec "+" #sym), used)) \ - = { (unsigned long)&sym, __kstrtab_##sym } + __KSYMTAB_ENTRY(sym, sec) #if defined(__DISABLE_EXPORTS) diff --git a/kernel/module.c b/kernel/module.c index b046a32520d8..6746c85511fe 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -529,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms, return true; } +static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +static const char *kernel_symbol_name(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->name_offset); +#else + return sym->name; +#endif +} + static int cmp_name(const void *va, const void *vb) { const char *a; const struct kernel_symbol *b; a = va; b = vb; - return strcmp(a, b->name); + return strcmp(a, kernel_symbol_name(b)); } static bool find_symbol_in_section(const struct symsearch *syms, @@ -2170,7 +2188,7 @@ void *__symbol_get(const char *symbol) sym = NULL; preempt_enable(); - return sym ? (void *)sym->value : NULL; + return sym ? (void *)kernel_symbol_value(sym) : NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2200,10 +2218,12 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + if (find_symbol(kernel_symbol_name(s), &owner, NULL, + true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", - mod->name, s->name, module_name(owner)); + mod->name, kernel_symbol_name(s), + module_name(owner)); return -ENOEXEC; } } @@ -2252,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = ksym->value; + sym[i].st_value = kernel_symbol_value(ksym); break; } @@ -2516,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value, ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - return ks != NULL && ks->value == value; + return ks != NULL && kernel_symbol_value(ks) == value; } /* As per nm */ -- cgit From 1b1eeca7e4c19fa76d409d4c7b338dba21f2df45 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:13 -0700 Subject: init: allow initcall tables to be emitted using relative references Allow the initcall tables to be emitted using relative references that are only half the size on 64-bit architectures and don't require fixups at runtime on relocatable kernels. Link: http://lkml.kernel.org/r/20180704083651.24360-5-ard.biesheuvel@linaro.org Acked-by: James Morris Acked-by: Sergey Senozhatsky Acked-by: Petr Mladek Acked-by: Michael Ellerman Acked-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Russell King Cc: "Serge E. Hallyn" Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 44 +++++++++++++++++++++++++++++++++----------- init/main.c | 32 ++++++++++++++++---------------- kernel/printk/printk.c | 16 +++++++++------- security/security.c | 17 ++++++++++------- 4 files changed, 68 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/init.h b/include/linux/init.h index bc27cf03c41e..2538d176dd1f 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -116,8 +116,24 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -extern initcall_t __con_initcall_start[], __con_initcall_end[]; -extern initcall_t __security_initcall_start[], __security_initcall_end[]; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +typedef int initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return offset_to_ptr(entry); +} +#else +typedef initcall_t initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return *entry; +} +#endif + +extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; +extern initcall_entry_t __security_initcall_start[], __security_initcall_end[]; /* Used for contructor calls. */ typedef void (*ctor_fn_t)(void); @@ -167,9 +183,20 @@ extern bool initcall_debug; * as KEEP() in the linker script. */ -#define __define_initcall(fn, id) \ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define ___define_initcall(fn, id, __sec) \ + __ADDRESSABLE(fn) \ + asm(".section \"" #__sec ".init\", \"a\" \n" \ + "__initcall_" #fn #id ": \n" \ + ".long " #fn " - . \n" \ + ".previous \n"); +#else +#define ___define_initcall(fn, id, __sec) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn; + __attribute__((__section__(#__sec ".init"))) = fn; +#endif + +#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id) /* * Early initcalls run before initializing SMP. @@ -208,13 +235,8 @@ extern bool initcall_debug; #define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -#define console_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.con_initcall.init) = fn - -#define security_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.security_initcall.init) = fn +#define console_initcall(fn) ___define_initcall(fn,, .con_initcall) +#define security_initcall(fn) ___define_initcall(fn,, .security_initcall) struct obs_kernel_param { const char *str; diff --git a/init/main.c b/init/main.c index b729e1f22838..3a6ce89e128f 100644 --- a/init/main.c +++ b/init/main.c @@ -902,18 +902,18 @@ int __init_or_module do_one_initcall(initcall_t fn) } -extern initcall_t __initcall_start[]; -extern initcall_t __initcall0_start[]; -extern initcall_t __initcall1_start[]; -extern initcall_t __initcall2_start[]; -extern initcall_t __initcall3_start[]; -extern initcall_t __initcall4_start[]; -extern initcall_t __initcall5_start[]; -extern initcall_t __initcall6_start[]; -extern initcall_t __initcall7_start[]; -extern initcall_t __initcall_end[]; - -static initcall_t *initcall_levels[] __initdata = { +extern initcall_entry_t __initcall_start[]; +extern initcall_entry_t __initcall0_start[]; +extern initcall_entry_t __initcall1_start[]; +extern initcall_entry_t __initcall2_start[]; +extern initcall_entry_t __initcall3_start[]; +extern initcall_entry_t __initcall4_start[]; +extern initcall_entry_t __initcall5_start[]; +extern initcall_entry_t __initcall6_start[]; +extern initcall_entry_t __initcall7_start[]; +extern initcall_entry_t __initcall_end[]; + +static initcall_entry_t *initcall_levels[] __initdata = { __initcall0_start, __initcall1_start, __initcall2_start, @@ -939,7 +939,7 @@ static char *initcall_level_names[] __initdata = { static void __init do_initcall_level(int level) { - initcall_t *fn; + initcall_entry_t *fn; strcpy(initcall_command_line, saved_command_line); parse_args(initcall_level_names[level], @@ -950,7 +950,7 @@ static void __init do_initcall_level(int level) trace_initcall_level(initcall_level_names[level]); for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) - do_one_initcall(*fn); + do_one_initcall(initcall_from_entry(fn)); } static void __init do_initcalls(void) @@ -981,11 +981,11 @@ static void __init do_basic_setup(void) static void __init do_pre_smp_initcalls(void) { - initcall_t *fn; + initcall_entry_t *fn; trace_initcall_level("early"); for (fn = __initcall_start; fn < __initcall0_start; fn++) - do_one_initcall(*fn); + do_one_initcall(initcall_from_entry(fn)); } /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 90b6ab01db59..918f386b2f6e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2788,7 +2788,8 @@ EXPORT_SYMBOL(unregister_console); void __init console_init(void) { int ret; - initcall_t *call; + initcall_t call; + initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); @@ -2797,13 +2798,14 @@ void __init console_init(void) * set up the console device so that later boot sequences can * inform about problems etc.. */ - call = __con_initcall_start; + ce = __con_initcall_start; trace_initcall_level("console"); - while (call < __con_initcall_end) { - trace_initcall_start((*call)); - ret = (*call)(); - trace_initcall_finish((*call), ret); - call++; + while (ce < __con_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } diff --git a/security/security.c b/security/security.c index 47cfff01d7ec..736e78da1ab9 100644 --- a/security/security.c +++ b/security/security.c @@ -48,14 +48,17 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = static void __init do_security_initcalls(void) { int ret; - initcall_t *call; - call = __security_initcall_start; + initcall_t call; + initcall_entry_t *ce; + + ce = __security_initcall_start; trace_initcall_level("security"); - while (call < __security_initcall_end) { - trace_initcall_start((*call)); - ret = (*call) (); - trace_initcall_finish((*call), ret); - call++; + while (ce < __security_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } -- cgit From 46e0c9be206fa7b11aca75da2d6b8535d0139752 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:22 -0700 Subject: kernel: tracepoints: add support for relative references To avoid the need for relocating absolute references to tracepoint structures at boot time when running relocatable kernels (which may take a disproportionate amount of space), add the option to emit these tables as relative references instead. Link: http://lkml.kernel.org/r/20180704083651.24360-7-ard.biesheuvel@linaro.org Acked-by: Michael Ellerman Acked-by: Ingo Molnar Acked-by: Steven Rostedt (VMware) Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracepoint.h | 19 ++++++++++++++---- kernel/tracepoint.c | 49 ++++++++++++++++++++++++---------------------- 2 files changed, 41 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d9a084c72541..7f2e16e76ac4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -249,6 +249,19 @@ extern void syscall_unregfunc(void); return static_key_false(&__tracepoint_##name.key); \ } +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __TRACEPOINT_ENTRY(name) \ + asm(" .section \"__tracepoints_ptrs\", \"a\" \n" \ + " .balign 4 \n" \ + " .long __tracepoint_" #name " - . \n" \ + " .previous \n") +#else +#define __TRACEPOINT_ENTRY(name) \ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ + &__tracepoint_##name +#endif + /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration @@ -258,11 +271,9 @@ extern void syscall_unregfunc(void); static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ + __attribute__((section("__tracepoints"), used)) = \ { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ - &__tracepoint_##name; + __TRACEPOINT_ENTRY(name); #define DEFINE_TRACE(name) \ DEFINE_TRACE_FN(name, NULL, NULL); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 96db841bf0fc..bf2c06ef9afc 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -371,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + if (!begin) + return; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { + const int *iter; + + for (iter = (const int *)begin; iter < (const int *)end; iter++) + fct(offset_to_ptr(iter), priv); + } else { + struct tracepoint * const *iter; + + for (iter = begin; iter < end; iter++) + fct(*iter, priv); + } +} + #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { @@ -435,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ -static void tp_module_going_check_quiescent(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - WARN_ON_ONCE((*iter)->funcs); + WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) @@ -494,8 +509,9 @@ static void tracepoint_module_going(struct module *mod) * Called the going notifier before checking for * quiescence. */ - tp_module_going_check_quiescent(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + for_each_tracepoint_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints, + tp_module_going_check_quiescent, NULL); break; } } @@ -547,19 +563,6 @@ static __init int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, - void (*fct)(struct tracepoint *tp, void *priv), - void *priv) -{ - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - fct(*iter, priv); -} - /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback -- cgit From e05a8e4d88d16e088d83ce679ac3343ac66c936b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Aug 2018 21:56:34 -0700 Subject: sched/wait: assert the wait_queue_head lock is held in __wake_up_common Better ensure we actually hold the lock using lockdep than just commenting on it. Due to the various exported _locked interfaces it is far too easy to get the locking wrong. Link: http://lkml.kernel.org/r/20171214152344.6880-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Ingo Molnar Cc: Al Viro Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Jason Baron Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 870f97b313e3..5dd47f1103d1 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, wait_queue_entry_t *curr, *next; int cnt = 0; + lockdep_assert_held(&wq_head->lock); + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { curr = list_next_entry(bookmark, entry); -- cgit From 52cba1a274818c3ee6299c1a1b476e7bc5037a2f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 21:59:51 -0700 Subject: signal: make force_sigsegv() void Patch series "signal: refactor some functions", v3. This series refactors a bunch of functions in signal.c to simplify parts of the code. The greatest single change is declaring the static do_sigpending() helper as void which makes it possible to remove a bunch of unnecessary checks in the syscalls later on. This patch (of 17): force_sigsegv() returned 0 unconditionally so it doesn't make sense to have it return at all. In addition, there are no callers that check force_sigsegv()'s return value. Link: http://lkml.kernel.org/r/20180602103653.18181-2-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Al Viro Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/signal.h | 2 +- kernel/signal.c | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 113d1ad1ced7..e138ac16c650 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -314,7 +314,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_ptrace_errno_trap(int errno, void __user *addr); extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); +extern void force_sigsegv(int sig, struct task_struct *p); extern int force_sig_info(int, struct siginfo *, struct task_struct *); extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); diff --git a/kernel/signal.c b/kernel/signal.c index 8d8a940422a8..8a828baa0f93 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1458,8 +1458,7 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -void -force_sig(int sig, struct task_struct *p) +void force_sig(int sig, struct task_struct *p) { force_sig_info(sig, SEND_SIG_PRIV, p); } @@ -1470,8 +1469,7 @@ force_sig(int sig, struct task_struct *p) * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ -int -force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig, struct task_struct *p) { if (sig == SIGSEGV) { unsigned long flags; @@ -1480,7 +1478,6 @@ force_sigsegv(int sig, struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } force_sig(SIGSEGV, p); - return 0; } int force_sig_fault(int sig, int code, void __user *addr -- cgit From bb17fcca078fc56ea7c7611cbe92687021cf6a31 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 21:59:55 -0700 Subject: signal: make kill_as_cred_perm() return bool kill_as_cred_perm() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-3-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8a828baa0f93..5b0fb57b23f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1339,14 +1339,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -static int kill_as_cred_perm(const struct cred *cred, - struct task_struct *target) +static inline bool kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) { const struct cred *pcred = __task_cred(target); - if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && - !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) - return 0; - return 1; + + return uid_eq(cred->euid, pcred->suid) || + uid_eq(cred->euid, pcred->uid) || + uid_eq(cred->uid, pcred->suid) || + uid_eq(cred->uid, pcred->uid); } /* like kill_pid_info(), but doesn't use uid/euid of "current" */ -- cgit From 6527de953354e550bf6d941fdcd1aaf832317e8d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 21:59:59 -0700 Subject: signal: make may_ptrace_stop() return bool may_ptrace_stop() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-4-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5b0fb57b23f0..f03886cd064e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1875,10 +1875,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_unlock_irqrestore(&sighand->siglock, flags); } -static inline int may_ptrace_stop(void) +static inline bool may_ptrace_stop(void) { if (!likely(current->ptrace)) - return 0; + return false; /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping @@ -1894,9 +1894,9 @@ static inline int may_ptrace_stop(void) */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) - return 0; + return false; - return 1; + return true; } /* -- cgit From b1d294c80393e7ba18b3c646fe9d8e6a206c7988 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:02 -0700 Subject: signal: make do_sigpending() void do_sigpending() returned 0 unconditionally so it doesn't make sense to have it return at all. This allows us to simplify a bunch of syscall callers. Link: http://lkml.kernel.org/r/20180602103653.18181-5-christian@brauner.io Signed-off-by: Christian Brauner Acked-by: Al Viro Acked-by: Oleg Nesterov Reviewed-by: Andrew Morton Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f03886cd064e..10918c7b2ee3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2753,7 +2753,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(sigset_t *set) +static void do_sigpending(sigset_t *set) { spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, @@ -2762,7 +2762,6 @@ static int do_sigpending(sigset_t *set) /* Outside the lock because only this thread touches it. */ sigandsets(set, ¤t->blocked, set); - return 0; } /** @@ -2774,15 +2773,16 @@ static int do_sigpending(sigset_t *set) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sigsetsize)) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sigsetsize)) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT @@ -2790,15 +2790,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err) - err = put_compat_sigset(uset, &set, sigsetsize); - return err; + do_sigpending(&set); + + return put_compat_sigset(uset, &set, sigsetsize); } #endif @@ -3560,25 +3558,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; - int err; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sizeof(old_sigset_t))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set); - if (!err) - err = put_user(set.sig[0], set32); - return err; + + do_sigpending(&set); + + return put_user(set.sig[0], set32); } #endif -- cgit From d8f993b3dba05ff354f9b9592afc44b1b3dbfd46 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:07 -0700 Subject: signal: simplify rt_sigaction() The goto is not needed and does not add any clarity. Simply return -EINVAL on unexpected sigset_t struct size directly. Link: http://lkml.kernel.org/r/20180602103653.18181-6-christian@brauner.io Signed-off-by: Christian Brauner Acked-by: Oleg Nesterov Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 10918c7b2ee3..adeee5095039 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3648,25 +3648,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (act) { - if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) - return -EFAULT; - } + if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) + return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); + if (ret) + return ret; - if (!ret && oact) { - if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) - return -EFAULT; - } -out: - return ret; + if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, -- cgit From 2a9b90940957d3c0f744011b02b4575323be5947 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:11 -0700 Subject: signal: make kill_ok_by_cred() return bool kill_ok_by_cred() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-7-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index adeee5095039..0e1bb87415c4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -717,21 +717,16 @@ static inline bool si_fromuser(const struct siginfo *info) /* * called with RCU read lock from check_kill_permission() */ -static int kill_ok_by_cred(struct task_struct *t) +static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); - if (uid_eq(cred->euid, tcred->suid) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->uid, tcred->suid) || - uid_eq(cred->uid, tcred->uid)) - return 1; - - if (ns_capable(tcred->user_ns, CAP_KILL)) - return 1; - - return 0; + return uid_eq(cred->euid, tcred->suid) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->uid, tcred->suid) || + uid_eq(cred->uid, tcred->uid) || + ns_capable(tcred->user_ns, CAP_KILL); } /* -- cgit From e4a8b4efbfdf8ce86b1d410cc96d2790a05f1fd5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:15 -0700 Subject: signal: make sig_handler_ignored() return bool sig_handler_ignored() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-8-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0e1bb87415c4..a032a9bae5fb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -65,11 +65,11 @@ static void __user *sig_handler(struct task_struct *t, int sig) return t->sighand->action[sig - 1].sa.sa_handler; } -static int sig_handler_ignored(void __user *handler, int sig) +static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); + (handler == SIG_DFL && sig_kernel_ignore(sig)); } static int sig_task_ignored(struct task_struct *t, int sig, bool force) -- cgit From 41aaa481197dc566f691d403e0247bb53a017770 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:19 -0700 Subject: signal: make sig_task_ignored() return bool sig_task_ignored() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-9-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a032a9bae5fb..6e92adddd667 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -72,7 +72,7 @@ static inline bool sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, bool force) +static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; @@ -80,7 +80,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) - return 1; + return true; return sig_handler_ignored(handler, sig); } -- cgit From 6a0cdcd78892d4508e77cbec913eaf099ab4a8e9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:23 -0700 Subject: signal: make sig_ignored() return bool sig_ignored() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-10-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6e92adddd667..bcd4272639ca 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -85,7 +85,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, bool force) +static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) - return 0; + return false; /* * Tracers may want to know about even ignored signal unless it @@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) - return 0; + return false; return sig_task_ignored(t, sig, force); } -- cgit From 938696a82974c33b93be9657b72521bc84f5c587 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:27 -0700 Subject: signal: make has_pending_signals() return bool has_pending_signals() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-11-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bcd4272639ca..e00e96a7b24f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -110,7 +110,7 @@ static bool sig_ignored(struct task_struct *t, int sig, bool force) * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; -- cgit From 09ae854edb2d275d98a874c21c24c58cee4df14e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:30 -0700 Subject: signal: make recalc_sigpending_tsk() return bool recalc_sigpending_tsk() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-12-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e00e96a7b24f..7eec2dba597e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -138,20 +138,21 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); - return 1; + return true; } + /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ - return 0; + return false; } /* -- cgit From 67a48a24478841525ba73ec6d64caaf079cf3b7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:34 -0700 Subject: signal: make unhandled_signal() return bool unhandled_signal() already behaves like a boolean function. Let's actually declare it as such too. All callers treat it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-13-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 +- kernel/signal.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3c5200137b24..fa23cae66758 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -287,7 +287,7 @@ static inline void disallow_signal(int sig) extern struct kmem_cache *sighand_cachep; -int unhandled_signal(struct task_struct *tsk, int sig); +extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) diff --git a/kernel/signal.c b/kernel/signal.c index 7eec2dba597e..c10c09fc4ec3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -505,13 +505,15 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } -int unhandled_signal(struct task_struct *tsk, int sig) +bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) - return 1; + return true; + if (handler != SIG_IGN && handler != SIG_DFL) - return 0; + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } -- cgit From 8f11351eee9524c179361fcbf1538c993f2390e4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:38 -0700 Subject: signal: make flush_sigqueue_mask() void The return value of flush_sigqueue_mask() is never checked anywhere. Link: http://lkml.kernel.org/r/20180602103653.18181-14-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c10c09fc4ec3..fe0e9e82ce44 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -687,14 +687,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * * All callers must be holding the siglock. */ -static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) +static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) - return 0; + return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { @@ -703,7 +703,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) __sigqueue_free(q); } } - return 1; } static inline int is_si_special(const struct siginfo *info) -- cgit From acd14e62f075f44cd04e1ac2920a515f9b80146e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:42 -0700 Subject: signal: make wants_signal() return bool wants_signal() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-15-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fe0e9e82ce44..0e48dbc6649e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -879,16 +879,20 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ -static inline int wants_signal(int sig, struct task_struct *p) +static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) - return 0; + return false; + if (p->flags & PF_EXITING) - return 0; + return false; + if (sig == SIGKILL) - return 1; + return true; + if (task_is_stopped_or_traced(p)) - return 0; + return false; + return task_curr(p) || !signal_pending(p); } -- cgit From a19e2c01f71bb4b8442db450200a01a28db36a04 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:46 -0700 Subject: signal: make legacy_queue() return bool legacy_queue() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-16-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0e48dbc6649e..3de1ba2af032 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -972,7 +972,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) return; } -static inline int legacy_queue(struct sigpending *signals, int sig) +static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -- cgit From f99e9d8c5c7f5539e6d97fd04a4ad213e52daac0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:50 -0700 Subject: signal: make sigkill_pending() return bool sigkill_pending() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-17-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 3de1ba2af032..9f9b4183178e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1904,10 +1904,10 @@ static inline bool may_ptrace_stop(void) * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ -static int sigkill_pending(struct task_struct *tsk) +static bool sigkill_pending(struct task_struct *tsk) { - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); + return sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } /* -- cgit From 20ab7218d2507b2dbec9c053c2f1b96054e266b6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:54 -0700 Subject: signal: make get_signal() return bool make get_signal() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-18-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 +- kernel/signal.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index fa23cae66758..e3d9e0640e6e 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -265,7 +265,7 @@ extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; -extern int get_signal(struct ksignal *ksig); +extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); diff --git a/kernel/signal.c b/kernel/signal.c index 9f9b4183178e..786bacc60649 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2287,7 +2287,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal(struct ksignal *ksig) +bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2297,7 +2297,7 @@ int get_signal(struct ksignal *ksig) task_work_run(); if (unlikely(uprobe_deny_signal())) - return 0; + return false; /* * Do this once, we can't return to user-mode if freezing() == T. -- cgit From 06e62a46bbba20aa5286102016a04214bb446141 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 21 Aug 2018 22:00:58 -0700 Subject: fork: don't copy inconsistent signal handler state to child Before this change, if a multithreaded process forks while one of its threads is changing a signal handler using sigaction(), the memcpy() in copy_sighand() can race with the struct assignment in do_sigaction(). It isn't clear whether this can cause corruption of the userspace signal handler pointer, but it definitely can cause inconsistency between different fields of struct sigaction. Take the appropriate spinlock to avoid this. I have tested that this patch prevents inconsistency between sa_sigaction and sa_flags, which is possible before this patch. Link: http://lkml.kernel.org/r/20180702145108.73189-1-jannh@google.com Signed-off-by: Jann Horn Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Rik van Riel Cc: "Peter Zijlstra (Intel)" Cc: Kees Cook Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8bcef2413f1b..23eb960c701d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1427,7 +1427,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; atomic_set(&sig->count, 1); + spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + spin_unlock_irq(¤t->sighand->siglock); return 0; } -- cgit From 5f733e8a2dd130eaeed24cbaef49d349206cdb8b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 21 Aug 2018 22:01:06 -0700 Subject: kernel/sysctl.c: fix typos in comments Fix a few typos/spellos in kernel/sysctl.c. Link: http://lkml.kernel.org/r/bb09a8b9-f984-6dd4-b07b-3ecaf200862e@infradead.org Signed-off-by: Randy Dunlap Acked-by: Kees Cook Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dbdcb6673b27..71ceb6c13c1a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -225,7 +225,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ -/* Note: sysrq code uses it's own private copy */ +/* Note: sysrq code uses its own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(struct ctl_table *table, int write, @@ -1976,13 +1976,13 @@ static void warn_sysctl_write(struct ctl_table *table) } /** - * proc_first_pos_non_zero_ignore - check if firs position is allowed + * proc_first_pos_non_zero_ignore - check if first position is allowed * @ppos: file position * @table: the sysctl table * * Returns true if the first position is non-zero and the sysctl_writes_strict * mode indicates this is not allowed for numeric input types. String proc - * hadlers can ignore the return value. + * handlers can ignore the return value. */ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, struct ctl_table *table) -- cgit From 30aba6656f61ed44cba445a3c0d38b296fa9e8f5 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Thu, 23 Aug 2018 17:00:35 -0700 Subject: namei: allow restricted O_CREAT of FIFOs and regular files Disallows open of FIFOs or regular files not owned by the user in world writable sticky directories, unless the owner is the same as that of the directory or the file is opened without the O_CREAT flag. The purpose is to make data spoofing attacks harder. This protection can be turned on and off separately for FIFOs and regular files via sysctl, just like the symlinks/hardlinks protection. This patch is based on Openwall's "HARDEN_FIFO" feature by Solar Designer. This is a brief list of old vulnerabilities that could have been prevented by this feature, some of them even allow for privilege escalation: CVE-2000-1134 CVE-2007-3852 CVE-2008-0525 CVE-2009-0416 CVE-2011-4834 CVE-2015-1838 CVE-2015-7442 CVE-2016-7489 This list is not meant to be complete. It's difficult to track down all vulnerabilities of this kind because they were often reported without any mention of this particular attack vector. In fact, before hardlinks/symlinks restrictions, fifos/regular files weren't the favorite vehicle to exploit them. [s.mesoraca16@gmail.com: fix bug reported by Dan Carpenter] Link: https://lkml.kernel.org/r/20180426081456.GA7060@mwanda Link: http://lkml.kernel.org/r/1524829819-11275-1-git-send-email-s.mesoraca16@gmail.com [keescook@chromium.org: drop pr_warn_ratelimited() in favor of audit changes in the future] [keescook@chromium.org: adjust commit subjet] Link: http://lkml.kernel.org/r/20180416175918.GA13494@beast Signed-off-by: Salvatore Mesoraca Signed-off-by: Kees Cook Suggested-by: Solar Designer Suggested-by: Kees Cook Cc: Al Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/fs.txt | 36 ++++++++++++++++++++++++++++++ fs/namei.c | 53 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/fs.h | 2 ++ kernel/sysctl.c | 18 +++++++++++++++ 4 files changed, 106 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 6c00c1e2743f..819caf8ca05f 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -34,7 +34,9 @@ Currently, these files are in /proc/sys/fs: - overflowgid - pipe-user-pages-hard - pipe-user-pages-soft +- protected_fifos - protected_hardlinks +- protected_regular - protected_symlinks - suid_dumpable - super-max @@ -182,6 +184,24 @@ applied. ============================================================== +protected_fifos: + +The intent of this protection is to avoid unintentional writes to +an attacker-controlled FIFO, where a program expected to create a regular +file. + +When set to "0", writing to FIFOs is unrestricted. + +When set to "1" don't allow O_CREAT open on FIFOs that we don't own +in world writable sticky directories, unless they are owned by the +owner of the directory. + +When set to "2" it also applies to group writable sticky directories. + +This protection is based on the restrictions in Openwall. + +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based @@ -202,6 +222,22 @@ This protection is based on the restrictions in Openwall and grsecurity. ============================================================== +protected_regular: + +This protection is similar to protected_fifos, but it +avoids writes to an attacker-controlled regular file, where a program +expected to create one. + +When set to "0", writing to regular files is unrestricted. + +When set to "1" don't allow O_CREAT open on regular files that we +don't own in world writable sticky directories, unless they are +owned by the owner of the directory. + +When set to "2" it also applies to group writable sticky directories. + +============================================================== + protected_symlinks: A long-standing class of security issues is the symlink-based diff --git a/fs/namei.c b/fs/namei.c index ae6aa9ae757c..0cab6494978c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -887,6 +887,8 @@ static inline void put_link(struct nameidata *nd) int sysctl_protected_symlinks __read_mostly = 0; int sysctl_protected_hardlinks __read_mostly = 0; +int sysctl_protected_fifos __read_mostly; +int sysctl_protected_regular __read_mostly; /** * may_follow_link - Check symlink following for unsafe situations @@ -1003,6 +1005,45 @@ static int may_linkat(struct path *link) return -EPERM; } +/** + * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory + * should be allowed, or not, on files that already + * exist. + * @dir: the sticky parent directory + * @inode: the inode of the file to open + * + * Block an O_CREAT open of a FIFO (or a regular file) when: + * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled + * - the file already exists + * - we are in a sticky directory + * - we don't own the file + * - the owner of the directory doesn't own the file + * - the directory is world writable + * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 + * the directory doesn't have to be world writable: being group writable will + * be enough. + * + * Returns 0 if the open is allowed, -ve on error. + */ +static int may_create_in_sticky(struct dentry * const dir, + struct inode * const inode) +{ + if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || + (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || + likely(!(dir->d_inode->i_mode & S_ISVTX)) || + uid_eq(inode->i_uid, dir->d_inode->i_uid) || + uid_eq(current_fsuid(), inode->i_uid)) + return 0; + + if (likely(dir->d_inode->i_mode & 0002) || + (dir->d_inode->i_mode & 0020 && + ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || + (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { + return -EACCES; + } + return 0; +} + static __always_inline const char *get_link(struct nameidata *nd) { @@ -3348,9 +3389,15 @@ finish_open: if (error) return error; audit_inode(nd->name, nd->path.dentry, 0); - error = -EISDIR; - if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) - goto out; + if (open_flag & O_CREAT) { + error = -EISDIR; + if (d_is_dir(nd->path.dentry)) + goto out; + error = may_create_in_sticky(dir, + d_backing_inode(nd->path.dentry)); + if (unlikely(error)) + goto out; + } error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index e5710541183b..33322702c910 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,8 @@ extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; +extern int sysctl_protected_fifos; +extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71ceb6c13c1a..cc02050fd0c4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1807,6 +1807,24 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, { .procname = "suid_dumpable", .data = &suid_dumpable, -- cgit From 3723c63247854c97fe044c12a40e29043e9bbc1b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Aug 2018 17:01:26 -0700 Subject: treewide: convert ISO_8859-1 text comments to utf-8 Almost all files in the kernel are either plain text or UTF-8 encoded. A couple however are ISO_8859-1, usually just a few characters in a C comments, for historic reasons. This converts them all to UTF-8 for consistency. Link: http://lkml.kernel.org/r/20180724111600.4158975-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Simon Horman [IPVS portion] Acked-by: Jonathan Cameron [IIO] Acked-by: Michael Ellerman [powerpc] Acked-by: Rob Herring Cc: Joe Perches Cc: Arnd Bergmann Cc: Samuel Ortiz Cc: "David S. Miller" Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../devicetree/bindings/net/nfc/pn544.txt | 2 +- arch/arm/boot/dts/sun4i-a10-inet97fv2.dts | 2 +- arch/arm/crypto/sha256_glue.c | 2 +- arch/arm/crypto/sha256_neon_glue.c | 4 +- drivers/crypto/vmx/ghashp8-ppc.pl | 12 +-- drivers/iio/dac/ltc2632.c | 2 +- drivers/power/reset/ltc2952-poweroff.c | 4 +- kernel/events/callchain.c | 2 +- net/netfilter/ipvs/Kconfig | 8 +- net/netfilter/ipvs/ip_vs_mh.c | 4 +- tools/power/cpupower/po/de.po | 44 ++++---- tools/power/cpupower/po/fr.po | 120 ++++++++++----------- 12 files changed, 103 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/Documentation/devicetree/bindings/net/nfc/pn544.txt b/Documentation/devicetree/bindings/net/nfc/pn544.txt index 5b937403fed6..92f399ec22b8 100644 --- a/Documentation/devicetree/bindings/net/nfc/pn544.txt +++ b/Documentation/devicetree/bindings/net/nfc/pn544.txt @@ -2,7 +2,7 @@ Required properties: - compatible: Should be "nxp,pn544-i2c". -- clock-frequency: IC work frequency. +- clock-frequency: I²C work frequency. - reg: address on the bus - interrupts: GPIO interrupt to which the chip is connected - enable-gpios: Output GPIO pin used for enabling/disabling the PN544 diff --git a/arch/arm/boot/dts/sun4i-a10-inet97fv2.dts b/arch/arm/boot/dts/sun4i-a10-inet97fv2.dts index 5d096528e75a..71c27ea0b53e 100644 --- a/arch/arm/boot/dts/sun4i-a10-inet97fv2.dts +++ b/arch/arm/boot/dts/sun4i-a10-inet97fv2.dts @@ -1,7 +1,7 @@ /* * Copyright 2014 Open Source Support GmbH * - * David Lanzendrfer + * David Lanzendörfer * * This file is dual-licensed: you can use it either under the terms * of the GPL or the X11 license, at your option. Note that this dual diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c index bf8ccff2c9d0..0ae900e778f3 100644 --- a/arch/arm/crypto/sha256_glue.c +++ b/arch/arm/crypto/sha256_glue.c @@ -2,7 +2,7 @@ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation * using optimized ARM assembler and NEON instructions. * - * Copyright 2015 Google Inc. + * Copyright © 2015 Google Inc. * * This file is based on sha256_ssse3_glue.c: * Copyright (C) 2013 Intel Corporation diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c index 9bbee56fbdc8..1d82c6cd31a4 100644 --- a/arch/arm/crypto/sha256_neon_glue.c +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -2,10 +2,10 @@ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation * using NEON instructions. * - * Copyright 2015 Google Inc. + * Copyright © 2015 Google Inc. * * This file is based on sha512_neon_glue.c: - * Copyright 2014 Jussi Kivilinna + * Copyright © 2014 Jussi Kivilinna * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free diff --git a/drivers/crypto/vmx/ghashp8-ppc.pl b/drivers/crypto/vmx/ghashp8-ppc.pl index f746af271460..38b06503ede0 100644 --- a/drivers/crypto/vmx/ghashp8-ppc.pl +++ b/drivers/crypto/vmx/ghashp8-ppc.pl @@ -129,9 +129,9 @@ $code=<<___; le?vperm $IN,$IN,$IN,$lemask vxor $zero,$zero,$zero - vpmsumd $Xl,$IN,$Hl # H.loXi.lo - vpmsumd $Xm,$IN,$H # H.hiXi.lo+H.loXi.hi - vpmsumd $Xh,$IN,$Hh # H.hiXi.hi + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi vpmsumd $t2,$Xl,$xC2 # 1st phase @@ -187,11 +187,11 @@ $code=<<___; .align 5 Loop: subic $len,$len,16 - vpmsumd $Xl,$IN,$Hl # H.loXi.lo + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo subfe. r0,r0,r0 # borrow?-1:0 - vpmsumd $Xm,$IN,$H # H.hiXi.lo+H.loXi.hi + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi and r0,r0,$len - vpmsumd $Xh,$IN,$Hh # H.hiXi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi add $inp,$inp,r0 vpmsumd $t2,$Xl,$xC2 # 1st phase diff --git a/drivers/iio/dac/ltc2632.c b/drivers/iio/dac/ltc2632.c index 28e9b7656b20..15d498fc4758 100644 --- a/drivers/iio/dac/ltc2632.c +++ b/drivers/iio/dac/ltc2632.c @@ -1,7 +1,7 @@ /* * LTC2632 Digital to analog convertors spi driver * - * Copyright 2017 Maxime Roussin-Blanger + * Copyright 2017 Maxime Roussin-Bélanger * expanded by Silvan Murer * * Licensed under the GPL-2. diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index 6b911b6b10a6..c484584745bc 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -2,7 +2,7 @@ * LTC2952 (PowerPath) driver * * Copyright (C) 2014, Xsens Technologies BV - * Maintainer: Ren Moll + * Maintainer: René Moll * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -319,6 +319,6 @@ static struct platform_driver ltc2952_poweroff_driver = { module_platform_driver(ltc2952_poweroff_driver); -MODULE_AUTHOR("Ren Moll "); +MODULE_AUTHOR("René Moll "); MODULE_DESCRIPTION("LTC PowerPath power-off driver"); MODULE_LICENSE("GPL v2"); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c187aa3df3c8..24a77c34e9ad 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright 2009 Paul Mackerras, IBM Corp. + * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING */ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 05dc1b77e466..cad48d07c818 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -296,10 +296,10 @@ config IP_VS_MH_TAB_INDEX stored in a hash table. This table is assigned by a preference list of the positions to each destination until all slots in the table are filled. The index determines the prime for size of - the table as251, 509, 1021, 2039, 4093, 8191, 16381, 32749, - 65521 or 131071.When using weights to allow destinations to - receive more connections,the table is assigned an amount - proportional to the weights specified.The table needs to be large + the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, + 65521 or 131071. When using weights to allow destinations to + receive more connections, the table is assigned an amount + proportional to the weights specified. The table needs to be large enough to effectively fit all the destinations multiplied by their respective weights. diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index 0f795b186eb3..94d9d349ebb0 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -5,10 +5,10 @@ * */ -/* The mh algorithm is to assigna preference list of all the lookup +/* The mh algorithm is to assign a preference list of all the lookup * table positions to each destination and populate the table with * the most-preferred position of destinations. Then it is to select - * destination with the hash key of source IP addressthrough looking + * destination with the hash key of source IP address through looking * up a the lookup table. * * The algorithm is detailed in: diff --git a/tools/power/cpupower/po/de.po b/tools/power/cpupower/po/de.po index 78c09e51663a..840c17cc450a 100644 --- a/tools/power/cpupower/po/de.po +++ b/tools/power/cpupower/po/de.po @@ -323,12 +323,12 @@ msgstr " Hardwarebedingte Grenzen der Taktfrequenz: " #: utils/cpufreq-info.c:256 #, c-format msgid " available frequency steps: " -msgstr " mgliche Taktfrequenzen: " +msgstr " mögliche Taktfrequenzen: " #: utils/cpufreq-info.c:269 #, c-format msgid " available cpufreq governors: " -msgstr " mgliche Regler: " +msgstr " mögliche Regler: " #: utils/cpufreq-info.c:280 #, c-format @@ -381,7 +381,7 @@ msgstr "Optionen:\n" msgid " -e, --debug Prints out debug information [default]\n" msgstr "" " -e, --debug Erzeugt detaillierte Informationen, hilfreich\n" -" zum Aufspren von Fehlern\n" +" zum Aufspüren von Fehlern\n" #: utils/cpufreq-info.c:475 #, c-format @@ -424,7 +424,7 @@ msgstr " -p, --policy Findet die momentane Taktik heraus *\n" #: utils/cpufreq-info.c:482 #, c-format msgid " -g, --governors Determines available cpufreq governors *\n" -msgstr " -g, --governors Erzeugt eine Liste mit verfgbaren Reglern *\n" +msgstr " -g, --governors Erzeugt eine Liste mit verfügbaren Reglern *\n" #: utils/cpufreq-info.c:483 #, c-format @@ -450,7 +450,7 @@ msgstr "" #, c-format msgid " -s, --stats Shows cpufreq statistics if available\n" msgstr "" -" -s, --stats Zeigt, sofern mglich, Statistiken ber cpufreq an.\n" +" -s, --stats Zeigt, sofern möglich, Statistiken über cpufreq an.\n" #: utils/cpufreq-info.c:487 #, c-format @@ -473,9 +473,9 @@ msgid "" "cpufreq\n" " interface in 2.4. and early 2.6. kernels\n" msgstr "" -" -o, --proc Erzeugt Informationen in einem hnlichem Format zu " +" -o, --proc Erzeugt Informationen in einem ähnlichem Format zu " "dem\n" -" der /proc/cpufreq-Datei in 2.4. und frhen 2.6.\n" +" der /proc/cpufreq-Datei in 2.4. und frühen 2.6.\n" " Kernel-Versionen\n" #: utils/cpufreq-info.c:491 @@ -491,7 +491,7 @@ msgstr "" #: utils/cpufreq-info.c:492 utils/cpuidle-info.c:152 #, c-format msgid " -h, --help Prints out this screen\n" -msgstr " -h, --help Gibt diese Kurzbersicht aus\n" +msgstr " -h, --help Gibt diese Kurzübersicht aus\n" #: utils/cpufreq-info.c:495 #, c-format @@ -501,7 +501,7 @@ msgid "" msgstr "" "Sofern kein anderer Parameter als '-c, --cpu' angegeben wird, liefert " "dieses\n" -"Programm Informationen, die z.B. zum Berichten von Fehlern ntzlich sind.\n" +"Programm Informationen, die z.B. zum Berichten von Fehlern nützlich sind.\n" #: utils/cpufreq-info.c:497 #, c-format @@ -557,7 +557,7 @@ msgid "" "select\n" msgstr "" " -d FREQ, --min FREQ neue minimale Taktfrequenz, die der Regler\n" -" auswhlen darf\n" +" auswählen darf\n" #: utils/cpufreq-set.c:28 #, c-format @@ -566,7 +566,7 @@ msgid "" "select\n" msgstr "" " -u FREQ, --max FREQ neue maximale Taktfrequenz, die der Regler\n" -" auswhlen darf\n" +" auswählen darf\n" #: utils/cpufreq-set.c:29 #, c-format @@ -579,20 +579,20 @@ msgid "" " -f FREQ, --freq FREQ specific frequency to be set. Requires userspace\n" " governor to be available and loaded\n" msgstr "" -" -f FREQ, --freq FREQ setze exakte Taktfrequenz. Bentigt den Regler\n" +" -f FREQ, --freq FREQ setze exakte Taktfrequenz. Benötigt den Regler\n" " 'userspace'.\n" #: utils/cpufreq-set.c:32 #, c-format msgid " -r, --related Switches all hardware-related CPUs\n" msgstr "" -" -r, --related Setze Werte fr alle CPUs, deren Taktfrequenz\n" +" -r, --related Setze Werte für alle CPUs, deren Taktfrequenz\n" " hardwarebedingt identisch ist.\n" #: utils/cpufreq-set.c:33 utils/cpupower-set.c:28 utils/cpupower-info.c:27 #, c-format msgid " -h, --help Prints out this screen\n" -msgstr " -h, --help Gibt diese Kurzbersicht aus\n" +msgstr " -h, --help Gibt diese Kurzübersicht aus\n" #: utils/cpufreq-set.c:35 #, fuzzy, c-format @@ -618,8 +618,8 @@ msgstr "" " angenommen\n" "2. Der Parameter -f bzw. --freq kann mit keinem anderen als dem Parameter\n" " -c bzw. --cpu kombiniert werden\n" -"3. FREQuenzen knnen in Hz, kHz (Standard), MHz, GHz oder THz eingegeben\n" -" werden, indem der Wert und unmittelbar anschlieend (ohne Leerzeichen!)\n" +"3. FREQuenzen können in Hz, kHz (Standard), MHz, GHz oder THz eingegeben\n" +" werden, indem der Wert und unmittelbar anschließend (ohne Leerzeichen!)\n" " die Einheit angegeben werden. (Bsp: 1GHz )\n" " (FREQuenz in kHz =^ MHz * 1000 =^ GHz * 1000000).\n" @@ -638,7 +638,7 @@ msgid "" msgstr "" "Beim Einstellen ist ein Fehler aufgetreten. Typische Fehlerquellen sind:\n" "- nicht ausreichende Rechte (Administrator)\n" -"- der Regler ist nicht verfgbar bzw. nicht geladen\n" +"- der Regler ist nicht verfügbar bzw. nicht geladen\n" "- die angegebene Taktik ist inkorrekt\n" "- eine spezifische Frequenz wurde angegeben, aber der Regler 'userspace'\n" " kann entweder hardwarebedingt nicht genutzt werden oder ist nicht geladen\n" @@ -821,7 +821,7 @@ msgstr "" #: utils/cpuidle-info.c:48 #, fuzzy, c-format msgid "Available idle states:" -msgstr " mgliche Taktfrequenzen: " +msgstr " mögliche Taktfrequenzen: " #: utils/cpuidle-info.c:71 #, c-format @@ -924,7 +924,7 @@ msgstr "Aufruf: cpufreq-info [Optionen]\n" msgid " -s, --silent Only show general C-state information\n" msgstr "" " -e, --debug Erzeugt detaillierte Informationen, hilfreich\n" -" zum Aufspren von Fehlern\n" +" zum Aufspüren von Fehlern\n" #: utils/cpuidle-info.c:150 #, fuzzy, c-format @@ -933,9 +933,9 @@ msgid "" "acpi/processor/*/power\n" " interface in older kernels\n" msgstr "" -" -o, --proc Erzeugt Informationen in einem hnlichem Format zu " +" -o, --proc Erzeugt Informationen in einem ähnlichem Format zu " "dem\n" -" der /proc/cpufreq-Datei in 2.4. und frhen 2.6.\n" +" der /proc/cpufreq-Datei in 2.4. und frühen 2.6.\n" " Kernel-Versionen\n" #: utils/cpuidle-info.c:209 @@ -949,7 +949,7 @@ msgstr "" #~ " -c CPU, --cpu CPU CPU number which information shall be determined " #~ "about\n" #~ msgstr "" -#~ " -c CPU, --cpu CPU Nummer der CPU, ber die Informationen " +#~ " -c CPU, --cpu CPU Nummer der CPU, über die Informationen " #~ "herausgefunden werden sollen\n" #~ msgid "" diff --git a/tools/power/cpupower/po/fr.po b/tools/power/cpupower/po/fr.po index 245ad20a9bf9..b46ca2548f86 100644 --- a/tools/power/cpupower/po/fr.po +++ b/tools/power/cpupower/po/fr.po @@ -212,7 +212,7 @@ msgstr "" #: utils/cpupower.c:91 #, c-format msgid "Report errors and bugs to %s, please.\n" -msgstr "Veuillez rapportez les erreurs et les bogues %s, s'il vous plait.\n" +msgstr "Veuillez rapportez les erreurs et les bogues à %s, s'il vous plait.\n" #: utils/cpupower.c:114 #, c-format @@ -227,14 +227,14 @@ msgstr "" #: utils/cpufreq-info.c:31 #, c-format msgid "Couldn't count the number of CPUs (%s: %s), assuming 1\n" -msgstr "Dtermination du nombre de CPUs (%s : %s) impossible. Assume 1\n" +msgstr "Détermination du nombre de CPUs (%s : %s) impossible. Assume 1\n" #: utils/cpufreq-info.c:63 #, c-format msgid "" " minimum CPU frequency - maximum CPU frequency - governor\n" msgstr "" -" Frquence CPU minimale - Frquence CPU maximale - rgulateur\n" +" Fréquence CPU minimale - Fréquence CPU maximale - régulateur\n" #: utils/cpufreq-info.c:151 #, c-format @@ -302,12 +302,12 @@ msgstr " pilote : %s\n" #: utils/cpufreq-info.c:219 #, fuzzy, c-format msgid " CPUs which run at the same hardware frequency: " -msgstr " CPUs qui doivent changer de frquences en mme temps : " +msgstr " CPUs qui doivent changer de fréquences en même temps : " #: utils/cpufreq-info.c:230 #, fuzzy, c-format msgid " CPUs which need to have their frequency coordinated by software: " -msgstr " CPUs qui doivent changer de frquences en mme temps : " +msgstr " CPUs qui doivent changer de fréquences en même temps : " #: utils/cpufreq-info.c:241 #, c-format @@ -317,22 +317,22 @@ msgstr "" #: utils/cpufreq-info.c:247 #, c-format msgid " hardware limits: " -msgstr " limitation matrielle : " +msgstr " limitation matérielle : " #: utils/cpufreq-info.c:256 #, c-format msgid " available frequency steps: " -msgstr " plage de frquence : " +msgstr " plage de fréquence : " #: utils/cpufreq-info.c:269 #, c-format msgid " available cpufreq governors: " -msgstr " rgulateurs disponibles : " +msgstr " régulateurs disponibles : " #: utils/cpufreq-info.c:280 #, c-format msgid " current policy: frequency should be within " -msgstr " tactique actuelle : la frquence doit tre comprise entre " +msgstr " tactique actuelle : la fréquence doit être comprise entre " #: utils/cpufreq-info.c:282 #, c-format @@ -345,18 +345,18 @@ msgid "" "The governor \"%s\" may decide which speed to use\n" " within this range.\n" msgstr "" -"Le rgulateur \"%s\" est libre de choisir la vitesse\n" -" dans cette plage de frquences.\n" +"Le régulateur \"%s\" est libre de choisir la vitesse\n" +" dans cette plage de fréquences.\n" #: utils/cpufreq-info.c:293 #, c-format msgid " current CPU frequency is " -msgstr " la frquence actuelle de ce CPU est " +msgstr " la fréquence actuelle de ce CPU est " #: utils/cpufreq-info.c:296 #, c-format msgid " (asserted by call to hardware)" -msgstr " (vrifi par un appel direct du matriel)" +msgstr " (vérifié par un appel direct du matériel)" #: utils/cpufreq-info.c:304 #, c-format @@ -377,7 +377,7 @@ msgstr "Options :\n" #: utils/cpufreq-info.c:474 #, fuzzy, c-format msgid " -e, --debug Prints out debug information [default]\n" -msgstr " -e, --debug Afficher les informations de dboguage\n" +msgstr " -e, --debug Afficher les informations de déboguage\n" #: utils/cpufreq-info.c:475 #, c-format @@ -385,8 +385,8 @@ msgid "" " -f, --freq Get frequency the CPU currently runs at, according\n" " to the cpufreq core *\n" msgstr "" -" -f, --freq Obtenir la frquence actuelle du CPU selon le point\n" -" de vue du coeur du systme de cpufreq *\n" +" -f, --freq Obtenir la fréquence actuelle du CPU selon le point\n" +" de vue du coeur du système de cpufreq *\n" #: utils/cpufreq-info.c:477 #, c-format @@ -394,8 +394,8 @@ msgid "" " -w, --hwfreq Get frequency the CPU currently runs at, by reading\n" " it from hardware (only available to root) *\n" msgstr "" -" -w, --hwfreq Obtenir la frquence actuelle du CPU directement par\n" -" le matriel (doit tre root) *\n" +" -w, --hwfreq Obtenir la fréquence actuelle du CPU directement par\n" +" le matériel (doit être root) *\n" #: utils/cpufreq-info.c:479 #, c-format @@ -403,13 +403,13 @@ msgid "" " -l, --hwlimits Determine the minimum and maximum CPU frequency " "allowed *\n" msgstr "" -" -l, --hwlimits Affiche les frquences minimales et maximales du CPU " +" -l, --hwlimits Affiche les fréquences minimales et maximales du CPU " "*\n" #: utils/cpufreq-info.c:480 #, c-format msgid " -d, --driver Determines the used cpufreq kernel driver *\n" -msgstr " -d, --driver Affiche le pilote cpufreq utilis *\n" +msgstr " -d, --driver Affiche le pilote cpufreq utilisé *\n" #: utils/cpufreq-info.c:481 #, c-format @@ -420,7 +420,7 @@ msgstr " -p, --policy Affiche la tactique actuelle de cpufreq *\n" #, c-format msgid " -g, --governors Determines available cpufreq governors *\n" msgstr "" -" -g, --governors Affiche les rgulateurs disponibles de cpufreq *\n" +" -g, --governors Affiche les régulateurs disponibles de cpufreq *\n" #: utils/cpufreq-info.c:483 #, fuzzy, c-format @@ -429,7 +429,7 @@ msgid "" "frequency *\n" msgstr "" " -a, --affected-cpus Affiche quels sont les CPUs qui doivent changer de\n" -" frquences en mme temps *\n" +" fréquences en même temps *\n" #: utils/cpufreq-info.c:484 #, fuzzy, c-format @@ -438,7 +438,7 @@ msgid "" " coordinated by software *\n" msgstr "" " -a, --affected-cpus Affiche quels sont les CPUs qui doivent changer de\n" -" frquences en mme temps *\n" +" fréquences en même temps *\n" #: utils/cpufreq-info.c:486 #, c-format @@ -453,7 +453,7 @@ msgid "" " -y, --latency Determines the maximum latency on CPU frequency " "changes *\n" msgstr "" -" -l, --hwlimits Affiche les frquences minimales et maximales du CPU " +" -l, --hwlimits Affiche les fréquences minimales et maximales du CPU " "*\n" #: utils/cpufreq-info.c:488 @@ -469,7 +469,7 @@ msgid "" " interface in 2.4. and early 2.6. kernels\n" msgstr "" " -o, --proc Affiche les informations en utilisant l'interface\n" -" fournie par /proc/cpufreq, prsente dans les " +" fournie par /proc/cpufreq, présente dans les " "versions\n" " 2.4 et les anciennes versions 2.6 du noyau\n" @@ -485,7 +485,7 @@ msgstr "" #: utils/cpufreq-info.c:492 utils/cpuidle-info.c:152 #, c-format msgid " -h, --help Prints out this screen\n" -msgstr " -h, --help affiche l'aide-mmoire\n" +msgstr " -h, --help affiche l'aide-mémoire\n" #: utils/cpufreq-info.c:495 #, c-format @@ -493,8 +493,8 @@ msgid "" "If no argument or only the -c, --cpu parameter is given, debug output about\n" "cpufreq is printed which is useful e.g. for reporting bugs.\n" msgstr "" -"Par dfaut, les informations de dboguage seront affiches si aucun\n" -"argument, ou bien si seulement l'argument -c (--cpu) est donn, afin de\n" +"Par défaut, les informations de déboguage seront affichées si aucun\n" +"argument, ou bien si seulement l'argument -c (--cpu) est donné, afin de\n" "faciliter les rapports de bogues par exemple\n" #: utils/cpufreq-info.c:497 @@ -517,8 +517,8 @@ msgid "" "You can't specify more than one --cpu parameter and/or\n" "more than one output-specific argument\n" msgstr "" -"On ne peut indiquer plus d'un paramtre --cpu, tout comme l'on ne peut\n" -"spcifier plus d'un argument de formatage\n" +"On ne peut indiquer plus d'un paramètre --cpu, tout comme l'on ne peut\n" +"spécifier plus d'un argument de formatage\n" #: utils/cpufreq-info.c:600 utils/cpufreq-set.c:82 utils/cpupower-set.c:42 #: utils/cpupower-info.c:42 utils/cpuidle-info.c:213 @@ -529,7 +529,7 @@ msgstr "option invalide\n" #: utils/cpufreq-info.c:617 #, c-format msgid "couldn't analyze CPU %d as it doesn't seem to be present\n" -msgstr "analyse du CPU %d impossible puisqu'il ne semble pas tre prsent\n" +msgstr "analyse du CPU %d impossible puisqu'il ne semble pas être présent\n" #: utils/cpufreq-info.c:620 utils/cpupower-info.c:142 #, c-format @@ -547,8 +547,8 @@ msgid "" " -d FREQ, --min FREQ new minimum CPU frequency the governor may " "select\n" msgstr "" -" -d FREQ, --min FREQ nouvelle frquence minimale du CPU utiliser\n" -" par le rgulateur\n" +" -d FREQ, --min FREQ nouvelle fréquence minimale du CPU à utiliser\n" +" par le régulateur\n" #: utils/cpufreq-set.c:28 #, c-format @@ -556,13 +556,13 @@ msgid "" " -u FREQ, --max FREQ new maximum CPU frequency the governor may " "select\n" msgstr "" -" -u FREQ, --max FREQ nouvelle frquence maximale du CPU utiliser\n" -" par le rgulateur\n" +" -u FREQ, --max FREQ nouvelle fréquence maximale du CPU à utiliser\n" +" par le régulateur\n" #: utils/cpufreq-set.c:29 #, c-format msgid " -g GOV, --governor GOV new cpufreq governor\n" -msgstr " -g GOV, --governor GOV active le rgulateur GOV\n" +msgstr " -g GOV, --governor GOV active le régulateur GOV\n" #: utils/cpufreq-set.c:30 #, c-format @@ -570,9 +570,9 @@ msgid "" " -f FREQ, --freq FREQ specific frequency to be set. Requires userspace\n" " governor to be available and loaded\n" msgstr "" -" -f FREQ, --freq FREQ fixe la frquence du processeur FREQ. Il faut\n" -" que le rgulateur userspace soit disponible \n" -" et activ.\n" +" -f FREQ, --freq FREQ fixe la fréquence du processeur à FREQ. Il faut\n" +" que le régulateur « userspace » soit disponible \n" +" et activé.\n" #: utils/cpufreq-set.c:32 #, c-format @@ -582,7 +582,7 @@ msgstr "" #: utils/cpufreq-set.c:33 utils/cpupower-set.c:28 utils/cpupower-info.c:27 #, fuzzy, c-format msgid " -h, --help Prints out this screen\n" -msgstr " -h, --help affiche l'aide-mmoire\n" +msgstr " -h, --help affiche l'aide-mémoire\n" #: utils/cpufreq-set.c:35 #, fuzzy, c-format @@ -602,11 +602,11 @@ msgid "" " (FREQuency in kHz =^ Hz * 0.001 =^ MHz * 1000 =^ GHz * 1000000).\n" msgstr "" "Remarque :\n" -"1. Le CPU numro 0 sera utilis par dfaut si -c (ou --cpu) est omis ;\n" -"2. l'argument -f FREQ (ou --freq FREQ) ne peut tre utilis qu'avec --cpu ;\n" -"3. on pourra prciser l'unit des frquences en postfixant sans aucune " +"1. Le CPU numéro 0 sera utilisé par défaut si -c (ou --cpu) est omis ;\n" +"2. l'argument -f FREQ (ou --freq FREQ) ne peut être utilisé qu'avec --cpu ;\n" +"3. on pourra préciser l'unité des fréquences en postfixant sans aucune " "espace\n" -" les valeurs par hz, kHz (par dfaut), MHz, GHz ou THz\n" +" les valeurs par hz, kHz (par défaut), MHz, GHz ou THz\n" " (kHz =^ Hz * 0.001 =^ MHz * 1000 =^ GHz * 1000000).\n" #: utils/cpufreq-set.c:57 @@ -622,21 +622,21 @@ msgid "" "frequency\n" " or because the userspace governor isn't loaded?\n" msgstr "" -"En ajustant les nouveaux paramtres, une erreur est apparue. Les sources\n" +"En ajustant les nouveaux paramètres, une erreur est apparue. Les sources\n" "d'erreur typique sont :\n" -"- droit d'administration insuffisant (tes-vous root ?) ;\n" -"- le rgulateur choisi n'est pas disponible, ou bien n'est pas disponible " +"- droit d'administration insuffisant (êtes-vous root ?) ;\n" +"- le régulateur choisi n'est pas disponible, ou bien n'est pas disponible " "en\n" " tant que module noyau ;\n" "- la tactique n'est pas disponible ;\n" -"- vous voulez utiliser l'option -f/--freq, mais le rgulateur userspace \n" -" n'est pas disponible, par exemple parce que le matriel ne le supporte\n" -" pas, ou bien n'est tout simplement pas charg.\n" +"- vous voulez utiliser l'option -f/--freq, mais le régulateur « userspace »\n" +" n'est pas disponible, par exemple parce que le matériel ne le supporte\n" +" pas, ou bien n'est tout simplement pas chargé.\n" #: utils/cpufreq-set.c:170 #, c-format msgid "wrong, unknown or unhandled CPU?\n" -msgstr "CPU inconnu ou non support ?\n" +msgstr "CPU inconnu ou non supporté ?\n" #: utils/cpufreq-set.c:302 #, c-format @@ -653,7 +653,7 @@ msgid "" "At least one parameter out of -f/--freq, -d/--min, -u/--max, and\n" "-g/--governor must be passed\n" msgstr "" -"L'un de ces paramtres est obligatoire : -f/--freq, -d/--min, -u/--max et\n" +"L'un de ces paramètres est obligatoire : -f/--freq, -d/--min, -u/--max et\n" "-g/--governor\n" #: utils/cpufreq-set.c:347 @@ -810,7 +810,7 @@ msgstr "" #: utils/cpuidle-info.c:48 #, fuzzy, c-format msgid "Available idle states:" -msgstr " plage de frquence : " +msgstr " plage de fréquence : " #: utils/cpuidle-info.c:71 #, c-format @@ -911,7 +911,7 @@ msgstr "Usage : cpufreq-info [options]\n" #: utils/cpuidle-info.c:149 #, fuzzy, c-format msgid " -s, --silent Only show general C-state information\n" -msgstr " -e, --debug Afficher les informations de dboguage\n" +msgstr " -e, --debug Afficher les informations de déboguage\n" #: utils/cpuidle-info.c:150 #, fuzzy, c-format @@ -921,7 +921,7 @@ msgid "" " interface in older kernels\n" msgstr "" " -o, --proc Affiche les informations en utilisant l'interface\n" -" fournie par /proc/cpufreq, prsente dans les " +" fournie par /proc/cpufreq, présente dans les " "versions\n" " 2.4 et les anciennes versions 2.6 du noyau\n" @@ -929,19 +929,19 @@ msgstr "" #, fuzzy, c-format msgid "You can't specify more than one output-specific argument\n" msgstr "" -"On ne peut indiquer plus d'un paramtre --cpu, tout comme l'on ne peut\n" -"spcifier plus d'un argument de formatage\n" +"On ne peut indiquer plus d'un paramètre --cpu, tout comme l'on ne peut\n" +"spécifier plus d'un argument de formatage\n" #~ msgid "" #~ " -c CPU, --cpu CPU CPU number which information shall be determined " #~ "about\n" #~ msgstr "" -#~ " -c CPU, --cpu CPU Numro du CPU pour lequel l'information sera " -#~ "affiche\n" +#~ " -c CPU, --cpu CPU Numéro du CPU pour lequel l'information sera " +#~ "affichée\n" #~ msgid "" #~ " -c CPU, --cpu CPU number of CPU where cpufreq settings shall be " #~ "modified\n" #~ msgstr "" -#~ " -c CPU, --cpu CPU numro du CPU prendre en compte pour les\n" +#~ " -c CPU, --cpu CPU numéro du CPU à prendre en compte pour les\n" #~ " changements\n" -- cgit From 2b7403035459c75e193c6b04a293e518a4212de0 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 23 Aug 2018 17:01:36 -0700 Subject: mm: Change return type int to vm_fault_t for fault handlers Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Ref-> commit 1c8f422059ae ("mm: change return type to vm_fault_t") The aim is to change the return type of finish_fault() and handle_mm_fault() to vm_fault_t type. As part of that clean up return type of all other recursively called functions have been changed to vm_fault_t type. The places from where handle_mm_fault() is getting invoked will be change to vm_fault_t type but in a separate patch. vmf_error() is the newly introduce inline function in 4.17-rc6. [akpm@linux-foundation.org: don't shadow outer local `ret' in __do_huge_pmd_anonymous_page()] Link: http://lkml.kernel.org/r/20180604171727.GA20279@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 6 +-- include/linux/huge_mm.h | 9 +++-- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 14 +++---- include/linux/oom.h | 2 +- include/linux/swapops.h | 5 ++- include/linux/userfaultfd_k.h | 5 ++- kernel/memremap.c | 2 +- mm/gup.c | 4 +- mm/huge_memory.c | 31 +++++++-------- mm/hugetlb.c | 29 +++++++------- mm/internal.h | 2 +- mm/khugepaged.c | 3 +- mm/memory.c | 90 ++++++++++++++++++++++--------------------- mm/shmem.c | 5 ++- 15 files changed, 106 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f649023b19b5..bfa0ec69f924 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -340,17 +340,15 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct vm_fault *vmf, unsigned long reason) +vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; - int ret; + vm_fault_t ret = VM_FAULT_SIGBUS; bool must_wait, return_to_userland; long blocking_state; - ret = VM_FAULT_SIGBUS; - /* * We don't do userfault handling for the final child pid update. * diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8a126259bc4..27e3e32135a8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,7 +6,7 @@ #include /* only for vma_is_dax() */ -extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); +extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); @@ -23,7 +23,7 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) } #endif -extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); +extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -216,7 +216,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags); -extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); +extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -321,7 +321,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, return NULL; } -static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) +static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, + pmd_t orig_pmd) { return 0; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c39d9170a8a0..6b68e345f0ca 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,7 +105,7 @@ void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, +vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, diff --git a/include/linux/mm.h b/include/linux/mm.h index a9e733b5fb76..8fcc36660de6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -728,10 +728,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); -int finish_fault(struct vm_fault *vmf); -int finish_mkwrite_fault(struct vm_fault *vmf); +vm_fault_t finish_fault(struct vm_fault *vmf); +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif /* @@ -1403,8 +1403,8 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU -extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags); +extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); @@ -1413,7 +1413,7 @@ void unmap_mapping_pages(struct address_space *mapping, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else -static inline int handle_mm_fault(struct vm_area_struct *vma, +static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { /* should never happen if there's no MMU */ @@ -2563,7 +2563,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ -static inline int vm_fault_to_errno(int vm_fault, int foll_flags) +static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; diff --git a/include/linux/oom.h b/include/linux/oom.h index 92f70e4c6252..69864a547663 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -88,7 +88,7 @@ static inline bool mm_is_oom_victim(struct mm_struct *mm) * * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise. */ -static inline int check_stable_address_space(struct mm_struct *mm) +static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) return VM_FAULT_SIGBUS; diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 568a3553d918..22af9d8a84ae 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -4,6 +4,7 @@ #include #include +#include /* * swapcache pages are stored in the swapper_space radix tree. We want to @@ -134,7 +135,7 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) return pfn_to_page(swp_offset(entry)); } -int device_private_entry_fault(struct vm_area_struct *vma, +vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, @@ -169,7 +170,7 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) return NULL; } -static inline int device_private_entry_fault(struct vm_area_struct *vma, +static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e091f0a11b11..37c9eba75c98 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -28,7 +28,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); +extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, @@ -77,7 +77,8 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) +static inline vm_fault_t handle_userfault(struct vm_fault *vmf, + unsigned long reason) { return VM_FAULT_SIGBUS; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 1f87ea6b6545..d57d58f77409 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -43,7 +43,7 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) pgoff += 1UL << order, order = order_at((res), pgoff)) #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -int device_private_entry_fault(struct vm_area_struct *vma, +vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, diff --git a/mm/gup.c b/mm/gup.c index fc5f98069f4e..1abc8b4afff6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -497,7 +497,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { unsigned int fault_flags = 0; - int ret; + vm_fault_t ret; /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) @@ -818,7 +818,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - int ret, major = 0; + vm_fault_t ret, major = 0; if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 78427af91de9..08b544383d74 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -541,14 +541,14 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, - gfp_t gfp) +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, + struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int ret = 0; + vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -584,15 +584,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - int ret; + vm_fault_t ret2; spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + ret2 = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); + return ret2; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -663,7 +663,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct vm_fault *vmf) +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; @@ -682,7 +682,7 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) pgtable_t pgtable; struct page *zero_page; bool set; - int ret; + vm_fault_t ret; pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -1118,15 +1118,16 @@ unlock: spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, - struct page *page) +static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, + pmd_t orig_pmd, struct page *page) { struct vm_area_struct *vma = vmf->vma; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; - int ret = 0, i; + int i; + vm_fault_t ret = 0; struct page **pages; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1236,7 +1237,7 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; @@ -1245,7 +1246,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ - int ret = 0; + vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1457,7 +1458,7 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9f1c853f67b5..3c21775f196b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3501,14 +3501,15 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, +static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, struct page *pagecache_page, spinlock_t *ptl) { pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int ret = 0, outside_reserve = 0; + int outside_reserve = 0; + vm_fault_t ret = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ unsigned long haddr = address & huge_page_mask(h); @@ -3572,8 +3573,7 @@ retry_avoidcopy: return 0; } - ret = (PTR_ERR(new_page) == -ENOMEM) ? - VM_FAULT_OOM : VM_FAULT_SIGBUS; + ret = vmf_error(PTR_ERR(new_page)); goto out_release_old; } @@ -3676,12 +3676,13 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } -static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, unsigned int flags) +static vm_fault_t hugetlb_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); - int ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct page *page; @@ -3744,11 +3745,7 @@ retry: page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { - ret = PTR_ERR(page); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -3872,12 +3869,12 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, } #endif -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, +vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pte_t *ptep, entry; spinlock_t *ptl; - int ret; + vm_fault_t ret; u32 hash; pgoff_t idx; struct page *page = NULL; @@ -4207,7 +4204,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { - int ret; + vm_fault_t ret; unsigned int fault_flags = 0; if (pte) diff --git a/mm/internal.h b/mm/internal.h index dab088cb6937..87256ae1bef8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,7 +38,7 @@ void page_writeback_init(void); -int do_swap_page(struct vm_fault *vmf); +vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 961cbe9062a5..a31d740e6cd1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -880,7 +880,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - int swapped_in = 0, ret = 0; + int swapped_in = 0; + vm_fault_t ret = 0; struct vm_fault vmf = { .vma = vma, .address = address, diff --git a/mm/memory.c b/mm/memory.c index 19f47d7b9b86..42ebdc33268e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2384,9 +2384,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { - int ret; + vm_fault_t ret; struct page *page = vmf->page; unsigned int old_flags = vmf->flags; @@ -2490,7 +2490,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct vm_fault *vmf) +static vm_fault_t wp_page_copy(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -2638,7 +2638,7 @@ oom: * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). */ -int finish_mkwrite_fault(struct vm_fault *vmf) +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, @@ -2659,12 +2659,12 @@ int finish_mkwrite_fault(struct vm_fault *vmf) * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct vm_fault *vmf) +static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - int ret; + vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; @@ -2677,7 +2677,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) return VM_FAULT_WRITE; } -static int wp_page_shared(struct vm_fault *vmf) +static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2685,7 +2685,7 @@ static int wp_page_shared(struct vm_fault *vmf) get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - int tmp; + vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); @@ -2728,7 +2728,7 @@ static int wp_page_shared(struct vm_fault *vmf) * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct vm_fault *vmf) +static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2904,7 +2904,7 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct vm_fault *vmf) +vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; @@ -2913,7 +2913,7 @@ int do_swap_page(struct vm_fault *vmf) pte_t pte; int locked; int exclusive = 0; - int ret = 0; + vm_fault_t ret = 0; if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; @@ -3124,12 +3124,12 @@ out_release: * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct vm_fault *vmf) +static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; - int ret = 0; + vm_fault_t ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -3239,10 +3239,10 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct vm_fault *vmf) +static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | @@ -3276,7 +3276,7 @@ static int pmd_devmap_trans_unstable(pmd_t *pmd) return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); } -static int pte_alloc_one_map(struct vm_fault *vmf) +static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3352,13 +3352,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static int do_set_pmd(struct vm_fault *vmf, struct page *page) +static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; - int i, ret; + int i; + vm_fault_t ret; if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; @@ -3408,7 +3409,7 @@ out: return ret; } #else -static int do_set_pmd(struct vm_fault *vmf, struct page *page) +static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3429,13 +3430,13 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page) * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; - int ret; + vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { @@ -3494,10 +3495,10 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). */ -int finish_fault(struct vm_fault *vmf) +vm_fault_t finish_fault(struct vm_fault *vmf) { struct page *page; - int ret = 0; + vm_fault_t ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3583,12 +3584,13 @@ late_initcall(fault_around_debugfs); * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ -static int do_fault_around(struct vm_fault *vmf) +static vm_fault_t do_fault_around(struct vm_fault *vmf) { unsigned long address = vmf->address, nr_pages, mask; pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; - int off, ret = 0; + int off; + vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; @@ -3638,10 +3640,10 @@ out: return ret; } -static int do_read_fault(struct vm_fault *vmf) +static vm_fault_t do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret = 0; + vm_fault_t ret = 0; /* * Let's call ->map_pages() first and use ->fault() as fallback @@ -3665,10 +3667,10 @@ static int do_read_fault(struct vm_fault *vmf) return ret; } -static int do_cow_fault(struct vm_fault *vmf) +static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -3704,10 +3706,10 @@ uncharge_out: return ret; } -static int do_shared_fault(struct vm_fault *vmf) +static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret, tmp; + vm_fault_t ret, tmp; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3745,10 +3747,10 @@ static int do_shared_fault(struct vm_fault *vmf) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct vm_fault *vmf) +static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) @@ -3783,7 +3785,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct vm_fault *vmf) +static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; @@ -3873,7 +3875,7 @@ out: return 0; } -static inline int create_huge_pmd(struct vm_fault *vmf) +static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); @@ -3883,7 +3885,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf) } /* `inline' is required to avoid gcc 4.1.2 build error */ -static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); @@ -3902,7 +3904,7 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } -static int create_huge_pud(struct vm_fault *vmf) +static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ @@ -3914,7 +3916,7 @@ static int create_huge_pud(struct vm_fault *vmf) return VM_FAULT_FALLBACK; } -static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ @@ -3941,7 +3943,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct vm_fault *vmf) +static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; @@ -4029,8 +4031,8 @@ unlock: * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, @@ -4043,7 +4045,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; - int ret; + vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); @@ -4118,10 +4120,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, +vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - int ret; + vm_fault_t ret; __set_current_state(TASK_RUNNING); diff --git a/mm/shmem.c b/mm/shmem.c index fb04baacc9fa..0376c124b043 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -124,7 +124,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, int *fault_type); + struct vm_fault *vmf, vm_fault_t *fault_type); int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) @@ -1620,7 +1620,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type) + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); -- cgit From 976516404ff3fab2a8caa8bd6f5efc1437fed0b8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 14:02:57 +0200 Subject: y2038: remove unused time interfaces After many small patches, at least some of the deprecated interfaces have no remaining users any more and can be removed: current_kernel_time do_settimeofday get_monotonic_boottime get_monotonic_boottime64 get_monotonic_coarse get_monotonic_coarse64 getrawmonotonic64 ktime_get_real_ts timekeeping_clocktai timespec_trunc timespec_valid_strict time_to_tm For many of the remaining time functions, we are missing one or two patches that failed to make it into 4.19, they will be removed in the following merge window. The replacement functions for the removed interfaces are documented in Documentation/core-api/timekeeping.rst. Signed-off-by: Arnd Bergmann --- include/linux/time32.h | 25 ---------------------- include/linux/timekeeping.h | 12 ----------- include/linux/timekeeping32.h | 50 ------------------------------------------- kernel/time/time.c | 24 --------------------- 4 files changed, 111 deletions(-) (limited to 'kernel') diff --git a/include/linux/time32.h b/include/linux/time32.h index d1ae43c13e25..92c0ca092d93 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -105,16 +105,6 @@ static inline bool timespec_valid(const struct timespec *ts) return true; } -static inline bool timespec_valid_strict(const struct timespec *ts) -{ - if (!timespec_valid(ts)) - return false; - /* Disallow values that could overflow ktime_t */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) - return false; - return true; -} - /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: pointer to the timespec variable to be converted @@ -149,19 +139,6 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) a->tv_nsec = ns; } -/** - * time_to_tm - converts the calendar time to local broken-down time - * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, - * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time - */ -static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) -{ - time64_to_tm(totalsecs, offset, result); -} - static inline unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, @@ -183,8 +160,6 @@ static inline bool timeval_valid(const struct timeval *tv) return true; } -extern struct timespec timespec_trunc(struct timespec t, unsigned int gran); - /** * timeval_to_ns - Convert timeval to nanoseconds * @ts: pointer to the timeval variable to be converted diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 5d738804e3d6..3faba06411aa 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -266,9 +266,6 @@ extern int update_persistent_clock64(struct timespec64 now); * deprecated aliases, don't use in new code */ #define getnstimeofday64(ts) ktime_get_real_ts64(ts) -#define get_monotonic_boottime64(ts) ktime_get_boottime_ts64(ts) -#define getrawmonotonic64(ts) ktime_get_raw_ts64(ts) -#define timekeeping_clocktai64(ts) ktime_get_clocktai_ts64(ts) static inline struct timespec64 current_kernel_time64(void) { @@ -279,13 +276,4 @@ static inline struct timespec64 current_kernel_time64(void) return ts; } -static inline struct timespec64 get_monotonic_coarse64(void) -{ - struct timespec64 ts; - - ktime_get_coarse_ts64(&ts); - - return ts; -} - #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index 8762c2f45f8b..a4a4991160fb 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -9,26 +9,6 @@ extern void do_gettimeofday(struct timeval *tv); unsigned long get_seconds(void); -static inline struct timespec current_kernel_time(void) -{ - struct timespec64 ts64; - - ktime_get_coarse_real_ts64(&ts64); - - return timespec64_to_timespec(ts64); -} - -/** - * Deprecated. Use do_settimeofday64(). - */ -static inline int do_settimeofday(const struct timespec *ts) -{ - struct timespec64 ts64; - - ts64 = timespec_to_timespec64(*ts); - return do_settimeofday64(&ts64); -} - static inline void getnstimeofday(struct timespec *ts) { struct timespec64 ts64; @@ -45,14 +25,6 @@ static inline void ktime_get_ts(struct timespec *ts) *ts = timespec64_to_timespec(ts64); } -static inline void ktime_get_real_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_real_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - static inline void getrawmonotonic(struct timespec *ts) { struct timespec64 ts64; @@ -61,15 +33,6 @@ static inline void getrawmonotonic(struct timespec *ts) *ts = timespec64_to_timespec(ts64); } -static inline struct timespec get_monotonic_coarse(void) -{ - struct timespec64 ts64; - - ktime_get_coarse_ts64(&ts64); - - return timespec64_to_timespec(ts64); -} - static inline void getboottime(struct timespec *ts) { struct timespec64 ts64; @@ -78,19 +41,6 @@ static inline void getboottime(struct timespec *ts) *ts = timespec64_to_timespec(ts64); } -/* - * Timespec interfaces utilizing the ktime based ones - */ -static inline void get_monotonic_boottime(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_boottime()); -} - -static inline void timekeeping_clocktai(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_clocktai()); -} - /* * Persistent clock related interfaces */ diff --git a/kernel/time/time.c b/kernel/time/time.c index ccdb351277ee..712543011106 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -342,30 +342,6 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - /* * mktime64 - Converts date to seconds. * Converts Gregorian date to seconds since 1970-01-01 00:00:00. -- cgit From 33e26418193f58d1895f2f968e1953b1caf8deb7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 15:18:20 +0200 Subject: y2038: make do_gettimeofday() and get_seconds() inline get_seconds() and do_gettimeofday() are only used by a few modules now any more (waiting for the respective patches to get accepted), and they are among the last holdouts of code that is not y2038 safe in the core kernel. Move the implementation into the timekeeping32.h header to clean up the core kernel and isolate the old interfaces further. Signed-off-by: Arnd Bergmann --- include/linux/timekeeping32.h | 15 +++++++++++++-- kernel/time/time.c | 15 +++++++++------ kernel/time/timekeeping.c | 24 ------------------------ 3 files changed, 22 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index a4a4991160fb..a502616f7e1c 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -6,8 +6,19 @@ * over time so we can remove the file here. */ -extern void do_gettimeofday(struct timeval *tv); -unsigned long get_seconds(void); +static inline void do_gettimeofday(struct timeval *tv) +{ + struct timespec64 now; + + ktime_get_real_ts64(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +static inline unsigned long get_seconds(void) +{ + return ktime_get_real_seconds(); +} static inline void getnstimeofday(struct timespec *ts) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 712543011106..de332250d6fa 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -144,9 +144,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { @@ -227,10 +229,11 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, struct timezone __user *, tz) { if (tv) { - struct timeval ktv; + struct timespec64 ts; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f3b22f456fac..2d110c948805 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1211,22 +1211,6 @@ int get_device_system_crosststamp(int (*get_time_fn) } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec64 now; - - getnstimeofday64(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} -EXPORT_SYMBOL(do_gettimeofday); - /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time @@ -2174,14 +2158,6 @@ void getboottime64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(getboottime64); -unsigned long get_seconds(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return tk->xtime_sec; -} -EXPORT_SYMBOL(get_seconds); - void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit From 9afc5eee65ca7d717a99d6fe8f4adfe32a40940a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 12:52:28 +0200 Subject: y2038: globally rename compat_time to old_time32 Christoph Hellwig suggested a slightly different path for handling backwards compatibility with the 32-bit time_t based system calls: Rather than simply reusing the compat_sys_* entry points on 32-bit architectures unchanged, we get rid of those entry points and the compat_time types by renaming them to something that makes more sense on 32-bit architectures (which don't have a compat mode otherwise), and then share the entry points under the new name with the 64-bit architectures that use them for implementing the compatibility. The following types and interfaces are renamed here, and moved from linux/compat_time.h to linux/time32.h: old new --- --- compat_time_t old_time32_t struct compat_timeval struct old_timeval32 struct compat_timespec struct old_timespec32 struct compat_itimerspec struct old_itimerspec32 ns_to_compat_timeval() ns_to_old_timeval32() get_compat_itimerspec64() get_old_itimerspec32() put_compat_itimerspec64() put_old_itimerspec32() compat_get_timespec64() get_old_timespec32() compat_put_timespec64() put_old_timespec32() As we already have aliases in place, this patch addresses only the instances that are relevant to the system call interface in particular, not those that occur in device drivers and other modules. Those will get handled separately, while providing the 64-bit version of the respective interfaces. I'm not renaming the timex, rusage and itimerval structures, as we are still debating what the new interface will look like, and whether we will need a replacement at all. This also doesn't change the names of the syscall entry points, which can be done more easily when we actually switch over the 32-bit architectures to use them, at that point we need to change COMPAT_SYSCALL_DEFINEx to SYSCALL_DEFINEx with a new name, e.g. with a _time32 suffix. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/lkml/20180705222110.GA5698@infradead.org/ Signed-off-by: Arnd Bergmann --- arch/arm64/include/asm/compat.h | 6 +-- arch/arm64/include/asm/stat.h | 2 +- arch/mips/include/asm/compat.h | 6 +-- arch/mips/kernel/binfmt_elfn32.c | 14 +++--- arch/mips/kernel/binfmt_elfo32.c | 14 +++--- arch/parisc/include/asm/compat.h | 6 +-- arch/powerpc/include/asm/compat.h | 6 +-- arch/powerpc/kernel/asm-offsets.c | 8 ++-- arch/powerpc/oprofile/backtrace.c | 2 +- arch/sparc/include/asm/compat.h | 6 +-- fs/aio.c | 8 ++-- fs/compat_binfmt_elf.c | 2 +- fs/select.c | 20 ++++---- fs/timerfd.c | 12 ++--- fs/utimes.c | 12 ++--- include/linux/compat.h | 96 +++++++++++++++++++-------------------- include/linux/compat_time.h | 32 ------------- include/linux/elfcore-compat.h | 8 ++-- include/linux/restart_block.h | 4 +- include/linux/syscalls.h | 2 +- include/linux/time32.h | 48 +++++++++++++++----- ipc/mqueue.c | 8 ++-- ipc/msg.c | 6 +-- ipc/sem.c | 10 ++-- ipc/shm.c | 6 +-- ipc/syscall.c | 2 +- ipc/util.h | 2 +- kernel/compat.c | 8 ++-- kernel/futex_compat.c | 2 +- kernel/sched/core.c | 4 +- kernel/signal.c | 2 +- kernel/time/hrtimer.c | 8 ++-- kernel/time/posix-stubs.c | 18 ++++---- kernel/time/posix-timers.c | 30 ++++++------ kernel/time/time.c | 58 +++++++++++------------ net/compat.c | 4 +- 36 files changed, 237 insertions(+), 245 deletions(-) delete mode 100644 include/linux/compat_time.h (limited to 'kernel') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 1a037b94eba1..4a89007db14a 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -86,11 +86,11 @@ struct compat_stat { compat_off_t st_size; compat_off_t st_blksize; compat_off_t st_blocks; - compat_time_t st_atime; + old_time32_t st_atime; compat_ulong_t st_atime_nsec; - compat_time_t st_mtime; + old_time32_t st_mtime; compat_ulong_t st_mtime_nsec; - compat_time_t st_ctime; + old_time32_t st_ctime; compat_ulong_t st_ctime_nsec; compat_ulong_t __unused4[2]; }; diff --git a/arch/arm64/include/asm/stat.h b/arch/arm64/include/asm/stat.h index eab738019707..397c6ccd04e7 100644 --- a/arch/arm64/include/asm/stat.h +++ b/arch/arm64/include/asm/stat.h @@ -20,7 +20,7 @@ #ifdef CONFIG_COMPAT -#include +#include #include /* diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 78675f19440f..7dcbd855814e 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -59,11 +59,11 @@ struct compat_stat { s32 st_pad2[2]; compat_off_t st_size; s32 st_pad3; - compat_time_t st_atime; + old_time32_t st_atime; s32 st_atime_nsec; - compat_time_t st_mtime; + old_time32_t st_mtime; s32 st_mtime_nsec; - compat_time_t st_ctime; + old_time32_t st_ctime; s32 st_ctime_nsec; s32 st_blksize; s32 st_blocks; diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 89b234844534..7a12763d553a 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -54,10 +54,10 @@ struct elf_prstatus32 pid_t pr_ppid; pid_t pr_pgrp; pid_t pr_sid; - struct compat_timeval pr_utime; /* User time */ - struct compat_timeval pr_stime; /* System time */ - struct compat_timeval pr_cutime;/* Cumulative user time */ - struct compat_timeval pr_cstime;/* Cumulative system time */ + struct old_timeval32 pr_utime; /* User time */ + struct old_timeval32 pr_stime; /* System time */ + struct old_timeval32 pr_cutime;/* Cumulative user time */ + struct old_timeval32 pr_cstime;/* Cumulative system time */ elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. */ }; @@ -81,9 +81,9 @@ struct elf_prpsinfo32 #define elf_caddr_t u32 #define init_elf_binfmt init_elfn32_binfmt -#define jiffies_to_timeval jiffies_to_compat_timeval +#define jiffies_to_timeval jiffies_to_old_timeval32 static __inline__ void -jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value) +jiffies_to_old_timeval32(unsigned long jiffies, struct old_timeval32 *value) { /* * Convert jiffies to nanoseconds and separate with @@ -101,6 +101,6 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value) #define TASK_SIZE TASK_SIZE32 #undef ns_to_timeval -#define ns_to_timeval ns_to_compat_timeval +#define ns_to_timeval ns_to_old_timeval32 #include "../../../fs/binfmt_elf.c" diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index a88c59db3d48..e6db06a1d31a 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -59,10 +59,10 @@ struct elf_prstatus32 pid_t pr_ppid; pid_t pr_pgrp; pid_t pr_sid; - struct compat_timeval pr_utime; /* User time */ - struct compat_timeval pr_stime; /* System time */ - struct compat_timeval pr_cutime;/* Cumulative user time */ - struct compat_timeval pr_cstime;/* Cumulative system time */ + struct old_timeval32 pr_utime; /* User time */ + struct old_timeval32 pr_stime; /* System time */ + struct old_timeval32 pr_cutime;/* Cumulative user time */ + struct old_timeval32 pr_cstime;/* Cumulative system time */ elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. */ }; @@ -86,9 +86,9 @@ struct elf_prpsinfo32 #define elf_caddr_t u32 #define init_elf_binfmt init_elf32_binfmt -#define jiffies_to_timeval jiffies_to_compat_timeval +#define jiffies_to_timeval jiffies_to_old_timeval32 static inline void -jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value) +jiffies_to_old_timeval32(unsigned long jiffies, struct old_timeval32 *value) { /* * Convert jiffies to nanoseconds and separate with @@ -104,6 +104,6 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value) #define TASK_SIZE TASK_SIZE32 #undef ns_to_timeval -#define ns_to_timeval ns_to_compat_timeval +#define ns_to_timeval ns_to_old_timeval32 #include "../../../fs/binfmt_elf.c" diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index ab8a54771507..32dadaf2b534 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -48,11 +48,11 @@ struct compat_stat { u16 st_reserved2; /* old st_gid */ compat_dev_t st_rdev; compat_off_t st_size; - compat_time_t st_atime; + old_time32_t st_atime; u32 st_atime_nsec; - compat_time_t st_mtime; + old_time32_t st_mtime; u32 st_mtime_nsec; - compat_time_t st_ctime; + old_time32_t st_ctime; u32 st_ctime_nsec; s32 st_blksize; s32 st_blocks; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 85c8af2bb272..93f79d1a03c3 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -55,11 +55,11 @@ struct compat_stat { compat_off_t st_size; compat_off_t st_blksize; compat_off_t st_blocks; - compat_time_t st_atime; + old_time32_t st_atime; u32 st_atime_nsec; - compat_time_t st_mtime; + old_time32_t st_mtime; u32 st_mtime_nsec; - compat_time_t st_ctime; + old_time32_t st_ctime; u32 st_ctime_nsec; u32 __unused4[2]; }; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 89cf15566c4e..041a115789a1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -387,12 +387,12 @@ int main(void) OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64); OFFSET(TVAL64_TV_SEC, timeval, tv_sec); OFFSET(TVAL64_TV_USEC, timeval, tv_usec); - OFFSET(TVAL32_TV_SEC, compat_timeval, tv_sec); - OFFSET(TVAL32_TV_USEC, compat_timeval, tv_usec); + OFFSET(TVAL32_TV_SEC, old_timeval32, tv_sec); + OFFSET(TVAL32_TV_USEC, old_timeval32, tv_usec); OFFSET(TSPC64_TV_SEC, timespec, tv_sec); OFFSET(TSPC64_TV_NSEC, timespec, tv_nsec); - OFFSET(TSPC32_TV_SEC, compat_timespec, tv_sec); - OFFSET(TSPC32_TV_NSEC, compat_timespec, tv_nsec); + OFFSET(TSPC32_TV_SEC, old_timespec32, tv_sec); + OFFSET(TSPC32_TV_NSEC, old_timespec32, tv_nsec); #else OFFSET(TVAL32_TV_SEC, timeval, tv_sec); OFFSET(TVAL32_TV_USEC, timeval, tv_usec); diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c index ad054dd0d666..5df6290d1ccc 100644 --- a/arch/powerpc/oprofile/backtrace.c +++ b/arch/powerpc/oprofile/backtrace.c @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. **/ -#include +#include #include #include #include diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 4eb51d2dae98..7018cb60beef 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -47,11 +47,11 @@ struct compat_stat { __compat_gid_t st_gid; compat_dev_t st_rdev; compat_off_t st_size; - compat_time_t st_atime; + old_time32_t st_atime; compat_ulong_t st_atime_nsec; - compat_time_t st_mtime; + old_time32_t st_mtime; compat_ulong_t st_mtime_nsec; - compat_time_t st_ctime; + old_time32_t st_ctime; compat_ulong_t st_ctime_nsec; compat_off_t st_blksize; compat_off_t st_blocks; diff --git a/fs/aio.c b/fs/aio.c index b9350f3360c6..301e6314183b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2135,12 +2135,12 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, - struct compat_timespec __user *, timeout) + struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; - if (timeout && compat_get_timespec64(&t, timeout)) + if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); @@ -2160,7 +2160,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, - struct compat_timespec __user *, timeout, + struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { NULL, }; @@ -2168,7 +2168,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, struct timespec64 t; int ret; - if (timeout && compat_get_timespec64(&t, timeout)) + if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 504b3c3539dc..15f6e96b3bd9 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -52,7 +52,7 @@ #define elf_prpsinfo compat_elf_prpsinfo #undef ns_to_timeval -#define ns_to_timeval ns_to_compat_timeval +#define ns_to_timeval ns_to_old_timeval32 /* * To use this file, asm/elf.h must define compat_elf_check_arch. diff --git a/fs/select.c b/fs/select.c index 4a6b6e4b21cb..22b3bf89f051 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1120,7 +1120,7 @@ int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user * ts.tv_sec = ts.tv_nsec = 0; if (timeval) { - struct compat_timeval rtv; + struct old_timeval32 rtv; rtv.tv_sec = ts.tv_sec; rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; @@ -1128,7 +1128,7 @@ int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user * if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { - if (!compat_put_timespec64(&ts, p)) + if (!put_old_timespec32(&ts, p)) return ret; } /* @@ -1257,10 +1257,10 @@ out_nofds: static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timeval __user *tvp) + struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; - struct compat_timeval tv; + struct old_timeval32 tv; int ret; if (tvp) { @@ -1282,7 +1282,7 @@ static int do_compat_select(int n, compat_ulong_t __user *inp, COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) + struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } @@ -1307,7 +1307,7 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + struct old_timespec32 __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { sigset_t ksigmask, sigsaved; @@ -1315,7 +1315,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, int ret; if (tsp) { - if (compat_get_timespec64(&ts, tsp)) + if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1355,7 +1355,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timespec __user *, tsp, void __user *, sig) + struct old_timespec32 __user *, tsp, void __user *, sig) { compat_size_t sigsetsize = 0; compat_uptr_t up = 0; @@ -1373,7 +1373,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, } COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, - unsigned int, nfds, struct compat_timespec __user *, tsp, + unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { sigset_t ksigmask, sigsaved; @@ -1381,7 +1381,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, int ret; if (tsp) { - if (compat_get_timespec64(&ts, tsp)) + if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; diff --git a/fs/timerfd.c b/fs/timerfd.c index d69ad801eb80..803ca070d42e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -561,29 +561,29 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, - const struct compat_itimerspec __user *, utmr, - struct compat_itimerspec __user *, otmr) + const struct old_itimerspec32 __user *, utmr, + struct old_itimerspec32 __user *, otmr) { struct itimerspec64 new, old; int ret; - if (get_compat_itimerspec64(&new, utmr)) + if (get_old_itimerspec32(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; - if (otmr && put_compat_itimerspec64(&old, otmr)) + if (otmr && put_old_itimerspec32(&old, otmr)) return -EFAULT; return ret; } COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, - struct compat_itimerspec __user *, otmr) + struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; - return put_compat_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; + return put_old_itimerspec32(&kotmr, otmr) ? -EFAULT : 0; } #endif diff --git a/fs/utimes.c b/fs/utimes.c index 69d4b6ba1bfb..79a65c163f40 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -245,13 +245,13 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; if (t) { - if (compat_get_timespec64(&tv[0], &t[0]) || - compat_get_timespec64(&tv[1], &t[1])) + if (get_old_timespec32(&tv[0], &t[0]) || + get_old_timespec32(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -261,7 +261,7 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena } static long do_compat_futimesat(unsigned int dfd, const char __user *filename, - struct compat_timeval __user *t) + struct old_timeval32 __user *t) { struct timespec64 tv[2]; @@ -282,12 +282,12 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename, COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, - struct compat_timeval __user *, t) + struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } diff --git a/include/linux/compat.h b/include/linux/compat.h index 1a3c4f37e908..0e058792ecf6 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include /* for HZ */ @@ -116,13 +116,13 @@ struct compat_sel_arg_struct; struct rusage; struct compat_utimbuf { - compat_time_t actime; - compat_time_t modtime; + old_time32_t actime; + old_time32_t modtime; }; struct compat_itimerval { - struct compat_timeval it_interval; - struct compat_timeval it_value; + struct old_timeval32 it_interval; + struct old_timeval32 it_value; }; struct itimerval; @@ -146,7 +146,7 @@ struct compat_timex { compat_long_t constant; compat_long_t precision; compat_long_t tolerance; - struct compat_timeval time; + struct old_timeval32 time; compat_long_t tick; compat_long_t ppsfreq; compat_long_t jitter; @@ -307,8 +307,8 @@ struct compat_rlimit { }; struct compat_rusage { - struct compat_timeval ru_utime; - struct compat_timeval ru_stime; + struct old_timeval32 ru_utime; + struct old_timeval32 ru_stime; compat_long_t ru_maxrss; compat_long_t ru_ixrss; compat_long_t ru_idrss; @@ -457,8 +457,8 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); -static inline int compat_timeval_compare(struct compat_timeval *lhs, - struct compat_timeval *rhs) +static inline int old_timeval32_compare(struct old_timeval32 *lhs, + struct old_timeval32 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; @@ -467,8 +467,8 @@ static inline int compat_timeval_compare(struct compat_timeval *lhs, return lhs->tv_usec - rhs->tv_usec; } -static inline int compat_timespec_compare(struct compat_timespec *lhs, - struct compat_timespec *rhs) +static inline int old_timespec32_compare(struct old_timespec32 *lhs, + struct old_timespec32 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; @@ -552,12 +552,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, struct io_event __user *events, - struct compat_timespec __user *timeout); + struct old_timespec32 __user *timeout); asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, struct io_event __user *events, - struct compat_timespec __user *timeout, + struct old_timespec32 __user *timeout, const struct __compat_aio_sigset __user *usig); /* fs/cookies.c */ @@ -642,11 +642,11 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, + struct old_timespec32 __user *tsp, void __user *sig); asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, - struct compat_timespec __user *tsp, + struct old_timespec32 __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); @@ -671,15 +671,15 @@ asmlinkage long compat_sys_newfstat(unsigned int fd, /* fs/timerfd.c */ asmlinkage long compat_sys_timerfd_gettime(int ufd, - struct compat_itimerspec __user *otmr); + struct old_itimerspec32 __user *otmr); asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, - const struct compat_itimerspec __user *utmr, - struct compat_itimerspec __user *otmr); + const struct old_itimerspec32 __user *utmr, + struct old_itimerspec32 __user *otmr); /* fs/utimes.c */ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, - struct compat_timespec __user *t, + struct old_timespec32 __user *t, int flags); /* kernel/exit.c */ @@ -691,7 +691,7 @@ asmlinkage long compat_sys_waitid(int, compat_pid_t, /* kernel/futex.c */ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, + struct old_timespec32 __user *utime, u32 __user *uaddr2, u32 val3); asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, @@ -701,8 +701,8 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); /* kernel/hrtimer.c */ -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp); +asmlinkage long compat_sys_nanosleep(struct old_timespec32 __user *rqtp, + struct old_timespec32 __user *rmtp); /* kernel/itimer.c */ asmlinkage long compat_sys_getitimer(int which, @@ -722,19 +722,19 @@ asmlinkage long compat_sys_timer_create(clockid_t which_clock, struct compat_sigevent __user *timer_event_spec, timer_t __user *created_timer_id); asmlinkage long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting); + struct old_itimerspec32 __user *setting); asmlinkage long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old); + struct old_itimerspec32 __user *new, + struct old_itimerspec32 __user *old); asmlinkage long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp); + struct old_timespec32 __user *tp); asmlinkage long compat_sys_clock_gettime(clockid_t which_clock, - struct compat_timespec __user *tp); + struct old_timespec32 __user *tp); asmlinkage long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp); + struct old_timespec32 __user *tp); asmlinkage long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp); + struct old_timespec32 __user *rqtp, + struct old_timespec32 __user *rmtp); /* kernel/ptrace.c */ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, @@ -748,7 +748,7 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr); asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, - struct compat_timespec __user *interval); + struct old_timespec32 __user *interval); /* kernel/signal.c */ asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, @@ -768,7 +768,7 @@ asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigtimedwait(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, - struct compat_timespec __user *uts, compat_size_t sigsetsize); + struct old_timespec32 __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); /* No generic prototype for rt_sigreturn */ @@ -782,9 +782,9 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru); /* kernel/time.c */ -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, +asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, +asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); @@ -798,11 +798,11 @@ asmlinkage long compat_sys_mq_open(const char __user *u_name, asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, compat_size_t msg_len, unsigned int msg_prio, - const struct compat_timespec __user *u_abs_timeout); + const struct old_timespec32 __user *u_abs_timeout); asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, compat_size_t msg_len, unsigned int __user *u_msg_prio, - const struct compat_timespec __user *u_abs_timeout); + const struct old_timespec32 __user *u_abs_timeout); asmlinkage long compat_sys_mq_notify(mqd_t mqdes, const struct compat_sigevent __user *u_notification); asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, @@ -819,7 +819,7 @@ asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp, /* ipc/sem.c */ asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg); asmlinkage long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, - unsigned nsems, const struct compat_timespec __user *timeout); + unsigned nsems, const struct old_timespec32 __user *timeout); /* ipc/shm.c */ asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr); @@ -876,7 +876,7 @@ asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, struct compat_siginfo __user *uinfo); asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, - struct compat_timespec __user *timeout); + struct old_timespec32 __user *timeout); asmlinkage long compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, struct compat_rusage __user *ru); @@ -928,7 +928,7 @@ asmlinkage long compat_sys_pwritev64v2(unsigned long fd, asmlinkage long compat_sys_open(const char __user *filename, int flags, umode_t mode); asmlinkage long compat_sys_utimes(const char __user *filename, - struct compat_timeval __user *t); + struct old_timeval32 __user *t); /* __ARCH_WANT_SYSCALL_NO_FLAGS */ asmlinkage long compat_sys_signalfd(int ufd, @@ -942,15 +942,15 @@ asmlinkage long compat_sys_newlstat(const char __user *filename, struct compat_stat __user *statbuf); /* __ARCH_WANT_SYSCALL_DEPRECATED */ -asmlinkage long compat_sys_time(compat_time_t __user *tloc); +asmlinkage long compat_sys_time(old_time32_t __user *tloc); asmlinkage long compat_sys_utime(const char __user *filename, struct compat_utimbuf __user *t); asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, - struct compat_timeval __user *t); + struct old_timeval32 __user *t); asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timeval __user *tvp); + struct old_timeval32 __user *tvp); asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len, unsigned flags); @@ -983,7 +983,7 @@ asmlinkage long compat_sys_sigaction(int sig, #endif /* obsolete: kernel/time/time.c */ -asmlinkage long compat_sys_stime(compat_time_t __user *tptr); +asmlinkage long compat_sys_stime(old_time32_t __user *tptr); /* obsolete: net/socket.c */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); @@ -1002,15 +1002,15 @@ static inline bool in_compat_syscall(void) { return is_compat_task(); } #endif /** - * ns_to_compat_timeval - Compat version of ns_to_timeval + * ns_to_old_timeval32 - Compat version of ns_to_timeval * @nsec: the nanoseconds value to be converted * - * Returns the compat_timeval representation of the nsec parameter. + * Returns the old_timeval32 representation of the nsec parameter. */ -static inline struct compat_timeval ns_to_compat_timeval(s64 nsec) +static inline struct old_timeval32 ns_to_old_timeval32(s64 nsec) { struct timeval tv; - struct compat_timeval ctv; + struct old_timeval32 ctv; tv = ns_to_timeval(nsec); ctv.tv_sec = tv.tv_sec; diff --git a/include/linux/compat_time.h b/include/linux/compat_time.h deleted file mode 100644 index e70bfd1d2c3f..000000000000 --- a/include/linux/compat_time.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_COMPAT_TIME_H -#define _LINUX_COMPAT_TIME_H - -#include -#include - -typedef s32 compat_time_t; - -struct compat_timespec { - compat_time_t tv_sec; - s32 tv_nsec; -}; - -struct compat_timeval { - compat_time_t tv_sec; - s32 tv_usec; -}; - -struct compat_itimerspec { - struct compat_timespec it_interval; - struct compat_timespec it_value; -}; - -extern int compat_get_timespec64(struct timespec64 *, const void __user *); -extern int compat_put_timespec64(const struct timespec64 *, void __user *); -extern int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits); -extern int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits); - -#endif /* _LINUX_COMPAT_TIME_H */ diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h index b5f2efdd05e0..7a37f4ce9fd2 100644 --- a/include/linux/elfcore-compat.h +++ b/include/linux/elfcore-compat.h @@ -27,10 +27,10 @@ struct compat_elf_prstatus compat_pid_t pr_ppid; compat_pid_t pr_pgrp; compat_pid_t pr_sid; - struct compat_timeval pr_utime; - struct compat_timeval pr_stime; - struct compat_timeval pr_cutime; - struct compat_timeval pr_cstime; + struct old_timeval32 pr_utime; + struct old_timeval32 pr_stime; + struct old_timeval32 pr_cutime; + struct old_timeval32 pr_cstime; compat_elf_gregset_t pr_reg; #ifdef CONFIG_BINFMT_ELF_FDPIC compat_ulong_t pr_exec_fdpic_loadmap; diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 5d83d0c1d06c..bba2920e9c05 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -10,7 +10,7 @@ #include struct timespec; -struct compat_timespec; +struct old_timespec32; struct pollfd; enum timespec_type { @@ -40,7 +40,7 @@ struct restart_block { enum timespec_type type; union { struct __kernel_timespec __user *rmtp; - struct compat_timespec __user *compat_rmtp; + struct old_timespec32 __user *compat_rmtp; }; u64 expires; } nanosleep; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2ff814c92f7f..b3e27e5ee322 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -60,7 +60,7 @@ struct tms; struct utimbuf; struct mq_attr; struct compat_stat; -struct compat_timeval; +struct old_timeval32; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; diff --git a/include/linux/time32.h b/include/linux/time32.h index 92c0ca092d93..0e0d7304d1a8 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -13,6 +13,31 @@ #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +typedef s32 old_time32_t; + +struct old_timespec32 { + old_time32_t tv_sec; + s32 tv_nsec; +}; + +struct old_timeval32 { + old_time32_t tv_sec; + s32 tv_usec; +}; + +struct old_itimerspec32 { + struct old_timespec32 it_interval; + struct old_timespec32 it_value; +}; + +extern int get_old_timespec32(struct timespec64 *, const void __user *); +extern int put_old_timespec32(const struct timespec64 *, void __user *); +extern int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits); +extern int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits); + + #if __BITS_PER_LONG == 64 /* timespec64 is defined as timespec here */ @@ -183,18 +208,17 @@ extern struct timeval ns_to_timeval(const s64 nsec); extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); /* - * New aliases for compat time functions. These will be used to replace - * the compat code so it can be shared between 32-bit and 64-bit builds - * both of which provide compatibility with old 32-bit tasks. + * Old names for the 32-bit time_t interfaces, these will be removed + * when everything uses the new names. */ -#define old_time32_t compat_time_t -#define old_timeval32 compat_timeval -#define old_timespec32 compat_timespec -#define old_itimerspec32 compat_itimerspec -#define ns_to_old_timeval32 ns_to_compat_timeval -#define get_old_itimerspec32 get_compat_itimerspec64 -#define put_old_itimerspec32 put_compat_itimerspec64 -#define get_old_timespec32 compat_get_timespec64 -#define put_old_timespec32 compat_put_timespec64 +#define compat_time_t old_time32_t +#define compat_timeval old_timeval32 +#define compat_timespec old_timespec32 +#define compat_itimerspec old_itimerspec32 +#define ns_to_compat_timeval ns_to_old_timeval32 +#define get_compat_itimerspec64 get_old_itimerspec32 +#define put_compat_itimerspec64 put_old_itimerspec32 +#define compat_get_timespec64 get_old_timespec32 +#define compat_put_timespec64 put_old_timespec32 #endif diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c0d58f390c3b..db7833370351 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1461,10 +1461,10 @@ COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, #endif #ifdef CONFIG_COMPAT_32BIT_TIME -static int compat_prepare_timeout(const struct compat_timespec __user *p, +static int compat_prepare_timeout(const struct old_timespec32 __user *p, struct timespec64 *ts) { - if (compat_get_timespec64(ts, p)) + if (get_old_timespec32(ts, p)) return -EFAULT; if (!timespec64_valid(ts)) return -EINVAL; @@ -1474,7 +1474,7 @@ static int compat_prepare_timeout(const struct compat_timespec __user *p, COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, compat_size_t, msg_len, unsigned int, msg_prio, - const struct compat_timespec __user *, u_abs_timeout) + const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { @@ -1489,7 +1489,7 @@ COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, compat_size_t, msg_len, unsigned int __user *, u_msg_prio, - const struct compat_timespec __user *, u_abs_timeout) + const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { diff --git a/ipc/msg.c b/ipc/msg.c index 883642cf2b27..0833c6405915 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -622,9 +622,9 @@ struct compat_msqid_ds { struct compat_ipc_perm msg_perm; compat_uptr_t msg_first; compat_uptr_t msg_last; - compat_time_t msg_stime; - compat_time_t msg_rtime; - compat_time_t msg_ctime; + old_time32_t msg_stime; + old_time32_t msg_rtime; + old_time32_t msg_ctime; compat_ulong_t msg_lcbytes; compat_ulong_t msg_lqbytes; unsigned short msg_cbytes; diff --git a/ipc/sem.c b/ipc/sem.c index 26f8e37fcdcb..745dc6187e84 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1698,8 +1698,8 @@ SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) struct compat_semid_ds { struct compat_ipc_perm sem_perm; - compat_time_t sem_otime; - compat_time_t sem_ctime; + old_time32_t sem_otime; + old_time32_t sem_ctime; compat_uptr_t sem_base; compat_uptr_t sem_pending; compat_uptr_t sem_pending_last; @@ -2214,11 +2214,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, #ifdef CONFIG_COMPAT_32BIT_TIME long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, - const struct compat_timespec __user *timeout) + const struct old_timespec32 __user *timeout) { if (timeout) { struct timespec64 ts; - if (compat_get_timespec64(&ts, timeout)) + if (get_old_timespec32(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsems, nsops, &ts); } @@ -2227,7 +2227,7 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems, unsigned int, nsops, - const struct compat_timespec __user *, timeout) + const struct old_timespec32 __user *, timeout) { return compat_ksys_semtimedop(semid, tsems, nsops, timeout); } diff --git a/ipc/shm.c b/ipc/shm.c index b0eb3757ab89..2657692199eb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1201,9 +1201,9 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) struct compat_shmid_ds { struct compat_ipc_perm shm_perm; int shm_segsz; - compat_time_t shm_atime; - compat_time_t shm_dtime; - compat_time_t shm_ctime; + old_time32_t shm_atime; + old_time32_t shm_dtime; + old_time32_t shm_ctime; compat_ipc_pid_t shm_cpid; compat_ipc_pid_t shm_lpid; unsigned short shm_nattch; diff --git a/ipc/syscall.c b/ipc/syscall.c index 65d405f1ba0c..1ac06e3983c0 100644 --- a/ipc/syscall.c +++ b/ipc/syscall.c @@ -35,7 +35,7 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, (const struct __kernel_timespec __user *)fifth); else if (IS_ENABLED(CONFIG_COMPAT_32BIT_TIME)) return compat_ksys_semtimedop(first, ptr, second, - (const struct compat_timespec __user *)fifth); + (const struct old_timespec32 __user *)fifth); else return -ENOSYS; diff --git a/ipc/util.h b/ipc/util.h index 0a159f69b3bb..1ee81bce25e9 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -266,7 +266,7 @@ long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); /* for CONFIG_ARCH_WANT_OLD_COMPAT_IPC */ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, - const struct compat_timespec __user *timeout); + const struct old_timespec32 __user *timeout); #ifdef CONFIG_COMPAT long compat_ksys_semctl(int semid, int semnum, int cmd, int arg); long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr); diff --git a/kernel/compat.c b/kernel/compat.c index 8e40efc2928a..089d00d0da9c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -93,28 +93,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83f830acbb5f..410a77a8f6e2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -173,7 +173,7 @@ err_unlock: } COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct compat_timespec __user *, utime, u32 __user *, uaddr2, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec ts; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..8287b75ed961 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5257,13 +5257,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, - struct compat_timespec __user *, interval) + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif diff --git a/kernel/signal.c b/kernel/signal.c index 5843c541fda9..a4db724e14c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3173,7 +3173,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) + struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec t; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e1a549c9e399..9cdd74bd2d27 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1660,7 +1660,7 @@ int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: - if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) + if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif @@ -1780,12 +1780,12 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; - if (compat_get_timespec64(&tu, rqtp)) + if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2c6847d5d69b..989ccf028bde 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -162,20 +162,20 @@ COMPAT_SYS_NI(setitimer); #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec64(&new_tp, tp)) + if (get_old_timespec32(&new_tp, tp)) return -EFAULT; return do_sys_settimeofday64(&new_tp, NULL); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -184,13 +184,13 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, if (ret) return ret; - if (compat_put_timespec64(&kernel_tp, tp)) + if (put_old_timespec32(&kernel_tp, tp)) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -201,7 +201,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec64(&rtn_tp, tp)) + if (put_old_timespec32(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -210,8 +210,8 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; @@ -224,7 +224,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4b9127e95430..3e71921668ba 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -755,13 +755,13 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - if (put_compat_itimerspec64(&cur_setting, setting)) + if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -928,8 +928,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -937,12 +937,12 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!new) return -EINVAL; - if (get_compat_itimerspec64(&new_spec, new)) + if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - if (put_compat_itimerspec64(&old_spec, old)) + if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; @@ -1115,7 +1115,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,14 +1123,14 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec64(&ts, tp)) + if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1141,7 +1141,7 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, err = kc->clock_get(which_clock, &ts); - if (!err && compat_put_timespec64(&ts, tp)) + if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; @@ -1180,7 +1180,7 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1190,7 +1190,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return -EINVAL; err = kc->clock_getres(which_clock, &ts); - if (!err && tp && compat_put_timespec64(&ts, tp)) + if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; @@ -1237,8 +1237,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1248,7 +1248,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc->nsleep) return -EOPNOTSUPP; - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) diff --git a/kernel/time/time.c b/kernel/time/time.c index de332250d6fa..f1983f468fe3 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -104,12 +104,12 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_TIME -/* compat_time_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +/* old_time32_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) { - compat_time_t i; + old_time32_t i; - i = (compat_time_t)ktime_get_real_seconds(); + i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -119,7 +119,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -225,7 +225,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { @@ -244,7 +244,7 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, return 0; } -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; @@ -863,10 +863,10 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) +int __get_old_timespec32(struct timespec64 *ts64, + const struct old_timespec32 __user *cts) { - struct compat_timespec ts; + struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); @@ -879,33 +879,33 @@ int __compat_get_timespec64(struct timespec64 *ts64, return 0; } -int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) +int __put_old_timespec32(const struct timespec64 *ts64, + struct old_timespec32 __user *cts) { - struct compat_timespec ts = { + struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_get_timespec64(ts, uts); + return __get_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_get_timespec64); +EXPORT_SYMBOL_GPL(get_old_timespec32); -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_put_timespec64(ts, uts); + return __put_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_put_timespec64); +EXPORT_SYMBOL_GPL(put_old_timespec32); int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -937,23 +937,23 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); -int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits) +int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits) { - if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || - __compat_get_timespec64(&its->it_value, &uits->it_value)) + if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || + __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(get_compat_itimerspec64); +EXPORT_SYMBOL_GPL(get_old_itimerspec32); -int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits) +int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits) { - if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || - __compat_put_timespec64(&its->it_value, &uits->it_value)) + if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || + __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(put_compat_itimerspec64); +EXPORT_SYMBOL_GPL(put_old_itimerspec32); diff --git a/net/compat.c b/net/compat.c index 3b2105f6549d..2ef160876bc1 100644 --- a/net/compat.c +++ b/net/compat.c @@ -812,7 +812,7 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, - struct compat_timespec __user *timeout) + struct old_timespec32 __user *timeout) { int datagrams; struct timespec ktspec; @@ -834,7 +834,7 @@ static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, - struct compat_timespec __user *, timeout) + struct old_timespec32 __user *, timeout) { return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); } -- cgit From 743f5cdb6cec33d2300922f6b1b1670a572595ad Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 29 Aug 2018 20:50:52 +0800 Subject: y2038: __get_old_timespec32() can be static The kbuild test robot reports two new warnings with the previous patch: kernel/time/time.c:866:5: sparse: symbol '__get_old_timespec32' was not declared. Should it be static? kernel/time/time.c:882:5: sparse: symbol '__put_old_timespec32' was not declared. Should it be static? These are actually older bugs, but came up now after the symbol got renamed. Fortunately, commit afef05cf238c ("time: Enable get/put_compat_itimerspec64 always") makes the two functions (__compat_get_timespec64/__compat_get_timespec64) local to time.c already, so we can mark them as 'static'. Fixes: ee16c8f415e4 ("y2038: Globally rename compat_time to old_time32") Signed-off-by: kbuild test robot [arnd: added changelog text] Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index f1983f468fe3..e3a7f7fd3abc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -863,7 +863,7 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __get_old_timespec32(struct timespec64 *ts64, +static int __get_old_timespec32(struct timespec64 *ts64, const struct old_timespec32 __user *cts) { struct old_timespec32 ts; @@ -879,7 +879,7 @@ int __get_old_timespec32(struct timespec64 *ts64, return 0; } -int __put_old_timespec32(const struct timespec64 *ts64, +static int __put_old_timespec32(const struct timespec64 *ts64, struct old_timespec32 __user *cts) { struct old_timespec32 ts = { -- cgit From 474b9c777b20b8340a6ee0f7ba6ebbd6a4bf47e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Apr 2018 21:59:47 +0200 Subject: y2038: sched: Change sched_rr_get_interval to use __kernel_timespec This is a preparation patch for converting sys_sched_rr_get_interval to work with 64-bit time_t on 32-bit architectures. The 'interval' argument is changed to struct __kernel_timespec, which will be redefined using 64-bit time_t in the future. The compat version of the system call in turn is enabled for compilation with CONFIG_COMPAT_32BIT_TIME so the individual 32-bit architectures can share the handling of the traditional argument with 64-bit architectures providing it for their compat mode. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 2 +- kernel/sched/core.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 95e795fb0593..5642016a312d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -614,7 +614,7 @@ asmlinkage long sys_sched_yield(void); asmlinkage long sys_sched_get_priority_max(int policy); asmlinkage long sys_sched_get_priority_min(int policy); asmlinkage long sys_sched_rr_get_interval(pid_t pid, - struct timespec __user *interval); + struct __kernel_timespec __user *interval); /* kernel/signal.c */ asmlinkage long sys_restart_syscall(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8287b75ed961..39af2bec2b39 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5243,7 +5243,7 @@ out_unlock: * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5254,7 +5254,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return retval; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, struct old_timespec32 __user *, interval) -- cgit From 49c39f8464a9af702e9d45800c00a572753aeb06 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Apr 2018 15:56:13 +0200 Subject: y2038: signal: Change rt_sigtimedwait to use __kernel_timespec This changes sys_rt_sigtimedwait() to use get_timespec64(), changing the timeout type to __kernel_timespec, which will be changed to use a 64-bit time_t in the future. Since the do_sigtimedwait() core function changes, we also have to modify the compat version of this system call in the same way. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 2 +- kernel/signal.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fb3a05fab8aa..2ac3d13a915b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -635,7 +635,7 @@ asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set, asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize); asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, - const struct timespec __user *uts, + const struct __kernel_timespec __user *uts, size_t sigsetsize); asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo); diff --git a/kernel/signal.c b/kernel/signal.c index a4db724e14c1..0831d56a731a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3082,7 +3082,7 @@ int copy_siginfo_from_user32(struct siginfo *to, * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) + const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; @@ -3090,9 +3090,9 @@ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, int sig, ret = 0; if (ts) { - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; - timeout = timespec_to_ktime(*ts); + timeout = timespec64_to_ktime(*ts); to = &timeout; } @@ -3140,11 +3140,12 @@ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, + siginfo_t __user *, uinfo, + const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; - struct timespec ts; + struct timespec64 ts; siginfo_t info; int ret; @@ -3156,7 +3157,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) + if (get_timespec64(&ts, uts)) return -EFAULT; } @@ -3176,7 +3177,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; - struct timespec t; + struct timespec64 t; siginfo_t info; long ret; @@ -3187,7 +3188,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (compat_get_timespec(&t, uts)) + if (get_old_timespec32(&t, uts)) return -EFAULT; } -- cgit