aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c12
-rw-r--r--kernel/audit.h3
-rw-r--r--kernel/auditsc.c40
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c10
-rw-r--r--kernel/bpf/bpf_lru_list.c20
-rw-r--r--kernel/bpf/core.c244
-rw-r--r--kernel/bpf/hashtab.c8
-rw-r--r--kernel/bpf/helpers.c4
-rw-r--r--kernel/bpf/inode.c17
-rw-r--r--kernel/bpf/lpm_trie.c521
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/bpf/syscall.c21
-rw-r--r--kernel/bpf/verifier.c294
-rw-r--r--kernel/configs/android-recommended.config2
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/extable.c6
-rw-r--r--kernel/kallsyms.c61
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kmod.c18
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/module.c42
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/internal.h79
-rw-r--r--kernel/printk/printk.c232
-rw-r--r--kernel/printk/printk_safe.c (renamed from kernel/printk/nmi.c)234
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/seccomp.c32
-rw-r--r--kernel/trace/blktrace.c78
-rw-r--r--kernel/trace/bpf_trace.c82
-rw-r--r--kernel/trace/trace_output.c18
37 files changed, 1618 insertions, 516 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 6e399bb69d7c..e794544f5e63 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -121,7 +121,7 @@ u32 audit_sig_sid = 0;
3) suppressed due to audit_rate_limit
4) suppressed due to audit_backlog_limit
*/
-static atomic_t audit_lost = ATOMIC_INIT(0);
+static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
@@ -1058,6 +1058,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
return err;
}
+ if (s.mask == AUDIT_STATUS_LOST) {
+ u32 lost = atomic_xchg(&audit_lost, 0);
+
+ audit_log_config_change("lost", 0, lost, 1);
+ return lost;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -1349,7 +1355,9 @@ static int __init audit_init(void)
panic("audit: failed to start the kauditd thread (%d)\n", err);
}
- audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
+ "state=initialized audit_enabled=%u res=1",
+ audit_enabled);
return 0;
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 960d49c9db5e..ca579880303a 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -199,6 +199,9 @@ struct audit_context {
struct {
int argc;
} execve;
+ struct {
+ char *name;
+ } module;
};
int fds[2];
struct audit_proctitle proctitle;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf1fa43512c1..d6a8de5f8fa3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1221,7 +1221,7 @@ static void show_special(struct audit_context *context, int *call_panic)
context->ipc.perm_mode);
}
break; }
- case AUDIT_MQ_OPEN: {
+ case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
@@ -1230,8 +1230,8 @@ static void show_special(struct audit_context *context, int *call_panic)
context->mq_open.attr.mq_maxmsg,
context->mq_open.attr.mq_msgsize,
context->mq_open.attr.mq_curmsgs);
- break; }
- case AUDIT_MQ_SENDRECV: {
+ break;
+ case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
@@ -1240,12 +1240,12 @@ static void show_special(struct audit_context *context, int *call_panic)
context->mq_sendrecv.msg_prio,
context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
- break; }
- case AUDIT_MQ_NOTIFY: {
+ break;
+ case AUDIT_MQ_NOTIFY:
audit_log_format(ab, "mqdes=%d sigev_signo=%d",
context->mq_notify.mqdes,
context->mq_notify.sigev_signo);
- break; }
+ break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
audit_log_format(ab,
@@ -1255,19 +1255,24 @@ static void show_special(struct audit_context *context, int *call_panic)
attr->mq_flags, attr->mq_maxmsg,
attr->mq_msgsize, attr->mq_curmsgs);
break; }
- case AUDIT_CAPSET: {
+ case AUDIT_CAPSET:
audit_log_format(ab, "pid=%d", context->capset.pid);
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
- break; }
- case AUDIT_MMAP: {
+ break;
+ case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
- break; }
- case AUDIT_EXECVE: {
+ break;
+ case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
- break; }
+ break;
+ case AUDIT_KERN_MODULE:
+ audit_log_format(ab, "name=");
+ audit_log_untrustedstring(ab, context->module.name);
+ kfree(context->module.name);
+ break;
}
audit_log_end(ab);
}
@@ -2368,6 +2373,15 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
+void __audit_log_kern_module(char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+ strcpy(context->module.name, name);
+ context->type = AUDIT_KERN_MODULE;
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
@@ -2411,7 +2425,7 @@ void audit_core_dumps(long signr)
if (unlikely(!ab))
return;
audit_log_task(ab);
- audit_log_format(ab, " sig=%ld", signr);
+ audit_log_format(ab, " sig=%ld res=1", signr);
audit_log_end(ab);
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474ac3cd..e1ce4f4fd7fd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d55d95dcf49..6b6f41f0b211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list array_type __read_mostly = {
+static struct bpf_map_type_list array_type __ro_after_init = {
.ops = &array_ops,
.type = BPF_MAP_TYPE_ARRAY,
};
@@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list percpu_array_type __read_mostly = {
+static struct bpf_map_type_list percpu_array_type __ro_after_init = {
.ops = &percpu_array_ops,
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};
@@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr,
};
-static struct bpf_map_type_list prog_array_type __read_mostly = {
+static struct bpf_map_type_list prog_array_type __ro_after_init = {
.ops = &prog_array_ops,
.type = BPF_MAP_TYPE_PROG_ARRAY,
};
@@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = {
.map_release = perf_event_fd_array_release,
};
-static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
.ops = &perf_event_array_ops,
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
@@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = {
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
-static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+static struct bpf_map_type_list cgroup_array_type __ro_after_init = {
.ops = &cgroup_array_ops,
.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 89b7ef41c86b..f62d1d56f41d 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
enum bpf_lru_list_type tgt_free_type)
{
struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
- struct bpf_lru_node *node, *tmp_node, *first_node;
+ struct bpf_lru_node *node, *tmp_node;
unsigned int nshrinked = 0;
unsigned int i = 0;
- first_node = list_first_entry(inactive, struct bpf_lru_node, list);
list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
if (bpf_lru_node_is_ref(node)) {
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
@@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru,
list_add(&node->list, local_pending_list(loc_l));
}
-struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node;
@@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
return node;
}
-struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru,
- struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node;
bool force = false;
@@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
bpf_common_lru_push_free(lru, node);
}
-void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
- u32 elem_size, u32 nr_elems)
+static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
+ u32 node_offset, u32 elem_size,
+ u32 nr_elems)
{
struct bpf_lru_list *l = &lru->common_lru.lru_list;
u32 i;
@@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
}
}
-void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
- u32 elem_size, u32 nr_elems)
+static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
+ u32 node_offset, u32 elem_size,
+ u32 nr_elems)
{
u32 i, pcpu_entries;
int cpu;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 503d4211988a..f45827e205d3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -28,6 +28,9 @@
#include <linux/moduleloader.h>
#include <linux/bpf.h>
#include <linux/frame.h>
+#include <linux/rbtree_latch.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
#include <asm/unaligned.h>
@@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
fp->aux = aux;
fp->aux->prog = fp;
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
+
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
@@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
}
#ifdef CONFIG_BPF_JIT
+static __always_inline void
+bpf_get_prog_addr_region(const struct bpf_prog *prog,
+ unsigned long *symbol_start,
+ unsigned long *symbol_end)
+{
+ const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
+ unsigned long addr = (unsigned long)hdr;
+
+ WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
+
+ *symbol_start = addr;
+ *symbol_end = addr + hdr->pages * PAGE_SIZE;
+}
+
+static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+ BUILD_BUG_ON(sizeof("bpf_prog_") +
+ sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+
+ sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
+ sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
+ *sym = 0;
+}
+
+static __always_inline unsigned long
+bpf_get_prog_addr_start(struct latch_tree_node *n)
+{
+ unsigned long symbol_start, symbol_end;
+ const struct bpf_prog_aux *aux;
+
+ aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+ return symbol_start;
+}
+
+static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
+}
+
+static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long symbol_start, symbol_end;
+ const struct bpf_prog_aux *aux;
+
+ aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+ if (val < symbol_start)
+ return -1;
+ if (val >= symbol_end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops bpf_tree_ops = {
+ .less = bpf_tree_less,
+ .comp = bpf_tree_comp,
+};
+
+static DEFINE_SPINLOCK(bpf_lock);
+static LIST_HEAD(bpf_kallsyms);
+static struct latch_tree_root bpf_tree __cacheline_aligned;
+
+int bpf_jit_kallsyms __read_mostly;
+
+static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
+{
+ WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
+ list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
+ latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+}
+
+static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
+{
+ if (list_empty(&aux->ksym_lnode))
+ return;
+
+ latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+ list_del_rcu(&aux->ksym_lnode);
+}
+
+static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
+{
+ return fp->jited && !bpf_prog_was_classic(fp);
+}
+
+static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
+{
+ return list_empty(&fp->aux->ksym_lnode) ||
+ fp->aux->ksym_lnode.prev == LIST_POISON2;
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+ unsigned long flags;
+
+ if (!bpf_prog_kallsyms_candidate(fp) ||
+ !capable(CAP_SYS_ADMIN))
+ return;
+
+ spin_lock_irqsave(&bpf_lock, flags);
+ bpf_prog_ksym_node_add(fp->aux);
+ spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+ unsigned long flags;
+
+ if (!bpf_prog_kallsyms_candidate(fp))
+ return;
+
+ spin_lock_irqsave(&bpf_lock, flags);
+ bpf_prog_ksym_node_del(fp->aux);
+ spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
+{
+ struct latch_tree_node *n;
+
+ if (!bpf_jit_kallsyms_enabled())
+ return NULL;
+
+ n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
+ return n ?
+ container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
+ NULL;
+}
+
+const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
+{
+ unsigned long symbol_start, symbol_end;
+ struct bpf_prog *prog;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ prog = bpf_prog_kallsyms_find(addr);
+ if (prog) {
+ bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
+ bpf_get_prog_name(prog, sym);
+
+ ret = sym;
+ if (size)
+ *size = symbol_end - symbol_start;
+ if (off)
+ *off = addr - symbol_start;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+bool is_bpf_text_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = bpf_prog_kallsyms_find(addr) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+ unsigned long symbol_start, symbol_end;
+ struct bpf_prog_aux *aux;
+ unsigned int it = 0;
+ int ret = -ERANGE;
+
+ if (!bpf_jit_kallsyms_enabled())
+ return ret;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
+ if (it++ != symnum)
+ continue;
+
+ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+ bpf_get_prog_name(aux->prog, sym);
+
+ *value = symbol_start;
+ *type = BPF_SYM_ELF_TYPE;
+
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
@@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
module_memfree(hdr);
}
+/* This symbol is only overridden by archs that have different
+ * requirements than the usual eBPF JITs, f.e. when they only
+ * implement cBPF JIT, do not set images read-only, etc.
+ */
+void __weak bpf_jit_free(struct bpf_prog *fp)
+{
+ if (fp->jited) {
+ struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+ bpf_jit_binary_unlock_ro(hdr);
+ bpf_jit_binary_free(hdr);
+
+ WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
+ }
+
+ bpf_prog_unlock_free(fp);
+}
+
int bpf_jit_harden __read_mostly;
static int bpf_jit_blind_insn(const struct bpf_insn *from,
@@ -1154,12 +1377,22 @@ const struct bpf_func_proto bpf_tail_call_proto = {
.arg3_type = ARG_ANYTHING,
};
-/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
+ * It is encouraged to implement bpf_int_jit_compile() instead, so that
+ * eBPF and implicitly also cBPF can get JITed!
+ */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
return prog;
}
+/* Stub for JITs that support eBPF. All cBPF code gets transformed into
+ * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
+ */
+void __weak bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
bool __weak bpf_helper_changes_pkt_data(void *func)
{
return false;
@@ -1173,3 +1406,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
{
return -EFAULT;
}
+
+/* All definitions of tracepoints related to BPF. */
+#define CREATE_TRACE_POINTS
+#include <linux/bpf_trace.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a753bbe7df0a..3ea87fb19a94 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1023,7 +1023,7 @@ static const struct bpf_map_ops htab_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list htab_type __read_mostly = {
+static struct bpf_map_type_list htab_type __ro_after_init = {
.ops = &htab_ops,
.type = BPF_MAP_TYPE_HASH,
};
@@ -1037,7 +1037,7 @@ static const struct bpf_map_ops htab_lru_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_type __read_mostly = {
+static struct bpf_map_type_list htab_lru_type __ro_after_init = {
.ops = &htab_lru_ops,
.type = BPF_MAP_TYPE_LRU_HASH,
};
@@ -1124,7 +1124,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
.ops = &htab_percpu_ops,
.type = BPF_MAP_TYPE_PERCPU_HASH,
};
@@ -1138,7 +1138,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
.ops = &htab_lru_percpu_ops,
.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 045cbe673356..3d24e238221e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.func = bpf_get_current_comm,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_RAW_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0b030c9126d3..fddcae801724 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,7 @@
#include <linux/parser.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
enum bpf_type {
BPF_TYPE_UNSPEC = 0,
@@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
ret = bpf_obj_do_pin(pname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
+ if ((trace_bpf_obj_pin_prog_enabled() ||
+ trace_bpf_obj_pin_map_enabled()) && !ret) {
+ if (type == BPF_TYPE_PROG)
+ trace_bpf_obj_pin_prog(raw, ufd, pname);
+ if (type == BPF_TYPE_MAP)
+ trace_bpf_obj_pin_map(raw, ufd, pname);
+ }
out:
putname(pname);
return ret;
@@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname)
else
goto out;
- if (ret < 0)
+ if (ret < 0) {
bpf_any_put(raw, type);
+ } else if (trace_bpf_obj_get_prog_enabled() ||
+ trace_bpf_obj_get_map_enabled()) {
+ if (type == BPF_TYPE_PROG)
+ trace_bpf_obj_get_prog(raw, ret, pname);
+ if (type == BPF_TYPE_MAP)
+ trace_bpf_obj_get_map(raw, ret, pname);
+ }
out:
putname(pname);
return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 000000000000..8bfe0afaee10
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,521 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016,2017 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+ struct rcu_head rcu;
+ struct lpm_trie_node __rcu *child[2];
+ u32 prefixlen;
+ u32 flags;
+ u8 data[0];
+};
+
+struct lpm_trie {
+ struct bpf_map map;
+ struct lpm_trie_node __rcu *root;
+ size_t n_entries;
+ size_t max_prefixlen;
+ size_t data_size;
+ raw_spinlock_t lock;
+};
+
+/* This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be places as root
+ * node, denoted as (R) in the example below. As there are no other node, both
+ * child pointers are %NULL.
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In child index depends on the next bit
+ * that is outside of what (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * |
+ * +----------------+
+ * | (2) |
+ * | 192.168.0.0/24 |
+ * | value: 2 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (2) | | (3) |
+ * | 192.168.0.0/24 | | 192.168.128.0/24 |
+ * | value: 2 | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because (4) of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differentiate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (4) (I) | | (3) |
+ * | 192.168.0.0/23 | | 192.168.128.0/24 |
+ * | value: --- | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ * | |
+ * +----------------+ +----------------+
+ * | (2) | | (5) |
+ * | 192.168.0.0/24 | | 192.168.1.0/24 |
+ * | value: 2 | | value: 5 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+ return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie: The trie to get internal sizes from
+ * @node: The node to operate on
+ * @key: The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key *key)
+{
+ size_t prefixlen = 0;
+ size_t i;
+
+ for (i = 0; i < trie->data_size; i++) {
+ size_t b;
+
+ b = 8 - fls(node->data[i] ^ key->data[i]);
+ prefixlen += b;
+
+ if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+ return min(node->prefixlen, key->prefixlen);
+
+ if (b < 8)
+ break;
+ }
+
+ return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *found = NULL;
+ struct bpf_lpm_trie_key *key = _key;
+
+ /* Start walking the trie from the root node ... */
+
+ for (node = rcu_dereference(trie->root); node;) {
+ unsigned int next_bit;
+ size_t matchlen;
+
+ /* Determine the longest prefix of @node that matches @key.
+ * If it's the maximum possible prefix for this trie, we have
+ * an exact match and can return it directly.
+ */
+ matchlen = longest_prefix_match(trie, node, key);
+ if (matchlen == trie->max_prefixlen) {
+ found = node;
+ break;
+ }
+
+ /* If the number of bits that match is smaller than the prefix
+ * length of @node, bail out and return the node we have seen
+ * last in the traversal (ie, the parent).
+ */
+ if (matchlen < node->prefixlen)
+ break;
+
+ /* Consider this node as return candidate unless it is an
+ * artificially added intermediate one.
+ */
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ found = node;
+
+ /* If the node match is fully satisfied, let's see if we can
+ * become more specific. Determine the next bit in the key and
+ * traverse down.
+ */
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference(node->child[next_bit]);
+ }
+
+ if (!found)
+ return NULL;
+
+ return found->data + trie->data_size;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+ const void *value)
+{
+ struct lpm_trie_node *node;
+ size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+
+ if (value)
+ size += trie->map.value_size;
+
+ node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return NULL;
+
+ node->flags = 0;
+
+ if (value)
+ memcpy(node->data + trie->data_size, value,
+ trie->map.value_size);
+
+ return node;
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node __rcu **slot;
+ struct bpf_lpm_trie_key *key = _key;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Allocate and fill a new node */
+
+ if (trie->n_entries == trie->map.max_entries) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ new_node = lpm_trie_node_alloc(trie, value);
+ if (!new_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ trie->n_entries++;
+
+ new_node->prefixlen = key->prefixlen;
+ RCU_INIT_POINTER(new_node->child[0], NULL);
+ RCU_INIT_POINTER(new_node->child[1], NULL);
+ memcpy(new_node->data, key->data, trie->data_size);
+
+ /* Now find a slot to attach the new node. To do that, walk the tree
+ * from the root and match as many bits as possible for each node until
+ * we either find an empty slot or a slot that needs to be replaced by
+ * an intermediate node.
+ */
+ slot = &trie->root;
+
+ while ((node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen ||
+ node->prefixlen == trie->max_prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ slot = &node->child[next_bit];
+ }
+
+ /* If the slot is empty (a free child pointer or an empty root),
+ * simply assign the @new_node to that slot and be done.
+ */
+ if (!node) {
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ /* If the slot we picked already exists, replace it with @new_node
+ * which already has the correct data array set.
+ */
+ if (node->prefixlen == matchlen) {
+ new_node->child[0] = node->child[0];
+ new_node->child[1] = node->child[1];
+
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ trie->n_entries--;
+
+ rcu_assign_pointer(*slot, new_node);
+ kfree_rcu(node, rcu);
+
+ goto out;
+ }
+
+ /* If the new node matches the prefix completely, it must be inserted
+ * as an ancestor. Simply insert it between @node and *@slot.
+ */
+ if (matchlen == key->prefixlen) {
+ next_bit = extract_bit(node->data, matchlen);
+ rcu_assign_pointer(new_node->child[next_bit], node);
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ im_node = lpm_trie_node_alloc(trie, NULL);
+ if (!im_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ im_node->prefixlen = matchlen;
+ im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+ memcpy(im_node->data, node->data, trie->data_size);
+
+ /* Now determine which child to install in which slot */
+ if (extract_bit(key->data, matchlen)) {
+ rcu_assign_pointer(im_node->child[0], node);
+ rcu_assign_pointer(im_node->child[1], new_node);
+ } else {
+ rcu_assign_pointer(im_node->child[0], new_node);
+ rcu_assign_pointer(im_node->child[1], node);
+ }
+
+ /* Finally, assign the intermediate node to the determined spot */
+ rcu_assign_pointer(*slot, im_node);
+
+out:
+ if (ret) {
+ if (new_node)
+ trie->n_entries--;
+
+ kfree(new_node);
+ kfree(im_node);
+ }
+
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
+}
+
+static int trie_delete_elem(struct bpf_map *map, void *key)
+{
+ /* TODO */
+ return -ENOSYS;
+}
+
+#define LPM_DATA_SIZE_MAX 256
+#define LPM_DATA_SIZE_MIN 1
+
+#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
+ sizeof(struct lpm_trie_node))
+#define LPM_VAL_SIZE_MIN 1
+
+#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
+#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+ struct lpm_trie *trie;
+ u64 cost = sizeof(*trie), cost_per_node;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 ||
+ attr->map_flags != BPF_F_NO_PREALLOC ||
+ attr->key_size < LPM_KEY_SIZE_MIN ||
+ attr->key_size > LPM_KEY_SIZE_MAX ||
+ attr->value_size < LPM_VAL_SIZE_MIN ||
+ attr->value_size > LPM_VAL_SIZE_MAX)
+ return ERR_PTR(-EINVAL);
+
+ trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+ if (!trie)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ trie->map.map_type = attr->map_type;
+ trie->map.key_size = attr->key_size;
+ trie->map.value_size = attr->value_size;
+ trie->map.max_entries = attr->max_entries;
+ trie->data_size = attr->key_size -
+ offsetof(struct bpf_lpm_trie_key, data);
+ trie->max_prefixlen = trie->data_size * 8;
+
+ cost_per_node = sizeof(struct lpm_trie_node) +
+ attr->value_size + trie->data_size;
+ cost += (u64) attr->max_entries * cost_per_node;
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ ret = -E2BIG;
+ goto out_err;
+ }
+
+ trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ ret = bpf_map_precharge_memlock(trie->map.pages);
+ if (ret)
+ goto out_err;
+
+ raw_spin_lock_init(&trie->lock);
+
+ return &trie->map;
+out_err:
+ kfree(trie);
+ return ERR_PTR(ret);
+}
+
+static void trie_free(struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node __rcu **slot;
+ struct lpm_trie_node *node;
+
+ raw_spin_lock(&trie->lock);
+
+ /* Always start at the root and walk down to a node that has no
+ * children. Then free that node, nullify its reference in the parent
+ * and start over.
+ */
+
+ for (;;) {
+ slot = &trie->root;
+
+ for (;;) {
+ node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock));
+ if (!node)
+ goto unlock;
+
+ if (rcu_access_pointer(node->child[0])) {
+ slot = &node->child[0];
+ continue;
+ }
+
+ if (rcu_access_pointer(node->child[1])) {
+ slot = &node->child[1];
+ continue;
+ }
+
+ kfree(node);
+ RCU_INIT_POINTER(*slot, NULL);
+ break;
+ }
+ }
+
+unlock:
+ raw_spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+ .map_alloc = trie_alloc,
+ .map_free = trie_free,
+ .map_lookup_elem = trie_lookup_elem,
+ .map_update_elem = trie_update_elem,
+ .map_delete_elem = trie_delete_elem,
+};
+
+static struct bpf_map_type_list trie_type __ro_after_init = {
+ .ops = &trie_ops,
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+ bpf_register_map_type(&trie_type);
+ return 0;
+}
+late_initcall(register_trie_map);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index be8519148c25..22aa45cd0324 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = {
.map_delete_elem = stack_map_delete_elem,
};
-static struct bpf_map_type_list stack_map_type __read_mostly = {
+static struct bpf_map_type_list stack_map_type __ro_after_init = {
.ops = &stack_map_ops,
.type = BPF_MAP_TYPE_STACK_TRACE,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bbb016adbaeb..461eb1e66a0f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -10,6 +10,7 @@
* General Public License for more details.
*/
#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -241,6 +242,7 @@ static int map_create(union bpf_attr *attr)
/* failed to allocate fd */
goto free_map;
+ trace_bpf_map_create(map, err);
return err;
free_map:
@@ -365,6 +367,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
+ trace_bpf_map_lookup_elem(map, ufd, key, value);
err = 0;
free_value:
@@ -447,6 +450,8 @@ static int map_update_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
+ if (!err)
+ trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
kfree(value);
free_key:
@@ -492,6 +497,8 @@ static int map_delete_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
+ if (!err)
+ trace_bpf_map_delete_elem(map, ufd, key);
free_key:
kfree(key);
err_put:
@@ -544,6 +551,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (copy_to_user(unext_key, next_key, map->key_size) != 0)
goto free_next_key;
+ trace_bpf_map_next_key(map, ufd, key, next_key);
err = 0;
free_next_key:
@@ -697,8 +705,11 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
void bpf_prog_put(struct bpf_prog *prog)
{
- if (atomic_dec_and_test(&prog->aux->refcnt))
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ trace_bpf_prog_put_rcu(prog);
+ bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ }
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -807,7 +818,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
{
- return __bpf_prog_get(ufd, &type);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+
+ if (!IS_ERR(prog))
+ trace_bpf_prog_get_type(prog);
+ return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type);
@@ -889,6 +904,8 @@ static int bpf_prog_load(union bpf_attr *attr)
/* failed to allocate fd */
goto free_used_maps;
+ bpf_prog_kallsyms_add(prog);
+ trace_bpf_prog_load(prog, err);
return err;
free_used_maps:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdc43b899f28..d2bded2b250c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
}
+static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
+ u32 regno)
+{
+ mark_reg_unknown_value(regs, regno);
+ reset_reg_range_values(regs, regno);
+}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
+ case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
return 0;
}
}
@@ -627,7 +636,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
- if (off < 0 || off + size > map->value_size) {
+ if (off < 0 || size <= 0 || off + size > map->value_size) {
verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
@@ -635,6 +644,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
+/* check read/write into an adjusted map element */
+static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+ int off, int size)
+{
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *reg = &state->regs[regno];
+ int err;
+
+ /* We adjusted the register to this map value, so we
+ * need to change off and size to min_value and max_value
+ * respectively to make sure our theoretical access will be
+ * safe.
+ */
+ if (log_level)
+ print_verifier_state(state);
+ env->varlen_map_value_access = true;
+ /* The minimum value is only important with signed
+ * comparisons where we can't assume the floor of a
+ * value is 0. If we are using signed variables for our
+ * index'es we need to make sure that whatever we use
+ * will have a set floor within our range.
+ */
+ if (reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_map_access(env, regno, reg->min_value + off, size);
+ if (err) {
+ verbose("R%d min value is outside of the array range\n",
+ regno);
+ return err;
+ }
+
+ /* If we haven't set a max value then we need to bail
+ * since we can't be sure we won't do bad things.
+ */
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ regno);
+ return -EACCES;
+ }
+ return check_map_access(env, regno, reg->max_value + off, size);
+}
+
#define MAX_PACKET_OFF 0xffff
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
@@ -647,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
+ /* fallthrough */
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
@@ -775,47 +830,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
return -EACCES;
}
- /* If we adjusted the register to this map value at all then we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
- */
- if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
- if (log_level)
- print_verifier_state(state);
- env->varlen_map_value_access = true;
- /* The minimum value is only important with signed
- * comparisons where we can't assume the floor of a
- * value is 0. If we are using signed variables for our
- * index'es we need to make sure that whatever we use
- * will have a set floor within our range.
- */
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
- regno);
- return -EACCES;
- }
- err = check_map_access(env, regno, reg->min_value + off,
- size);
- if (err) {
- verbose("R%d min value is outside of the array range\n",
- regno);
- return err;
- }
-
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
- */
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
- regno);
- return -EACCES;
- }
- off += reg->max_value;
- }
- err = check_map_access(env, regno, off, size);
+ if (reg->type == PTR_TO_MAP_VALUE_ADJ)
+ err = check_map_access_adj(env, regno, off, size);
+ else
+ err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = UNKNOWN_VALUE;
@@ -827,7 +848,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
}
err = check_ctx_access(env, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
/* note that reg.[id|off|range] == 0 */
state->regs[value_regno].type = reg_type;
}
@@ -860,7 +882,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
@@ -958,6 +981,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+ int access_size, bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+
+ switch (regs[regno].type) {
+ case PTR_TO_PACKET:
+ return check_packet_access(env, regno, 0, access_size);
+ case PTR_TO_MAP_VALUE:
+ return check_map_access(env, regno, 0, access_size);
+ case PTR_TO_MAP_VALUE_ADJ:
+ return check_map_access_adj(env, regno, 0, access_size);
+ default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_stack_boundary(env, regno, access_size,
+ zero_size_allowed, meta);
+ }
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
@@ -993,10 +1035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_STACK;
if (type != PTR_TO_PACKET && type != expected_type)
goto err_type;
- } else if (arg_type == ARG_CONST_STACK_SIZE ||
- arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+ } else if (arg_type == ARG_CONST_SIZE ||
+ arg_type == ARG_CONST_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
- if (type != expected_type)
+ /* One exception. Allow UNKNOWN_VALUE registers when the
+ * boundaries are known and don't cause unsafe memory accesses
+ */
+ if (type != UNKNOWN_VALUE && type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1006,8 +1051,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_CTX;
if (type != expected_type)
goto err_type;
- } else if (arg_type == ARG_PTR_TO_STACK ||
- arg_type == ARG_PTR_TO_RAW_STACK) {
+ } else if (arg_type == ARG_PTR_TO_MEM ||
+ arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a CONST_IMM type. Final test
@@ -1015,9 +1060,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (type == CONST_IMM && reg->imm == 0)
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != expected_type)
+ else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+ type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
goto err_type;
- meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
+ meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -1063,9 +1109,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
false, NULL);
- } else if (arg_type == ARG_CONST_STACK_SIZE ||
- arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
- bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
+ } else if (arg_type == ARG_CONST_SIZE ||
+ arg_type == ARG_CONST_SIZE_OR_ZERO) {
+ bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
/* bpf_xxx(..., buf, len) call will access 'len' bytes
* from stack pointer 'buf'. Check it
@@ -1073,14 +1119,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ verbose("ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
- if (regs[regno - 1].type == PTR_TO_PACKET)
- err = check_packet_access(env, regno - 1, 0, reg->imm);
- else
- err = check_stack_boundary(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
+
+ /* If the register is UNKNOWN_VALUE, the access check happens
+ * using its boundaries. Otherwise, just use its imm
+ */
+ if (type == UNKNOWN_VALUE) {
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->min_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
+ if (err)
+ return err;
+ }
+
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->max_value,
+ zero_size_allowed, meta);
+ if (err)
+ return err;
+ } else {
+ /* register is CONST_IMM */
+ err = check_helper_mem_access(env, regno - 1, reg->imm,
+ zero_size_allowed, meta);
+ }
}
return err;
@@ -1154,15 +1236,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
{
int count = 0;
- if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
count++;
return count > 1 ? -EINVAL : 0;
@@ -1316,7 +1398,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env,
imm = insn->imm;
add_imm:
- if (imm <= 0) {
+ if (imm < 0) {
verbose("addition of negative constant to packet pointer is not allowed\n");
return -EACCES;
}
@@ -1485,22 +1567,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
+ u64 dst_imm = dst_reg->imm;
- /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or'
- * insn. Don't care about overflow or negative values, just add them
+ /* dst_reg->type == CONST_IMM here. Simulate execution of insns
+ * containing ALU ops. Don't care about overflow or negative
+ * values, just add/sub/... them; registers are in u64.
*/
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
- dst_reg->imm += insn->imm;
- else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM)
- dst_reg->imm += src_reg->imm;
- else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K)
- dst_reg->imm |= insn->imm;
- else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM)
- dst_reg->imm |= src_reg->imm;
- else
+ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm += insn->imm;
+ } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm += src_reg->imm;
+ } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm -= insn->imm;
+ } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm -= src_reg->imm;
+ } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm *= insn->imm;
+ } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm *= src_reg->imm;
+ } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm |= insn->imm;
+ } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm |= src_reg->imm;
+ } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm &= insn->imm;
+ } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm &= src_reg->imm;
+ } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm >>= insn->imm;
+ } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm >>= src_reg->imm;
+ } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm <<= insn->imm;
+ } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm <<= src_reg->imm;
+ } else {
mark_reg_unknown_value(regs, insn->dst_reg);
+ goto out;
+ }
+
+ dst_reg->imm = dst_imm;
+out:
return 0;
}
@@ -1894,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JGT:
/* Unsigned comparison, the minimum value is 0. */
false_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGT:
/* If this is false then we know the maximum val is val,
* otherwise we know the min val is val+1.
@@ -1904,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JGE:
/* Unsigned comparison, the minimum value is 0. */
false_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGE:
/* If this is false then we know the maximum value is val - 1,
* otherwise we know the mimimum value is val.
@@ -1942,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JGT:
/* Unsigned comparison, the minimum value is 0. */
true_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGT:
/*
* If this is false, then the val is <= the register, if it is
@@ -1953,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JGE:
/* Unsigned comparison, the minimum value is 0. */
true_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGE:
/* If this is false then constant < register, if it is true then
* the register < constant.
@@ -2144,14 +2262,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (insn->src_reg == 0) {
- /* generic move 64-bit immediate into a register,
- * only analyzer needs to collect the ld_imm value.
- */
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- if (!env->analyzer_ops)
- return 0;
-
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = imm;
return 0;
@@ -2729,7 +2841,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- reset_reg_range_values(regs, insn->dst_reg);
if (BPF_SIZE(insn->code) != BPF_W &&
BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++;
@@ -3085,10 +3196,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn = env->prog->insnsi + delta;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
insn->code == (BPF_STX | BPF_MEM | BPF_DW))
type = BPF_WRITE;
else
@@ -3097,8 +3212,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
continue;
- cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf, env->prog);
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 297756be369c..99127edc5204 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -11,7 +11,7 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_COMPACTION=y
-CONFIG_DEBUG_RODATA=y
+CONFIG_STRICT_KERNEL_RWX=y
CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
diff --git a/kernel/exit.c b/kernel/exit.c
index b67c57faa705..580da79e38ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -14,7 +14,6 @@
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
-#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
@@ -1390,7 +1389,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
@@ -1410,20 +1409,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (!ret)
return ret;
- ret = security_task_wait(p);
- if (unlikely(ret < 0)) {
- /*
- * If we have not yet seen any eligible child,
- * then let this error code replace -ECHILD.
- * A permission error will give the user a clue
- * to look for security policy problems, rather
- * than for mysterious wait bugs.
- */
- if (wo->notask_error)
- wo->notask_error = ret;
- return 0;
- }
-
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
@@ -1516,7 +1501,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
diff --git a/kernel/extable.c b/kernel/extable.c
index e1359474baa5..2676d7f8baf6 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,10 +17,12 @@
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
+#include <linux/extable.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/kprobes.h>
+#include <linux/filter.h>
#include <asm/sections.h>
#include <linux/uaccess.h>
@@ -107,6 +109,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
return 1;
+ if (is_bpf_text_address(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -130,6 +134,8 @@ int kernel_text_address(unsigned long addr)
return 1;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
return 1;
+ if (is_bpf_text_address(addr))
+ return 1;
return 0;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fafd1a3ef0da..6a3b249a2ae1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/filter.h>
#include <linux/compiler.h>
#include <asm/sections.h>
@@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
unsigned long *offset)
{
char namebuf[KSYM_NAME_LEN];
+
if (is_ksym_addr(addr))
return !!get_symbol_pos(addr, symbolsize, offset);
-
- return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+ !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
/*
@@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr,
unsigned long *offset,
char **modname, char *namebuf)
{
+ const char *ret;
+
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
@@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr,
return namebuf;
}
- /* See if it's in a module. */
- return module_address_lookup(addr, symbolsize, offset, modname,
- namebuf);
+ /* See if it's in a module or a BPF JITed image. */
+ ret = module_address_lookup(addr, symbolsize, offset,
+ modname, namebuf);
+ if (!ret)
+ ret = bpf_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
+ return ret;
}
int lookup_symbol_name(unsigned long addr, char *symname)
@@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
+ loff_t pos_mod_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -481,13 +490,27 @@ struct kallsym_iter {
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
- &iter->type, iter->name, iter->module_name,
- &iter->exported) < 0)
+ int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_mod_end = iter->pos;
return 0;
+ }
+
return 1;
}
+static int get_ksymbol_bpf(struct kallsym_iter *iter)
+{
+ iter->module_name[0] = '\0';
+ iter->exported = 0;
+ return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
+}
+
/* Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
@@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
+ if (new_pos == 0)
+ iter->pos_mod_end = 0;
+}
+
+static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
+{
+ iter->pos = pos;
+
+ if (iter->pos_mod_end > 0 &&
+ iter->pos_mod_end < iter->pos)
+ return get_ksymbol_bpf(iter);
+
+ if (!get_ksymbol_mod(iter))
+ return get_ksymbol_bpf(iter);
+
+ return 1;
}
/* Returns false if pos at or past end of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
/* Module symbols can be accessed randomly. */
- if (pos >= kallsyms_num_syms) {
- iter->pos = pos;
- return get_ksymbol_mod(iter);
- }
+ if (pos >= kallsyms_num_syms)
+ return update_iter_mod(iter, pos);
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 5617cc412444..bfe62d5b3872 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_nmi_flush_on_panic();
+ printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
@@ -1399,7 +1399,7 @@ void __weak arch_crash_save_vmcoreinfo(void)
phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
+ return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
}
static int __init crash_save_vmcoreinfo_init(void)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index d45c96073afb..0c407f905ca4 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -516,7 +516,7 @@ static void helper_unlock(void)
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
-struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
char **envp, gfp_t gfp_mask,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info),
@@ -528,7 +528,12 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
goto out;
INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+
+#ifdef CONFIG_STATIC_USERMODEHELPER
+ sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
+#else
sub_info->path = path;
+#endif
sub_info->argv = argv;
sub_info->envp = envp;
@@ -566,6 +571,15 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
retval = -EBUSY;
goto out;
}
+
+ /*
+ * If there is no binary for us to call, then just return and get out of
+ * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
+ * disable all call_usermodehelper() calls.
+ */
+ if (strlen(sub_info->path) == 0)
+ goto out;
+
/*
* Set the completion pointer only if there is a waiter.
* This makes it possible to use umh_complete to free
@@ -613,7 +627,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec);
* This function is the equivalent to use call_usermodehelper_setup() and
* call_usermodehelper_exec().
*/
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
{
struct subprocess_info *info;
gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ebb4dadca66b..699c5bc51a92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1740,6 +1740,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
+int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return NOTIFY_DONE;
+}
+
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
.priority = 0x7fffffff /* we need to be notified first */
diff --git a/kernel/module.c b/kernel/module.c
index 3d8f126208e3..7eba6dea4f41 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -17,6 +17,7 @@
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
+#include <linux/extable.h>
#include <linux/moduleloader.h>
#include <linux/trace_events.h>
#include <linux/init.h>
@@ -61,6 +62,7 @@
#include <linux/pfn.h>
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
+#include <linux/audit.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -74,9 +76,9 @@
/*
* Modules' sections will be aligned on page boundaries
* to ensure complete separation of code and data, but
- * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ * only when CONFIG_STRICT_MODULE_RWX=y
*/
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
# define debug_align(X) ALIGN(X, PAGE_SIZE)
#else
# define debug_align(X) (X)
@@ -1844,7 +1846,7 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
@@ -2809,6 +2811,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
if (get_modinfo(info, "livepatch")) {
mod->klp = true;
add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
}
return 0;
@@ -3608,6 +3612,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_copy;
}
+ audit_log_kern_module(mod->name);
+
/* Reserve our place in the list. */
err = add_unformed_module(mod);
if (err)
@@ -3696,7 +3702,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->name, after_dashes);
}
- /* Link in to syfs. */
+ /* Link in to sysfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
goto coming_cleanup;
@@ -3719,6 +3725,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod_sysfs_teardown(mod);
coming_cleanup:
mod->state = MODULE_STATE_GOING;
+ destroy_params(mod->kp, mod->num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
@@ -4165,22 +4172,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
struct module *mod;
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (mod->num_exentries == 0)
- continue;
+ mod = __module_address(addr);
+ if (!mod)
+ goto out;
- e = search_extable(mod->extable,
- mod->extable + mod->num_exentries - 1,
- addr);
- if (e)
- break;
- }
+ if (!mod->num_exentries)
+ goto out;
+
+ e = search_extable(mod->extable,
+ mod->extable + mod->num_exentries - 1,
+ addr);
+out:
preempt_enable();
- /* Now, if we found one, we are running inside it now, hence
- we cannot unload the module, hence no refcnt needed. */
+ /*
+ * Now, if we found one, we are running inside it now, hence
+ * we cannot unload the module, hence no refcnt needed.
+ */
return e;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 08aa88dde7de..b95959733ce0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -188,7 +188,7 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_nmi_flush_on_panic();
+ printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -213,7 +213,7 @@ void panic(const char *fmt, ...)
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
/* Call flush even twice. It tries harder with a single online CPU */
- printk_nmi_flush_on_panic();
+ printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26dbc48c75b..86385af1080f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1156,7 +1156,7 @@ static int __init hibernate_setup(char *str)
} else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
- } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+ } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)
&& !strncmp(str, "protect_image", 13)) {
enable_restore_image_protection();
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1dfa0da827d3..7fdc40d31b7d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
/* kernel/power/snapshot.c */
extern void enable_restore_image_protection(void);
#else
static inline void enable_restore_image_protection(void) {}
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
#else /* !CONFIG_HIBERNATION */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2d8e2b227db8..905d5bbd595f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,7 +38,7 @@
#include "power.h"
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
static bool hibernate_restore_protection;
static bool hibernate_restore_protection_active;
@@ -73,7 +73,7 @@ static inline void hibernate_restore_protection_begin(void) {}
static inline void hibernate_restore_protection_end(void) {}
static inline void hibernate_restore_protect_page(void *page_address) {}
static inline void hibernate_restore_unprotect_page(void *page_address) {}
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index bdff5ed57f10..5db217051232 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -166,7 +166,7 @@ static int __init setup_test_suspend(char *value)
return 0;
}
- for (i = 0; pm_labels[i]; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 32e0c232efba..f80fd33639e0 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -201,7 +201,7 @@ void free_all_swap_pages(int swap)
struct swsusp_extent *ext;
unsigned long offset;
- ext = container_of(node, struct swsusp_extent, node);
+ ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
swap_free(swp_entry(swap, offset));
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index abb0042a427b..4a2ffc39eb95 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,3 @@
obj-y = printk.o
-obj-$(CONFIG_PRINTK_NMI) += nmi.o
+obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7fd2838fa417..1db044f808b7 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,42 +16,55 @@
*/
#include <linux/percpu.h>
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK
-int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
-
-#ifdef CONFIG_PRINTK_NMI
+#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
+#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;
+__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
+void __printk_safe_enter(void);
+void __printk_safe_exit(void);
+
+#define printk_safe_enter_irqsave(flags) \
+ do { \
+ local_irq_save(flags); \
+ __printk_safe_enter(); \
+ } while (0)
+
+#define printk_safe_exit_irqrestore(flags) \
+ do { \
+ __printk_safe_exit(); \
+ local_irq_restore(flags); \
+ } while (0)
+
+#define printk_safe_enter_irq() \
+ do { \
+ local_irq_disable(); \
+ __printk_safe_enter(); \
+ } while (0)
+
+#define printk_safe_exit_irq() \
+ do { \
+ __printk_safe_exit(); \
+ local_irq_enable(); \
+ } while (0)
+
+#else
+
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
+
/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it temporary stores the strings into a per-CPU buffer.
- * The alternative implementation is chosen transparently
- * via per-CPU variable.
+ * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * semaphore and some of console functions (console_unlock()/etc.), so
+ * printk-safe must preserve the existing local IRQ guarantees.
*/
-DECLARE_PER_CPU(printk_func_t, printk_func);
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
- return this_cpu_read(printk_func)(fmt, args);
-}
-
-extern atomic_t nmi_message_lost;
-static inline int get_nmi_message_lost(void)
-{
- return atomic_xchg(&nmi_message_lost, 0);
-}
-
-#else /* CONFIG_PRINTK_NMI */
-
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
- return vprintk_default(fmt, args);
-}
-
-static inline int get_nmi_message_lost(void)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
+#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
+#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
+
+#define printk_safe_enter_irq() local_irq_disable()
+#define printk_safe_exit_irq() local_irq_enable()
+
+#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 4ba3d34938c0..34da86e73d00 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -213,17 +213,36 @@ static int nr_ext_console_drivers;
static int __down_trylock_console_sem(unsigned long ip)
{
- if (down_trylock(&console_sem))
+ int lock_failed;
+ unsigned long flags;
+
+ /*
+ * Here and in __up_console_sem() we need to be in safe mode,
+ * because spindump/WARN/etc from under console ->lock will
+ * deadlock in printk()->down_trylock_console_sem() otherwise.
+ */
+ printk_safe_enter_irqsave(flags);
+ lock_failed = down_trylock(&console_sem);
+ printk_safe_exit_irqrestore(flags);
+
+ if (lock_failed)
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
}
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
-#define up_console_sem() do { \
- mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
- up(&console_sem);\
-} while (0)
+static void __up_console_sem(unsigned long ip)
+{
+ unsigned long flags;
+
+ mutex_release(&console_lock_dep_map, 1, ip);
+
+ printk_safe_enter_irqsave(flags);
+ up(&console_sem);
+ printk_safe_exit_irqrestore(flags);
+}
+#define up_console_sem() __up_console_sem(_RET_IP_)
/*
* This is used for debugging the mess that is the VT code by
@@ -351,6 +370,34 @@ __packed __aligned(4)
*/
DEFINE_RAW_SPINLOCK(logbuf_lock);
+/*
+ * Helper macros to lock/unlock logbuf_lock and switch between
+ * printk-safe/unsafe modes.
+ */
+#define logbuf_lock_irq() \
+ do { \
+ printk_safe_enter_irq(); \
+ raw_spin_lock(&logbuf_lock); \
+ } while (0)
+
+#define logbuf_unlock_irq() \
+ do { \
+ raw_spin_unlock(&logbuf_lock); \
+ printk_safe_exit_irq(); \
+ } while (0)
+
+#define logbuf_lock_irqsave(flags) \
+ do { \
+ printk_safe_enter_irqsave(flags); \
+ raw_spin_lock(&logbuf_lock); \
+ } while (0)
+
+#define logbuf_unlock_irqrestore(flags) \
+ do { \
+ raw_spin_unlock(&logbuf_lock); \
+ printk_safe_exit_irqrestore(flags); \
+ } while (0)
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -782,20 +829,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- raw_spin_lock_irq(&logbuf_lock);
+
+ logbuf_lock_irq();
while (user->seq == log_next_seq) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
goto out;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
ret = wait_event_interruptible(log_wait,
user->seq != log_next_seq);
if (ret)
goto out;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
}
if (user->seq < log_first_seq) {
@@ -803,7 +851,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->idx = log_first_idx;
user->seq = log_first_seq;
ret = -EPIPE;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
goto out;
}
@@ -816,7 +864,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->idx = log_next(user->idx);
user->seq++;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
if (len > count) {
ret = -EINVAL;
@@ -843,7 +891,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -867,7 +915,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
return ret;
}
@@ -881,7 +929,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (user->seq < log_next_seq) {
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
@@ -889,7 +937,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
else
ret = POLLIN|POLLRDNORM;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
return ret;
}
@@ -919,10 +967,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
user->idx = log_first_idx;
user->seq = log_first_seq;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
file->private_data = user;
return 0;
@@ -1064,13 +1112,13 @@ void __init setup_log_buf(int early)
return;
}
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
free = __LOG_BUF_LEN - log_next_idx;
memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
pr_info("log_buf_len: %d bytes\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
@@ -1248,7 +1296,7 @@ static int syslog_print(char __user *buf, int size)
size_t n;
size_t skip;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
@@ -1256,7 +1304,7 @@ static int syslog_print(char __user *buf, int size)
syslog_partial = 0;
}
if (syslog_seq == log_next_seq) {
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
break;
}
@@ -1275,7 +1323,7 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
if (!n)
break;
@@ -1304,7 +1352,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
if (!text)
return -ENOMEM;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (buf) {
u64 next_seq;
u64 seq;
@@ -1352,12 +1400,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
idx = log_next(idx);
seq++;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (seq < log_first_seq) {
/* messages are gone, move to next one */
@@ -1371,7 +1419,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
clear_seq = log_next_seq;
clear_idx = log_next_idx;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
kfree(text);
return len;
@@ -1458,7 +1506,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
@@ -1486,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1510,8 +1558,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(int level,
- const char *ext_text, size_t ext_len,
+static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len)
{
struct console *con;
@@ -1538,28 +1585,6 @@ static void call_console_drivers(int level,
}
}
-/*
- * Zap console related locks when oopsing.
- * To leave time for slow consoles to print a full oops,
- * only zap at most once every 30 seconds.
- */
-static void zap_locks(void)
-{
- static unsigned long oops_timestamp;
-
- if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
- return;
-
- oops_timestamp = jiffies;
-
- debug_locks_off();
- /* If a crash is occurring, make sure we can't deadlock */
- raw_spin_lock_init(&logbuf_lock);
- /* And make sure that we print immediately */
- sema_init(&console_sem, 1);
-}
-
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1669,18 +1694,13 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static bool recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len = 0;
enum log_flags lflags = 0;
unsigned long flags;
- int this_cpu;
int printed_len = 0;
- int nmi_message_lost;
bool in_sched = false;
- /* cpu currently holding logbuf_lock in this function */
- static unsigned int logbuf_cpu = UINT_MAX;
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
@@ -1690,53 +1710,8 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- local_irq_save(flags);
- this_cpu = smp_processor_id();
-
- /*
- * Ouch, printk recursed into itself!
- */
- if (unlikely(logbuf_cpu == this_cpu)) {
- /*
- * If a crash is occurring during printk() on this CPU,
- * then try to get the crash message out but make sure
- * we can't deadlock. Otherwise just return to avoid the
- * recursion and return - but flag the recursion so that
- * it can be printed at the next appropriate moment:
- */
- if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = true;
- local_irq_restore(flags);
- return 0;
- }
- zap_locks();
- }
-
- lockdep_off();
/* This stops the holder of console_sem just where we want him */
- raw_spin_lock(&logbuf_lock);
- logbuf_cpu = this_cpu;
-
- if (unlikely(recursion_bug)) {
- static const char recursion_msg[] =
- "BUG: recent printk recursion!";
-
- recursion_bug = false;
- /* emit KERN_CRIT message */
- printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg,
- strlen(recursion_msg));
- }
-
- nmi_message_lost = get_nmi_message_lost();
- if (unlikely(nmi_message_lost)) {
- text_len = scnprintf(textbuf, sizeof(textbuf),
- "BAD LUCK: lost %d message(s) from NMI context!",
- nmi_message_lost);
- printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, textbuf, text_len);
- }
-
+ logbuf_lock_irqsave(flags);
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
@@ -1779,14 +1754,10 @@ asmlinkage int vprintk_emit(int facility, int level,
printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
- logbuf_cpu = UINT_MAX;
- raw_spin_unlock(&logbuf_lock);
- lockdep_on();
- local_irq_restore(flags);
+ logbuf_unlock_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
- lockdep_off();
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
@@ -1794,7 +1765,6 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (console_trylock())
console_unlock();
- lockdep_on();
}
return printed_len;
@@ -1803,7 +1773,7 @@ EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ return vprintk_func(fmt, args);
}
EXPORT_SYMBOL(vprintk);
@@ -1895,16 +1865,12 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
-static void call_console_drivers(int level,
- const char *ext_text, size_t ext_len,
+static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
-/* Still needs to be defined for users */
-DEFINE_PER_CPU(printk_func_t, printk_func);
-
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
@@ -2220,9 +2186,9 @@ again:
struct printk_log *msg;
size_t ext_len = 0;
size_t len;
- int level;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ printk_safe_enter_irqsave(flags);
+ raw_spin_lock(&logbuf_lock);
if (seen_seq != log_next_seq) {
wake_klogd = true;
seen_seq = log_next_seq;
@@ -2243,8 +2209,7 @@ skip:
break;
msg = log_from_idx(console_idx);
- level = msg->level;
- if (suppress_message_printing(level)) {
+ if (suppress_message_printing(msg->level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
@@ -2270,9 +2235,9 @@ skip:
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(level, ext_text, ext_len, text, len);
+ call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
- local_irq_restore(flags);
+ printk_safe_exit_irqrestore(flags);
if (do_cond_resched)
cond_resched();
@@ -2295,7 +2260,8 @@ skip:
*/
raw_spin_lock(&logbuf_lock);
retry = console_seq != log_next_seq;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ raw_spin_unlock(&logbuf_lock);
+ printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
goto again;
@@ -2558,10 +2524,10 @@ void register_console(struct console *newcon)
* console_unlock(); will print out the buffered messages
* for us.
*/
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
console_seq = syslog_seq;
console_idx = syslog_idx;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -2860,12 +2826,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
/* initialize iterator with data about the stored records */
dumper->active = true;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
dumper->cur_seq = clear_seq;
dumper->cur_idx = clear_idx;
dumper->next_seq = log_next_seq;
dumper->next_idx = log_next_idx;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
@@ -2950,9 +2916,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
unsigned long flags;
bool ret;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
return ret;
}
@@ -2991,7 +2957,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
if (!dumper->active)
goto out;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
if (dumper->cur_seq < log_first_seq) {
/* messages are gone, move to first available one */
dumper->cur_seq = log_first_seq;
@@ -3000,7 +2966,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* last entry */
if (dumper->cur_seq >= dumper->next_seq) {
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
goto out;
}
@@ -3042,7 +3008,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
dumper->next_seq = next_seq;
dumper->next_idx = next_idx;
ret = true;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
out:
if (len)
*len = l;
@@ -3080,9 +3046,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
{
unsigned long flags;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
kmsg_dump_rewind_nolock(dumper);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/printk/nmi.c b/kernel/printk/printk_safe.c
index f011aaef583c..033e50a7d706 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/printk_safe.c
@@ -1,5 +1,5 @@
/*
- * nmi.c - Safe printk in NMI context
+ * printk_safe.c - Safe printk for printk-deadlock-prone contexts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -32,36 +32,58 @@
* is later flushed into the main ring buffer via IRQ work.
*
* The alternative implementation is chosen transparently
- * via @printk_func per-CPU variable.
+ * by examinig current printk() context mask stored in @printk_context
+ * per-CPU variable.
*
* The implementation allows to flush the strings also from another CPU.
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-static int printk_nmi_irq_ready;
-atomic_t nmi_message_lost;
+static int printk_safe_irq_ready;
-#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - sizeof(struct irq_work))
+#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
+ sizeof(atomic_t) - \
+ sizeof(atomic_t) - \
+ sizeof(struct irq_work))
-struct nmi_seq_buf {
+struct printk_safe_seq_buf {
atomic_t len; /* length of written data */
+ atomic_t message_lost;
struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[NMI_LOG_BUF_LEN];
+ unsigned char buffer[SAFE_LOG_BUF_LEN];
};
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
+static DEFINE_PER_CPU(int, printk_context);
+
+#ifdef CONFIG_PRINTK_NMI
+static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
+#endif
+
+/* Get flushed in a more safe context. */
+static void queue_flush_work(struct printk_safe_seq_buf *s)
+{
+ if (printk_safe_irq_ready) {
+ /* Make sure that IRQ work is really initialized. */
+ smp_rmb();
+ irq_work_queue(&s->work);
+ }
+}
/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
+ * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
+ * have dedicated buffers, because otherwise printk-safe preempted by
+ * NMI-printk would have overwritten the NMI messages.
+ *
+ * The messages are fushed from irq work (or from panic()), possibly,
+ * from other CPU, concurrently with printk_safe_log_store(). Should this
+ * happen, printk_safe_log_store() will notice the buffer->len mismatch
+ * and repeat the write.
*/
-static int vprintk_nmi(const char *fmt, va_list args)
+static int printk_safe_log_store(struct printk_safe_seq_buf *s,
+ const char *fmt, va_list args)
{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- int add = 0;
+ int add;
size_t len;
again:
@@ -69,18 +91,21 @@ again:
/* The trailing '\0' is not counted into len. */
if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&nmi_message_lost);
+ atomic_inc(&s->message_lost);
+ queue_flush_work(s);
return 0;
}
/*
- * Make sure that all old data have been read before the buffer was
- * reseted. This is not needed when we just append data.
+ * Make sure that all old data have been read before the buffer
+ * was reset. This is not needed when we just append data.
*/
if (!len)
smp_rmb();
add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+ if (!add)
+ return 0;
/*
* Do it once again if the buffer has been flushed in the meantime.
@@ -90,32 +115,23 @@ again:
if (atomic_cmpxchg(&s->len, len, len + add) != len)
goto again;
- /* Get flushed in a more safe context. */
- if (add && printk_nmi_irq_ready) {
- /* Make sure that IRQ work is really initialized. */
- smp_rmb();
- irq_work_queue(&s->work);
- }
-
+ queue_flush_work(s);
return add;
}
-static void printk_nmi_flush_line(const char *text, int len)
+static inline void printk_safe_flush_line(const char *text, int len)
{
/*
- * The buffers are flushed in NMI only on panic. The messages must
- * go only into the ring buffer at this stage. Consoles will get
- * explicitly called later when a crashdump is not generated.
+ * Avoid any console drivers calls from here, because we may be
+ * in NMI or printk_safe context (when in panic). The messages
+ * must go only into the ring buffer at this stage. Consoles will
+ * get explicitly called later when a crashdump is not generated.
*/
- if (in_nmi())
- printk_deferred("%.*s", len, text);
- else
- printk("%.*s", len, text);
-
+ printk_deferred("%.*s", len, text);
}
/* printk part of the temporary buffer line by line */
-static int printk_nmi_flush_buffer(const char *start, size_t len)
+static int printk_safe_flush_buffer(const char *start, size_t len)
{
const char *c, *end;
bool header;
@@ -127,7 +143,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
/* Print line by line. */
while (c < end) {
if (*c == '\n') {
- printk_nmi_flush_line(start, c - start + 1);
+ printk_safe_flush_line(start, c - start + 1);
start = ++c;
header = true;
continue;
@@ -140,7 +156,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
continue;
}
- printk_nmi_flush_line(start, c - start);
+ printk_safe_flush_line(start, c - start);
start = c++;
header = true;
continue;
@@ -154,22 +170,31 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
if (start < end && !header) {
static const char newline[] = KERN_CONT "\n";
- printk_nmi_flush_line(start, end - start);
- printk_nmi_flush_line(newline, strlen(newline));
+ printk_safe_flush_line(start, end - start);
+ printk_safe_flush_line(newline, strlen(newline));
}
return len;
}
+static void report_message_lost(struct printk_safe_seq_buf *s)
+{
+ int lost = atomic_xchg(&s->message_lost, 0);
+
+ if (lost)
+ printk_deferred("Lost %d message(s)!\n", lost);
+}
+
/*
- * Flush data from the associated per_CPU buffer. The function
+ * Flush data from the associated per-CPU buffer. The function
* can be called either via IRQ work or independently.
*/
-static void __printk_nmi_flush(struct irq_work *work)
+static void __printk_safe_flush(struct irq_work *work)
{
static raw_spinlock_t read_lock =
__RAW_SPIN_LOCK_INITIALIZER(read_lock);
- struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ struct printk_safe_seq_buf *s =
+ container_of(work, struct printk_safe_seq_buf, work);
unsigned long flags;
size_t len;
int i;
@@ -194,9 +219,9 @@ more:
* buffer size.
*/
if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_nmi_flush: internal error\n";
+ const char *msg = "printk_safe_flush: internal error\n";
- printk_nmi_flush_line(msg, strlen(msg));
+ printk_safe_flush_line(msg, strlen(msg));
len = 0;
}
@@ -205,7 +230,7 @@ more:
/* Make sure that data has been written up to the @len */
smp_rmb();
- i += printk_nmi_flush_buffer(s->buffer + i, len - i);
+ i += printk_safe_flush_buffer(s->buffer + i, len - i);
/*
* Check that nothing has got added in the meantime and truncate
@@ -217,35 +242,40 @@ more:
goto more;
out:
+ report_message_lost(s);
raw_spin_unlock_irqrestore(&read_lock, flags);
}
/**
- * printk_nmi_flush - flush all per-cpu nmi buffers.
+ * printk_safe_flush - flush all per-cpu nmi buffers.
*
* The buffers are flushed automatically via IRQ work. This function
* is useful only when someone wants to be sure that all buffers have
* been flushed at some point.
*/
-void printk_nmi_flush(void)
+void printk_safe_flush(void)
{
int cpu;
- for_each_possible_cpu(cpu)
- __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+ for_each_possible_cpu(cpu) {
+#ifdef CONFIG_PRINTK_NMI
+ __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
+#endif
+ __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
+ }
}
/**
- * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
* goes down.
*
- * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * Similar to printk_safe_flush() but it can be called even in NMI context when
* the system goes down. It does the best effort to get NMI messages into
* the main ring buffer.
*
* Note that it could try harder when there is only one CPU online.
*/
-void printk_nmi_flush_on_panic(void)
+void printk_safe_flush_on_panic(void)
{
/*
* Make sure that we could access the main ring buffer.
@@ -259,33 +289,97 @@ void printk_nmi_flush_on_panic(void)
raw_spin_lock_init(&logbuf_lock);
}
- printk_nmi_flush();
+ printk_safe_flush();
}
-void __init printk_nmi_init(void)
+#ifdef CONFIG_PRINTK_NMI
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
{
- int cpu;
+ struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- for_each_possible_cpu(cpu) {
- struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+ return printk_safe_log_store(s, fmt, args);
+}
- init_irq_work(&s->work, __printk_nmi_flush);
- }
+void printk_nmi_enter(void)
+{
+ this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+}
- /* Make sure that IRQ works are initialized before enabling. */
- smp_wmb();
- printk_nmi_irq_ready = 1;
+void printk_nmi_exit(void)
+{
+ this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+}
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_nmi_flush();
+#else
+
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ return 0;
}
-void printk_nmi_enter(void)
+#endif /* CONFIG_PRINTK_NMI */
+
+/*
+ * Lock-less printk(), to avoid deadlocks should the printk() recurse
+ * into itself. It uses a per-CPU buffer to store the message, just like
+ * NMI.
+ */
+static int vprintk_safe(const char *fmt, va_list args)
{
- this_cpu_write(printk_func, vprintk_nmi);
+ struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
+
+ return printk_safe_log_store(s, fmt, args);
}
-void printk_nmi_exit(void)
+/* Can be preempted by NMI. */
+void __printk_safe_enter(void)
+{
+ this_cpu_inc(printk_context);
+}
+
+/* Can be preempted by NMI. */
+void __printk_safe_exit(void)
{
- this_cpu_write(printk_func, vprintk_default);
+ this_cpu_dec(printk_context);
+}
+
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
+ return vprintk_nmi(fmt, args);
+
+ if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
+ return vprintk_safe(fmt, args);
+
+ return vprintk_default(fmt, args);
+}
+
+void __init printk_safe_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct printk_safe_seq_buf *s;
+
+ s = &per_cpu(safe_print_seq, cpu);
+ init_irq_work(&s->work, __printk_safe_flush);
+
+#ifdef CONFIG_PRINTK_NMI
+ s = &per_cpu(nmi_print_seq, cpu);
+ init_irq_work(&s->work, __printk_safe_flush);
+#endif
+ }
+
+ /* Make sure that IRQ works are initialized before enabling. */
+ smp_wmb();
+ printk_safe_irq_ready = 1;
+
+ /* Flush pending messages that did not have scheduled IRQ works. */
+ printk_safe_flush();
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34e2291a9a6c..e1ae6ac15eac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -23,6 +23,9 @@
#include <asm/switch_to.h>
#include <asm/tlb.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched.h"
#include "../workqueue_internal.h"
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f7ce79a46050..e15185c28de5 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -16,6 +16,7 @@
#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
+#include <linux/coredump.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
@@ -486,6 +487,17 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
+{
+ memset(info, 0, sizeof(*info));
+ info->si_signo = SIGSYS;
+ info->si_code = SYS_SECCOMP;
+ info->si_call_addr = (void __user *)KSTK_EIP(current);
+ info->si_errno = reason;
+ info->si_arch = syscall_get_arch();
+ info->si_syscall = syscall;
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
@@ -496,13 +508,7 @@ void put_seccomp_filter(struct task_struct *tsk)
static void seccomp_send_sigsys(int syscall, int reason)
{
struct siginfo info;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGSYS;
- info.si_code = SYS_SECCOMP;
- info.si_call_addr = (void __user *)KSTK_EIP(current);
- info.si_errno = reason;
- info.si_arch = syscall_get_arch();
- info.si_syscall = syscall;
+ seccomp_init_siginfo(&info, syscall, reason);
force_sig_info(SIGSYS, &info, current);
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -634,10 +640,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_KILL:
- default:
+ default: {
+ siginfo_t info;
audit_seccomp(this_syscall, SIGSYS, action);
+ /* Dump core only if this is the last remaining thread. */
+ if (get_nr_threads(current) == 1) {
+ /* Show the original registers in the dump. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Trigger a manual coredump since do_exit skips it. */
+ seccomp_init_siginfo(&info, this_syscall, data);
+ do_coredump(&info);
+ }
do_exit(SIGSYS);
}
+ }
unreachable();
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 95cecbf67f5c..b2058a7f94bd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -28,6 +28,8 @@
#include <linux/uaccess.h>
#include <linux/list.h>
+#include "../../block/blk.h"
+
#include <trace/events/block.h>
#include "trace_output.h"
@@ -292,9 +294,6 @@ record_it:
local_irq_restore(flags);
}
-static struct dentry *blk_tree_root;
-static DEFINE_MUTEX(blk_tree_mutex);
-
static void blk_trace_free(struct blk_trace *bt)
{
debugfs_remove(bt->msg_file);
@@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
/*
* Setup everything required to start tracing
*/
-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- struct blk_user_trace_setup *buts)
+static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ struct blk_user_trace_setup *buts)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
@@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -ENOENT;
- mutex_lock(&blk_tree_mutex);
- if (!blk_tree_root) {
- blk_tree_root = debugfs_create_dir("block", NULL);
- if (!blk_tree_root) {
- mutex_unlock(&blk_tree_mutex);
- goto err;
- }
- }
- mutex_unlock(&blk_tree_mutex);
-
- dir = debugfs_create_dir(buts->name, blk_tree_root);
+ if (!blk_debugfs_root)
+ goto err;
+ dir = debugfs_lookup(buts->name, blk_debugfs_root);
+ if (!dir)
+ bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
if (!dir)
goto err;
- bt->dir = dir;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
@@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
- return 0;
+ ret = 0;
err:
- blk_trace_free(bt);
+ if (dir && !bt->dir)
+ dput(dir);
+ if (ret)
+ blk_trace_free(bt);
return ret;
}
@@ -712,15 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (likely(!bt))
return;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+ if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
- what, rq->errors, rq->cmd_len, rq->cmd);
- } else {
+ else
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
- }
+
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
+ rq->cmd_flags, what, rq->errors, 0, NULL);
}
static void blk_add_trace_rq_abort(void *ignore,
@@ -972,11 +965,7 @@ void blk_add_driver_data(struct request_queue *q,
if (likely(!bt))
return;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
- else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1752,31 +1741,6 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_dump_cmd(char *buf, struct request *rq)
-{
- int i, end;
- int len = rq->cmd_len;
- unsigned char *cmd = rq->cmd;
-
- if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
- buf[0] = '\0';
- return;
- }
-
- for (end = len - 1; end >= 0; end--)
- if (cmd[end])
- break;
- end++;
-
- for (i = 0; i < len; i++) {
- buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
- if (i == end && end != len - 1) {
- sprintf(buf, " ..");
- break;
- }
- }
-}
-
void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
{
int i = 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fa77311dadb2..cee9802cf3e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.func = bpf_probe_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_RAW_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
};
@@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
@@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
};
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
@@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ int ret;
+
+ /*
+ * The strncpy_from_unsafe() call will likely not fill the entire
+ * buffer, but that's okay in this circumstance as we're probing
+ * arbitrary memory anyway similar to bpf_probe_read() and might
+ * as well probe the stack. Thus, memory is explicitly cleared
+ * only in error case, so that improper users ignoring return
+ * code altogether don't copy garbage; otherwise length of string
+ * is returned that can be used for bpf_perf_event_output() et al.
+ */
+ ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
+ if (unlikely(ret < 0))
+ memset(dst, 0, size);
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_probe_read_str_proto = {
+ .func = bpf_probe_read_str,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_read_str:
+ return &bpf_probe_read_str_proto;
default:
return NULL;
}
@@ -459,6 +491,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return false;
if (off % size != 0)
return false;
+ /*
+ * Assertion for 32 bit to make sure last 8 byte access
+ * (BPF_DW) to the last 4 byte member is disallowed.
+ */
+ if (off + size > sizeof(struct pt_regs))
+ return false;
+
return true;
}
@@ -467,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = {
.is_valid_access = kprobe_prog_is_valid_access,
};
-static struct bpf_prog_type_list kprobe_tl = {
+static struct bpf_prog_type_list kprobe_tl __ro_after_init = {
.ops = &kprobe_prog_ops,
.type = BPF_PROG_TYPE_KPROBE,
};
@@ -492,8 +531,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
@@ -540,6 +579,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return false;
if (off % size != 0)
return false;
+
+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
return true;
}
@@ -548,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = {
.is_valid_access = tp_prog_is_valid_access,
};
-static struct bpf_prog_type_list tracepoint_tl = {
+static struct bpf_prog_type_list tracepoint_tl __ro_after_init = {
.ops = &tracepoint_prog_ops,
.type = BPF_PROG_TYPE_TRACEPOINT,
};
@@ -572,28 +613,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
- data), dst_reg, src_reg,
+ data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
- *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
offsetof(struct perf_sample_data, period));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
- regs), dst_reg, src_reg,
+ regs), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, regs));
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
+ si->off);
break;
}
@@ -606,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = {
.convert_ctx_access = pe_prog_convert_ctx_access,
};
-static struct bpf_prog_type_list perf_event_tl = {
+static struct bpf_prog_type_list perf_event_tl __ro_after_init = {
.ops = &perf_event_prog_ops,
.type = BPF_PROG_TYPE_PERF_EVENT,
};
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5d33a7352919..aea6a1218c7d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -162,15 +162,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
}
EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
+/**
+ * trace_print_hex_seq - print buffer as hex sequence
+ * @p: trace seq struct to write to
+ * @buf: The buffer to print
+ * @buf_len: Length of @buf in bytes
+ * @concatenate: Print @buf as single hex string or with spacing
+ *
+ * Prints the passed buffer as a hex sequence either as a whole,
+ * single hex string if @concatenate is true or with spacing after
+ * each byte in case @concatenate is false.
+ */
const char *
-trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
+ bool concatenate)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; i < buf_len; i++)
- trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
-
+ trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ",
+ buf[i]);
trace_seq_putc(p, 0);
return ret;