diff options
Diffstat (limited to 'kernel')
37 files changed, 1618 insertions, 516 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 6e399bb69d7c..e794544f5e63 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -121,7 +121,7 @@ u32 audit_sig_sid = 0; 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ -static atomic_t audit_lost = ATOMIC_INIT(0); +static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; @@ -1058,6 +1058,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) return err; } + if (s.mask == AUDIT_STATUS_LOST) { + u32 lost = atomic_xchg(&audit_lost, 0); + + audit_log_config_change("lost", 0, lost, 1); + return lost; + } break; } case AUDIT_GET_FEATURE: @@ -1349,7 +1355,9 @@ static int __init audit_init(void) panic("audit: failed to start the kauditd thread (%d)\n", err); } - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, + "state=initialized audit_enabled=%u res=1", + audit_enabled); return 0; } diff --git a/kernel/audit.h b/kernel/audit.h index 960d49c9db5e..ca579880303a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,6 +199,9 @@ struct audit_context { struct { int argc; } execve; + struct { + char *name; + } module; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf1fa43512c1..d6a8de5f8fa3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1221,7 +1221,7 @@ static void show_special(struct audit_context *context, int *call_panic) context->ipc.perm_mode); } break; } - case AUDIT_MQ_OPEN: { + case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", @@ -1230,8 +1230,8 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); - break; } - case AUDIT_MQ_SENDRECV: { + break; + case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%ld abs_timeout_nsec=%ld", @@ -1240,12 +1240,12 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_sendrecv.msg_prio, context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); - break; } - case AUDIT_MQ_NOTIFY: { + break; + case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); - break; } + break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, @@ -1255,19 +1255,24 @@ static void show_special(struct audit_context *context, int *call_panic) attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } - case AUDIT_CAPSET: { + case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); - break; } - case AUDIT_MMAP: { + break; + case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); - break; } - case AUDIT_EXECVE: { + break; + case AUDIT_EXECVE: audit_log_execve_info(context, &ab); - break; } + break; + case AUDIT_KERN_MODULE: + audit_log_format(ab, "name="); + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + break; } audit_log_end(ab); } @@ -2368,6 +2373,15 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +void __audit_log_kern_module(char *name) +{ + struct audit_context *context = current->audit_context; + + context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + strcpy(context->module.name, name); + context->type = AUDIT_KERN_MODULE; +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; @@ -2411,7 +2425,7 @@ void audit_core_dumps(long signr) if (unlikely(!ab)) return; audit_log_task(ab); - audit_log_format(ab, " sig=%ld", signr); + audit_log_format(ab, " sig=%ld res=1", signr); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1ce4f4fd7fd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d55d95dcf49..6b6f41f0b211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list array_type __read_mostly = { +static struct bpf_map_type_list array_type __ro_after_init = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; @@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __read_mostly = { +static struct bpf_map_type_list percpu_array_type __ro_after_init = { .ops = &percpu_array_ops, .type = BPF_MAP_TYPE_PERCPU_ARRAY, }; @@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __read_mostly = { +static struct bpf_map_type_list prog_array_type __ro_after_init = { .ops = &prog_array_ops, .type = BPF_MAP_TYPE_PROG_ARRAY, }; @@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __read_mostly = { +static struct bpf_map_type_list perf_event_array_type __ro_after_init = { .ops = &perf_event_array_ops, .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; @@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; -static struct bpf_map_type_list cgroup_array_type __read_mostly = { +static struct bpf_map_type_list cgroup_array_type __ro_after_init = { .ops = &cgroup_array_ops, .type = BPF_MAP_TYPE_CGROUP_ARRAY, }; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 89b7ef41c86b..f62d1d56f41d 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct bpf_lru_node *node, *tmp_node, *first_node; + struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; - first_node = list_first_entry(inactive, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); @@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru, list_add(&node->list, local_pending_list(loc_l)); } -struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; @@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) return node; } -struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, - struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; @@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) bpf_common_lru_push_free(lru, node); } -void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } -void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { u32 i, pcpu_entries; int cpu; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 503d4211988a..f45827e205d3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -28,6 +28,9 @@ #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/frame.h> +#include <linux/rbtree_latch.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> #include <asm/unaligned.h> @@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->aux = aux; fp->aux->prog = fp; + INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +static __always_inline void +bpf_get_prog_addr_region(const struct bpf_prog *prog, + unsigned long *symbol_start, + unsigned long *symbol_end) +{ + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); + unsigned long addr = (unsigned long)hdr; + + WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); + + *symbol_start = addr; + *symbol_end = addr + hdr->pages * PAGE_SIZE; +} + +static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + BUILD_BUG_ON(sizeof("bpf_prog_") + + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + + sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); + sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + *sym = 0; +} + +static __always_inline unsigned long +bpf_get_prog_addr_start(struct latch_tree_node *n) +{ + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + return symbol_start; +} + +static __always_inline bool bpf_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +} + +static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + if (val < symbol_start) + return -1; + if (val >= symbol_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops bpf_tree_ops = { + .less = bpf_tree_less, + .comp = bpf_tree_comp, +}; + +static DEFINE_SPINLOCK(bpf_lock); +static LIST_HEAD(bpf_kallsyms); +static struct latch_tree_root bpf_tree __cacheline_aligned; + +int bpf_jit_kallsyms __read_mostly; + +static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +{ + WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); + list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +} + +static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +{ + if (list_empty(&aux->ksym_lnode)) + return; + + latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&aux->ksym_lnode); +} + +static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +{ + return fp->jited && !bpf_prog_was_classic(fp); +} + +static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym_lnode) || + fp->aux->ksym_lnode.prev == LIST_POISON2; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp) || + !capable(CAP_SYS_ADMIN)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_add(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_del(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +{ + struct latch_tree_node *n; + + if (!bpf_jit_kallsyms_enabled()) + return NULL; + + n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); + return n ? + container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + NULL; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog *prog; + char *ret = NULL; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (prog) { + bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + bpf_get_prog_name(prog, sym); + + ret = sym; + if (size) + *size = symbol_end - symbol_start; + if (off) + *off = addr - symbol_start; + } + rcu_read_unlock(); + + return ret; +} + +bool is_bpf_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = bpf_prog_kallsyms_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog_aux *aux; + unsigned int it = 0; + int ret = -ERANGE; + + if (!bpf_jit_kallsyms_enabled()) + return ret; + + rcu_read_lock(); + list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + if (it++ != symnum) + continue; + + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + bpf_get_prog_name(aux->prog, sym); + + *value = symbol_start; + *type = BPF_SYM_ELF_TYPE; + + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) module_memfree(hdr); } +/* This symbol is only overridden by archs that have different + * requirements than the usual eBPF JITs, f.e. when they only + * implement cBPF JIT, do not set images read-only, etc. + */ +void __weak bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + bpf_jit_binary_unlock_ro(hdr); + bpf_jit_binary_free(hdr); + + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} + int bpf_jit_harden __read_mostly; static int bpf_jit_blind_insn(const struct bpf_insn *from, @@ -1154,12 +1377,22 @@ const struct bpf_func_proto bpf_tail_call_proto = { .arg3_type = ARG_ANYTHING, }; -/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +/* Stub for JITs that only support cBPF. eBPF programs are interpreted. + * It is encouraged to implement bpf_int_jit_compile() instead, so that + * eBPF and implicitly also cBPF can get JITed! + */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } +/* Stub for JITs that support eBPF. All cBPF code gets transformed into + * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). + */ +void __weak bpf_jit_compile(struct bpf_prog *prog) +{ +} + bool __weak bpf_helper_changes_pkt_data(void *func) { return false; @@ -1173,3 +1406,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, { return -EFAULT; } + +/* All definitions of tracepoints related to BPF. */ +#define CREATE_TRACE_POINTS +#include <linux/bpf_trace.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a753bbe7df0a..3ea87fb19a94 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1023,7 +1023,7 @@ static const struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_type __read_mostly = { +static struct bpf_map_type_list htab_type __ro_after_init = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; @@ -1037,7 +1037,7 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __read_mostly = { +static struct bpf_map_type_list htab_lru_type __ro_after_init = { .ops = &htab_lru_ops, .type = BPF_MAP_TYPE_LRU_HASH, }; @@ -1124,7 +1124,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_percpu_type __ro_after_init = { .ops = &htab_percpu_ops, .type = BPF_MAP_TYPE_PERCPU_HASH, }; @@ -1138,7 +1138,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .ops = &htab_lru_percpu_ops, .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 045cbe673356..3d24e238221e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0b030c9126d3..fddcae801724 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,7 @@ #include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/bpf_trace.h> enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); + if ((trace_bpf_obj_pin_prog_enabled() || + trace_bpf_obj_pin_map_enabled()) && !ret) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_pin_prog(raw, ufd, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_pin_map(raw, ufd, pname); + } out: putname(pname); return ret; @@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname) else goto out; - if (ret < 0) + if (ret < 0) { bpf_any_put(raw, type); + } else if (trace_bpf_obj_get_prog_enabled() || + trace_bpf_obj_get_map_enabled()) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_get_prog(raw, ret, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_get_map(raw, ret, pname); + } out: putname(pname); return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..8bfe0afaee10 --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,521 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <net/ipv6.h> + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be places as root + * node, denoted as (R) in the example below. As there are no other node, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already + * a node with the same data and a smaller prefix (ie, a less specific one), + * node (2) will become a child of (1). In child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because (4) of the semantics laid out + * above (bit #17 is 0), it would normally be attached to (1) as child[0]. + * However, that slot is already allocated, so a new node is needed in between. + * That node does not have a value attached to it and it will never be + * returned to users as result of a lookup. It is only there to differentiate + * the traversal further. It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key. + */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. + */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. + */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. + */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +#define LPM_DATA_SIZE_MAX 256 +#define LPM_DATA_SIZE_MIN 1 + +#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ + sizeof(struct lpm_trie_node)) +#define LPM_VAL_SIZE_MIN 1 + +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + struct lpm_trie *trie; + u64 cost = sizeof(*trie), cost_per_node; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < LPM_KEY_SIZE_MIN || + attr->key_size > LPM_KEY_SIZE_MAX || + attr->value_size < LPM_VAL_SIZE_MIN || + attr->value_size > LPM_VAL_SIZE_MAX) + return ERR_PTR(-EINVAL); + + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost += (u64) attr->max_entries * cost_per_node; + if (cost >= U32_MAX - PAGE_SIZE) { + ret = -E2BIG; + goto out_err; + } + + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) + goto out_err; + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +out_err: + kfree(trie); + return ERR_PTR(ret); +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. + */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static const struct bpf_map_ops trie_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; + +static struct bpf_map_type_list trie_type __ro_after_init = { + .ops = &trie_ops, + .type = BPF_MAP_TYPE_LPM_TRIE, +}; + +static int __init register_trie_map(void) +{ + bpf_register_map_type(&trie_type); + return 0; +} +late_initcall(register_trie_map); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be8519148c25..22aa45cd0324 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = { .map_delete_elem = stack_map_delete_elem, }; -static struct bpf_map_type_list stack_map_type __read_mostly = { +static struct bpf_map_type_list stack_map_type __ro_after_init = { .ops = &stack_map_ops, .type = BPF_MAP_TYPE_STACK_TRACE, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bbb016adbaeb..461eb1e66a0f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -10,6 +10,7 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -241,6 +242,7 @@ static int map_create(union bpf_attr *attr) /* failed to allocate fd */ goto free_map; + trace_bpf_map_create(map, err); return err; free_map: @@ -365,6 +367,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; + trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -447,6 +450,8 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -492,6 +497,8 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_delete_elem(map, ufd, key); free_key: kfree(key); err_put: @@ -544,6 +551,7 @@ static int map_get_next_key(union bpf_attr *attr) if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; + trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -697,8 +705,11 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) + if (atomic_dec_and_test(&prog->aux->refcnt)) { + trace_bpf_prog_put_rcu(prog); + bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -807,7 +818,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - return __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get_type); @@ -889,6 +904,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + bpf_prog_kallsyms_add(prog); + trace_bpf_prog_load(prog, err); return err; free_used_maps: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdc43b899f28..d2bded2b250c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].max_value = BPF_REGISTER_MAX_RANGE; } +static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, + u32 regno) +{ + mark_reg_unknown_value(regs, regno); + reset_reg_range_values(regs, regno); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); return 0; } } @@ -627,7 +636,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; - if (off < 0 || off + size > map->value_size) { + if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -635,6 +644,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } +/* check read/write into an adjusted map element */ +static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, + int off, int size) +{ + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; + int err; + + /* We adjusted the register to this map value, so we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + return check_map_access(env, regno, reg->max_value + off, size); +} + #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -647,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; + /* fallthrough */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: @@ -775,47 +830,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - /* If we adjusted the register to this map value at all then we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. - */ - if (reg->type == PTR_TO_MAP_VALUE_ADJ) { - if (log_level) - print_verifier_state(state); - env->varlen_map_value_access = true; - /* The minimum value is only important with signed - * comparisons where we can't assume the floor of a - * value is 0. If we are using signed variables for our - * index'es we need to make sure that whatever we use - * will have a set floor within our range. - */ - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; - } - err = check_map_access(env, regno, reg->min_value + off, - size); - if (err) { - verbose("R%d min value is outside of the array range\n", - regno); - return err; - } - - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. - */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", - regno); - return -EACCES; - } - off += reg->max_value; - } - err = check_map_access(env, regno, off, size); + if (reg->type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno, off, size); + else + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = UNKNOWN_VALUE; @@ -827,7 +848,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; } @@ -860,7 +882,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -958,6 +981,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } +static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + + switch (regs[regno].type) { + case PTR_TO_PACKET: + return check_packet_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE: + return check_map_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE_ADJ: + return check_map_access_adj(env, regno, 0, access_size); + default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_stack_boundary(env, regno, access_size, + zero_size_allowed, meta); + } +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -993,10 +1035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_STACK; if (type != PTR_TO_PACKET && type != expected_type) goto err_type; - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { expected_type = CONST_IMM; - if (type != expected_type) + /* One exception. Allow UNKNOWN_VALUE registers when the + * boundaries are known and don't cause unsafe memory accesses + */ + if (type != UNKNOWN_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1006,8 +1051,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_STACK || - arg_type == ARG_PTR_TO_RAW_STACK) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -1015,9 +1060,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (type == CONST_IMM && reg->imm == 0) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != expected_type) + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; + meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -1063,9 +1109,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { - bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it @@ -1073,14 +1119,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + verbose("ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } - if (regs[regno - 1].type == PTR_TO_PACKET) - err = check_packet_access(env, regno - 1, 0, reg->imm); - else - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + + /* If the register is UNKNOWN_VALUE, the access check happens + * using its boundaries. Otherwise, just use its imm + */ + if (type == UNKNOWN_VALUE) { + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); + if (err) + return err; + } else { + /* register is CONST_IMM */ + err = check_helper_mem_access(env, regno - 1, reg->imm, + zero_size_allowed, meta); + } } return err; @@ -1154,15 +1236,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; - if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; return count > 1 ? -EINVAL : 0; @@ -1316,7 +1398,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env, imm = insn->imm; add_imm: - if (imm <= 0) { + if (imm < 0) { verbose("addition of negative constant to packet pointer is not allowed\n"); return -EACCES; } @@ -1485,22 +1567,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); + u64 dst_imm = dst_reg->imm; - /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' - * insn. Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here. Simulate execution of insns + * containing ALU ops. Don't care about overflow or negative + * values, just add/sub/... them; registers are in u64. */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm += insn->imm; - else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm += src_reg->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm |= insn->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm |= src_reg->imm; - else + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { + dst_imm += insn->imm; + } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm += src_reg->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { + dst_imm -= insn->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm -= src_reg->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { + dst_imm *= insn->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm *= src_reg->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { + dst_imm |= insn->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm |= src_reg->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { + dst_imm &= insn->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm &= src_reg->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm >>= insn->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm >>= src_reg->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm <<= insn->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm <<= src_reg->imm; + } else { mark_reg_unknown_value(regs, insn->dst_reg); + goto out; + } + + dst_reg->imm = dst_imm; +out: return 0; } @@ -1894,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. @@ -1904,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. @@ -1942,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* * If this is false, then the val is <= the register, if it is @@ -1953,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then constant < register, if it is true then * the register < constant. @@ -2144,14 +2262,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (insn->src_reg == 0) { - /* generic move 64-bit immediate into a register, - * only analyzer needs to collect the ld_imm value. - */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - if (!env->analyzer_ops) - return 0; - regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; return 0; @@ -2729,7 +2841,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; @@ -3085,10 +3196,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else @@ -3097,8 +3212,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 297756be369c..99127edc5204 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y -CONFIG_DEBUG_RODATA=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y diff --git a/kernel/exit.c b/kernel/exit.c index b67c57faa705..580da79e38ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -14,7 +14,6 @@ #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> -#include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> @@ -1390,7 +1389,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1410,20 +1409,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (!ret) return ret; - ret = security_task_wait(p); - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (wo->notask_error) - wo->notask_error = ret; - return 0; - } - if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case @@ -1516,7 +1501,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { diff --git a/kernel/extable.c b/kernel/extable.c index e1359474baa5..2676d7f8baf6 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,10 +17,12 @@ */ #include <linux/ftrace.h> #include <linux/memory.h> +#include <linux/extable.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/kprobes.h> +#include <linux/filter.h> #include <asm/sections.h> #include <linux/uaccess.h> @@ -107,6 +109,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) return 1; + if (is_bpf_text_address(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -130,6 +134,8 @@ int kernel_text_address(unsigned long addr) return 1; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) return 1; + if (is_bpf_text_address(addr)) + return 1; return 0; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fafd1a3ef0da..6a3b249a2ae1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/filter.h> #include <linux/compiler.h> #include <asm/sections.h> @@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } /* @@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { + const char *ret; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - /* See if it's in a module. */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); + /* See if it's in a module or a BPF JITed image. */ + ret = module_address_lookup(addr, symbolsize, offset, + modname, namebuf); + if (!ret) + ret = bpf_address_lookup(addr, symbolsize, + offset, modname, namebuf); + return ret; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -481,13 +490,27 @@ struct kallsym_iter { static int get_ksymbol_mod(struct kallsym_iter *iter) { - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_mod_end = iter->pos; return 0; + } + return 1; } +static int get_ksymbol_bpf(struct kallsym_iter *iter) +{ + iter->module_name[0] = '\0'; + iter->exported = 0; + return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; +} + /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { @@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; + if (new_pos == 0) + iter->pos_mod_end = 0; +} + +static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) +{ + iter->pos = pos; + + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) + return get_ksymbol_bpf(iter); + + if (!get_ksymbol_mod(iter)) + return get_ksymbol_bpf(iter); + + return 1; } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } + if (pos >= kallsyms_num_syms) + return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5617cc412444..bfe62d5b3872 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(regs); /* @@ -1399,7 +1399,7 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa((unsigned long)(char *)&vmcoreinfo_note); + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) diff --git a/kernel/kmod.c b/kernel/kmod.c index d45c96073afb..0c407f905ca4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -516,7 +516,7 @@ static void helper_unlock(void) * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -528,7 +528,12 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else sub_info->path = path; +#endif sub_info->argv = argv; sub_info->envp = envp; @@ -566,6 +571,15 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free @@ -613,7 +627,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec); * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ -int call_usermodehelper(char *path, char **argv, char **envp, int wait) +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ebb4dadca66b..699c5bc51a92 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1740,6 +1740,12 @@ void unregister_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(unregister_kprobes); +int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to be notified first */ diff --git a/kernel/module.c b/kernel/module.c index 3d8f126208e3..7eba6dea4f41 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -17,6 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/export.h> +#include <linux/extable.h> #include <linux/moduleloader.h> #include <linux/trace_events.h> #include <linux/init.h> @@ -61,6 +62,7 @@ #include <linux/pfn.h> #include <linux/bsearch.h> #include <linux/dynamic_debug.h> +#include <linux/audit.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -74,9 +76,9 @@ /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but - * only when CONFIG_DEBUG_SET_MODULE_RONX=y + * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) #else # define debug_align(X) (X) @@ -1844,7 +1846,7 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -2809,6 +2811,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) if (get_modinfo(info, "livepatch")) { mod->klp = true; add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", + mod->name); } return 0; @@ -3608,6 +3612,8 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + audit_log_kern_module(mod->name); + /* Reserve our place in the list. */ err = add_unformed_module(mod); if (err) @@ -3696,7 +3702,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->name, after_dashes); } - /* Link in to syfs. */ + /* Link in to sysfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto coming_cleanup; @@ -3719,6 +3725,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; + destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); @@ -4165,22 +4172,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (mod->num_exentries == 0) - continue; + mod = __module_address(addr); + if (!mod) + goto out; - e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, - addr); - if (e) - break; - } + if (!mod->num_exentries) + goto out; + + e = search_extable(mod->extable, + mod->extable + mod->num_exentries - 1, + addr); +out: preempt_enable(); - /* Now, if we found one, we are running inside it now, hence - we cannot unload the module, hence no refcnt needed. */ + /* + * Now, if we found one, we are running inside it now, hence + * we cannot unload the module, hence no refcnt needed. + */ return e; } diff --git a/kernel/panic.c b/kernel/panic.c index 08aa88dde7de..b95959733ce0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -188,7 +188,7 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -213,7 +213,7 @@ void panic(const char *fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); /* Call flush even twice. It tries harder with a single online CPU */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26dbc48c75b..86385af1080f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1156,7 +1156,7 @@ static int __init hibernate_setup(char *str) } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; - } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && !strncmp(str, "protect_image", 13)) { enable_restore_image_protection(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index 1dfa0da827d3..7fdc40d31b7d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX /* kernel/power/snapshot.c */ extern void enable_restore_image_protection(void); #else static inline void enable_restore_image_protection(void) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ #else /* !CONFIG_HIBERNATION */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2d8e2b227db8..905d5bbd595f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,7 +38,7 @@ #include "power.h" -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -73,7 +73,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index bdff5ed57f10..5db217051232 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -166,7 +166,7 @@ static int __init setup_test_suspend(char *value) return 0; } - for (i = 0; pm_labels[i]; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 32e0c232efba..f80fd33639e0 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -201,7 +201,7 @@ void free_all_swap_pages(int swap) struct swsusp_extent *ext; unsigned long offset; - ext = container_of(node, struct swsusp_extent, node); + ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) swap_free(swp_entry(swap, offset)); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index abb0042a427b..4a2ffc39eb95 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,3 @@ obj-y = printk.o -obj-$(CONFIG_PRINTK_NMI) += nmi.o +obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 7fd2838fa417..1db044f808b7 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,42 +16,55 @@ */ #include <linux/percpu.h> -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK -int __printf(1, 0) vprintk_default(const char *fmt, va_list args); - -#ifdef CONFIG_PRINTK_NMI +#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; +__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + +#define printk_safe_enter_irqsave(flags) \ + do { \ + local_irq_save(flags); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irqrestore(flags) \ + do { \ + __printk_safe_exit(); \ + local_irq_restore(flags); \ + } while (0) + +#define printk_safe_enter_irq() \ + do { \ + local_irq_disable(); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irq() \ + do { \ + __printk_safe_exit(); \ + local_irq_enable(); \ + } while (0) + +#else + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } + /* - * printk() could not take logbuf_lock in NMI context. Instead, - * it temporary stores the strings into a per-CPU buffer. - * The alternative implementation is chosen transparently - * via per-CPU variable. + * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem + * semaphore and some of console functions (console_unlock()/etc.), so + * printk-safe must preserve the existing local IRQ guarantees. */ -DECLARE_PER_CPU(printk_func_t, printk_func); -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return this_cpu_read(printk_func)(fmt, args); -} - -extern atomic_t nmi_message_lost; -static inline int get_nmi_message_lost(void) -{ - return atomic_xchg(&nmi_message_lost, 0); -} - -#else /* CONFIG_PRINTK_NMI */ - -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return vprintk_default(fmt, args); -} - -static inline int get_nmi_message_lost(void) -{ - return 0; -} - -#endif /* CONFIG_PRINTK_NMI */ +#define printk_safe_enter_irqsave(flags) local_irq_save(flags) +#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) + +#define printk_safe_enter_irq() local_irq_disable() +#define printk_safe_exit_irq() local_irq_enable() + +#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4ba3d34938c0..34da86e73d00 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -213,17 +213,36 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { - if (down_trylock(&console_sem)) + int lock_failed; + unsigned long flags; + + /* + * Here and in __up_console_sem() we need to be in safe mode, + * because spindump/WARN/etc from under console ->lock will + * deadlock in printk()->down_trylock_console_sem() otherwise. + */ + printk_safe_enter_irqsave(flags); + lock_failed = down_trylock(&console_sem); + printk_safe_exit_irqrestore(flags); + + if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) -#define up_console_sem() do { \ - mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ - up(&console_sem);\ -} while (0) +static void __up_console_sem(unsigned long ip) +{ + unsigned long flags; + + mutex_release(&console_lock_dep_map, 1, ip); + + printk_safe_enter_irqsave(flags); + up(&console_sem); + printk_safe_exit_irqrestore(flags); +} +#define up_console_sem() __up_console_sem(_RET_IP_) /* * This is used for debugging the mess that is the VT code by @@ -351,6 +370,34 @@ __packed __aligned(4) */ DEFINE_RAW_SPINLOCK(logbuf_lock); +/* + * Helper macros to lock/unlock logbuf_lock and switch between + * printk-safe/unsafe modes. + */ +#define logbuf_lock_irq() \ + do { \ + printk_safe_enter_irq(); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irq() \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irq(); \ + } while (0) + +#define logbuf_lock_irqsave(flags) \ + do { \ + printk_safe_enter_irqsave(flags); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irqrestore(flags) \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irqrestore(flags); \ + } while (0) + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -782,20 +829,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock_irq(&logbuf_lock); + + logbuf_lock_irq(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); } if (user->seq < log_first_seq) { @@ -803,7 +851,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } @@ -816,7 +864,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (len > count) { ret = -EINVAL; @@ -843,7 +891,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); switch (whence) { case SEEK_SET: /* the first record */ @@ -867,7 +915,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -881,7 +929,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -889,7 +937,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) else ret = POLLIN|POLLRDNORM; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -919,10 +967,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); file->private_data = user; return 0; @@ -1064,13 +1112,13 @@ void __init setup_log_buf(int early) return; } - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN - log_next_idx; memcpy(log_buf, __log_buf, __LOG_BUF_LEN); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -1248,7 +1296,7 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1256,7 +1304,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial = 0; } if (syslog_seq == log_next_seq) { - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; } @@ -1275,7 +1323,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (!n) break; @@ -1304,7 +1352,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (buf) { u64 next_seq; u64 seq; @@ -1352,12 +1400,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (seq < log_first_seq) { /* messages are gone, move to next one */ @@ -1371,7 +1419,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); kfree(text); return len; @@ -1458,7 +1506,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1486,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -1510,8 +1558,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { struct console *con; @@ -1538,28 +1585,6 @@ static void call_console_drivers(int level, } } -/* - * Zap console related locks when oopsing. - * To leave time for slow consoles to print a full oops, - * only zap at most once every 30 seconds. - */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - debug_locks_off(); - /* If a crash is occurring, make sure we can't deadlock */ - raw_spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - sema_init(&console_sem, 1); -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1669,18 +1694,13 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; - int this_cpu; int printed_len = 0; - int nmi_message_lost; bool in_sched = false; - /* cpu currently holding logbuf_lock in this function */ - static unsigned int logbuf_cpu = UINT_MAX; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1690,53 +1710,8 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - local_irq_save(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! - */ - if (unlikely(logbuf_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = true; - local_irq_restore(flags); - return 0; - } - zap_locks(); - } - - lockdep_off(); /* This stops the holder of console_sem just where we want him */ - raw_spin_lock(&logbuf_lock); - logbuf_cpu = this_cpu; - - if (unlikely(recursion_bug)) { - static const char recursion_msg[] = - "BUG: recent printk recursion!"; - - recursion_bug = false; - /* emit KERN_CRIT message */ - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, - strlen(recursion_msg)); - } - - nmi_message_lost = get_nmi_message_lost(); - if (unlikely(nmi_message_lost)) { - text_len = scnprintf(textbuf, sizeof(textbuf), - "BAD LUCK: lost %d message(s) from NMI context!", - nmi_message_lost); - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, textbuf, text_len); - } - + logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1779,14 +1754,10 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); + logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { - lockdep_off(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1794,7 +1765,6 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock()) console_unlock(); - lockdep_on(); } return printed_len; @@ -1803,7 +1773,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_func(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1895,16 +1865,12 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } -/* Still needs to be defined for users */ -DEFINE_PER_CPU(printk_func_t, printk_func); - #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK @@ -2220,9 +2186,9 @@ again: struct printk_log *msg; size_t ext_len = 0; size_t len; - int level; - raw_spin_lock_irqsave(&logbuf_lock, flags); + printk_safe_enter_irqsave(flags); + raw_spin_lock(&logbuf_lock); if (seen_seq != log_next_seq) { wake_klogd = true; seen_seq = log_next_seq; @@ -2243,8 +2209,7 @@ skip: break; msg = log_from_idx(console_idx); - level = msg->level; - if (suppress_message_printing(level)) { + if (suppress_message_printing(msg->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and @@ -2270,9 +2235,9 @@ skip: raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, ext_text, ext_len, text, len); + call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - local_irq_restore(flags); + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); @@ -2295,7 +2260,8 @@ skip: */ raw_spin_lock(&logbuf_lock); retry = console_seq != log_next_seq; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock(&logbuf_lock); + printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) goto again; @@ -2558,10 +2524,10 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -2860,12 +2826,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* initialize iterator with data about the stored records */ dumper->active = true; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; dumper->next_idx = log_next_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); @@ -2950,9 +2916,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, unsigned long flags; bool ret; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); return ret; } @@ -2991,7 +2957,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; @@ -3000,7 +2966,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* last entry */ if (dumper->cur_seq >= dumper->next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); goto out; } @@ -3042,7 +3008,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, dumper->next_seq = next_seq; dumper->next_idx = next_idx; ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); out: if (len) *len = l; @@ -3080,9 +3046,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) { unsigned long flags; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); kmsg_dump_rewind_nolock(dumper); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/printk/nmi.c b/kernel/printk/printk_safe.c index f011aaef583c..033e50a7d706 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/printk_safe.c @@ -1,5 +1,5 @@ /* - * nmi.c - Safe printk in NMI context + * printk_safe.c - Safe printk for printk-deadlock-prone contexts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,36 +32,58 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * via @printk_func per-CPU variable. + * by examinig current printk() context mask stored in @printk_context + * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; -static int printk_nmi_irq_ready; -atomic_t nmi_message_lost; +static int printk_safe_irq_ready; -#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - sizeof(struct irq_work)) +#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - \ + sizeof(atomic_t) - \ + sizeof(struct irq_work)) -struct nmi_seq_buf { +struct printk_safe_seq_buf { atomic_t len; /* length of written data */ + atomic_t message_lost; struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[NMI_LOG_BUF_LEN]; + unsigned char buffer[SAFE_LOG_BUF_LEN]; }; -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); +static DEFINE_PER_CPU(int, printk_context); + +#ifdef CONFIG_PRINTK_NMI +static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +#endif + +/* Get flushed in a more safe context. */ +static void queue_flush_work(struct printk_safe_seq_buf *s) +{ + if (printk_safe_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } +} /* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. + * Add a message to per-CPU context-dependent buffer. NMI and printk-safe + * have dedicated buffers, because otherwise printk-safe preempted by + * NMI-printk would have overwritten the NMI messages. + * + * The messages are fushed from irq work (or from panic()), possibly, + * from other CPU, concurrently with printk_safe_log_store(). Should this + * happen, printk_safe_log_store() will notice the buffer->len mismatch + * and repeat the write. */ -static int vprintk_nmi(const char *fmt, va_list args) +static int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - int add = 0; + int add; size_t len; again: @@ -69,18 +91,21 @@ again: /* The trailing '\0' is not counted into len. */ if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&nmi_message_lost); + atomic_inc(&s->message_lost); + queue_flush_work(s); return 0; } /* - * Make sure that all old data have been read before the buffer was - * reseted. This is not needed when we just append data. + * Make sure that all old data have been read before the buffer + * was reset. This is not needed when we just append data. */ if (!len) smp_rmb(); add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + if (!add) + return 0; /* * Do it once again if the buffer has been flushed in the meantime. @@ -90,32 +115,23 @@ again: if (atomic_cmpxchg(&s->len, len, len + add) != len) goto again; - /* Get flushed in a more safe context. */ - if (add && printk_nmi_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); - irq_work_queue(&s->work); - } - + queue_flush_work(s); return add; } -static void printk_nmi_flush_line(const char *text, int len) +static inline void printk_safe_flush_line(const char *text, int len) { /* - * The buffers are flushed in NMI only on panic. The messages must - * go only into the ring buffer at this stage. Consoles will get - * explicitly called later when a crashdump is not generated. + * Avoid any console drivers calls from here, because we may be + * in NMI or printk_safe context (when in panic). The messages + * must go only into the ring buffer at this stage. Consoles will + * get explicitly called later when a crashdump is not generated. */ - if (in_nmi()) - printk_deferred("%.*s", len, text); - else - printk("%.*s", len, text); - + printk_deferred("%.*s", len, text); } /* printk part of the temporary buffer line by line */ -static int printk_nmi_flush_buffer(const char *start, size_t len) +static int printk_safe_flush_buffer(const char *start, size_t len) { const char *c, *end; bool header; @@ -127,7 +143,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) /* Print line by line. */ while (c < end) { if (*c == '\n') { - printk_nmi_flush_line(start, c - start + 1); + printk_safe_flush_line(start, c - start + 1); start = ++c; header = true; continue; @@ -140,7 +156,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) continue; } - printk_nmi_flush_line(start, c - start); + printk_safe_flush_line(start, c - start); start = c++; header = true; continue; @@ -154,22 +170,31 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) if (start < end && !header) { static const char newline[] = KERN_CONT "\n"; - printk_nmi_flush_line(start, end - start); - printk_nmi_flush_line(newline, strlen(newline)); + printk_safe_flush_line(start, end - start); + printk_safe_flush_line(newline, strlen(newline)); } return len; } +static void report_message_lost(struct printk_safe_seq_buf *s) +{ + int lost = atomic_xchg(&s->message_lost, 0); + + if (lost) + printk_deferred("Lost %d message(s)!\n", lost); +} + /* - * Flush data from the associated per_CPU buffer. The function + * Flush data from the associated per-CPU buffer. The function * can be called either via IRQ work or independently. */ -static void __printk_nmi_flush(struct irq_work *work) +static void __printk_safe_flush(struct irq_work *work) { static raw_spinlock_t read_lock = __RAW_SPIN_LOCK_INITIALIZER(read_lock); - struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + struct printk_safe_seq_buf *s = + container_of(work, struct printk_safe_seq_buf, work); unsigned long flags; size_t len; int i; @@ -194,9 +219,9 @@ more: * buffer size. */ if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_nmi_flush: internal error\n"; + const char *msg = "printk_safe_flush: internal error\n"; - printk_nmi_flush_line(msg, strlen(msg)); + printk_safe_flush_line(msg, strlen(msg)); len = 0; } @@ -205,7 +230,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - i += printk_nmi_flush_buffer(s->buffer + i, len - i); + i += printk_safe_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate @@ -217,35 +242,40 @@ more: goto more; out: + report_message_lost(s); raw_spin_unlock_irqrestore(&read_lock, flags); } /** - * printk_nmi_flush - flush all per-cpu nmi buffers. + * printk_safe_flush - flush all per-cpu nmi buffers. * * The buffers are flushed automatically via IRQ work. This function * is useful only when someone wants to be sure that all buffers have * been flushed at some point. */ -void printk_nmi_flush(void) +void printk_safe_flush(void) { int cpu; - for_each_possible_cpu(cpu) - __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); + for_each_possible_cpu(cpu) { +#ifdef CONFIG_PRINTK_NMI + __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +#endif + __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); + } } /** - * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system * goes down. * - * Similar to printk_nmi_flush() but it can be called even in NMI context when + * Similar to printk_safe_flush() but it can be called even in NMI context when * the system goes down. It does the best effort to get NMI messages into * the main ring buffer. * * Note that it could try harder when there is only one CPU online. */ -void printk_nmi_flush_on_panic(void) +void printk_safe_flush_on_panic(void) { /* * Make sure that we could access the main ring buffer. @@ -259,33 +289,97 @@ void printk_nmi_flush_on_panic(void) raw_spin_lock_init(&logbuf_lock); } - printk_nmi_flush(); + printk_safe_flush(); } -void __init printk_nmi_init(void) +#ifdef CONFIG_PRINTK_NMI +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) { - int cpu; + struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - for_each_possible_cpu(cpu) { - struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + return printk_safe_log_store(s, fmt, args); +} - init_irq_work(&s->work, __printk_nmi_flush); - } +void printk_nmi_enter(void) +{ + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); +} - /* Make sure that IRQ works are initialized before enabling. */ - smp_wmb(); - printk_nmi_irq_ready = 1; +void printk_nmi_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_nmi_flush(); +#else + +static int vprintk_nmi(const char *fmt, va_list args) +{ + return 0; } -void printk_nmi_enter(void) +#endif /* CONFIG_PRINTK_NMI */ + +/* + * Lock-less printk(), to avoid deadlocks should the printk() recurse + * into itself. It uses a per-CPU buffer to store the message, just like + * NMI. + */ +static int vprintk_safe(const char *fmt, va_list args) { - this_cpu_write(printk_func, vprintk_nmi); + struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); + + return printk_safe_log_store(s, fmt, args); } -void printk_nmi_exit(void) +/* Can be preempted by NMI. */ +void __printk_safe_enter(void) +{ + this_cpu_inc(printk_context); +} + +/* Can be preempted by NMI. */ +void __printk_safe_exit(void) { - this_cpu_write(printk_func, vprintk_default); + this_cpu_dec(printk_context); +} + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + return vprintk_nmi(fmt, args); + + if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) + return vprintk_safe(fmt, args); + + return vprintk_default(fmt, args); +} + +void __init printk_safe_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct printk_safe_seq_buf *s; + + s = &per_cpu(safe_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); + +#ifdef CONFIG_PRINTK_NMI + s = &per_cpu(nmi_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); +#endif + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_safe_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_safe_flush(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34e2291a9a6c..e1ae6ac15eac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -23,6 +23,9 @@ #include <asm/switch_to.h> #include <asm/tlb.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #include "sched.h" #include "../workqueue_internal.h" diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f7ce79a46050..e15185c28de5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -16,6 +16,7 @@ #include <linux/atomic.h> #include <linux/audit.h> #include <linux/compat.h> +#include <linux/coredump.h> #include <linux/sched.h> #include <linux/seccomp.h> #include <linux/slab.h> @@ -486,6 +487,17 @@ void put_seccomp_filter(struct task_struct *tsk) } } +static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGSYS; + info->si_code = SYS_SECCOMP; + info->si_call_addr = (void __user *)KSTK_EIP(current); + info->si_errno = reason; + info->si_arch = syscall_get_arch(); + info->si_syscall = syscall; +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -496,13 +508,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_send_sigsys(int syscall, int reason) { struct siginfo info; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSYS; - info.si_code = SYS_SECCOMP; - info.si_call_addr = (void __user *)KSTK_EIP(current); - info.si_errno = reason; - info.si_arch = syscall_get_arch(); - info.si_syscall = syscall; + seccomp_init_siginfo(&info, syscall, reason); force_sig_info(SIGSYS, &info, current); } #endif /* CONFIG_SECCOMP_FILTER */ @@ -634,10 +640,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL: - default: + default: { + siginfo_t info; audit_seccomp(this_syscall, SIGSYS, action); + /* Dump core only if this is the last remaining thread. */ + if (get_nr_threads(current) == 1) { + /* Show the original registers in the dump. */ + syscall_rollback(current, task_pt_regs(current)); + /* Trigger a manual coredump since do_exit skips it. */ + seccomp_init_siginfo(&info, this_syscall, data); + do_coredump(&info); + } do_exit(SIGSYS); } + } unreachable(); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95cecbf67f5c..b2058a7f94bd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -28,6 +28,8 @@ #include <linux/uaccess.h> #include <linux/list.h> +#include "../../block/blk.h" + #include <trace/events/block.h> #include "trace_output.h" @@ -292,9 +294,6 @@ record_it: local_irq_restore(flags); } -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); @@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - mutex_lock(&blk_tree_mutex); - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) { - mutex_unlock(&blk_tree_mutex); - goto err; - } - } - mutex_unlock(&blk_tree_mutex); - - dir = debugfs_create_dir(buts->name, blk_tree_root); + if (!blk_debugfs_root) + goto err; + dir = debugfs_lookup(buts->name, blk_debugfs_root); + if (!dir) + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; - bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + ret = 0; err: - blk_trace_free(bt); + if (dir && !bt->dir) + dput(dir); + if (ret) + blk_trace_free(bt); return ret; } @@ -712,15 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); - } else { + else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); - } + + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } static void blk_add_trace_rq_abort(void *ignore, @@ -972,11 +965,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); - else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1752,31 +1741,6 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_dump_cmd(char *buf, struct request *rq) -{ - int i, end; - int len = rq->cmd_len; - unsigned char *cmd = rq->cmd; - - if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { - buf[0] = '\0'; - return; - } - - for (end = len - 1; end >= 0; end--) - if (cmd[end]) - break; - end++; - - for (i = 0; i < len; i++) { - buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); - if (i == end && end != len - 1) { - sprintf(buf, " .."); - break; - } - } -} - void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..cee9802cf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto *bpf_get_probe_write_proto(void) @@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_trace_printk_proto(void) @@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } @@ -459,6 +491,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return false; if (off % size != 0) return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. + */ + if (off + size > sizeof(struct pt_regs)) + return false; + return true; } @@ -467,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = { .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl = { +static struct bpf_prog_type_list kprobe_tl __ro_after_init = { .ops = &kprobe_prog_ops, .type = BPF_PROG_TYPE_KPROBE, }; @@ -492,8 +531,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, @@ -540,6 +579,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (off % size != 0) return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } @@ -548,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = { .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl = { +static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { .ops = &tracepoint_prog_ops, .type = BPF_PROG_TYPE_TRACEPOINT, }; @@ -572,28 +613,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - data), dst_reg, src_reg, + data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); - *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, offsetof(struct perf_sample_data, period)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - regs), dst_reg, src_reg, + regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); break; } @@ -606,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = { .convert_ctx_access = pe_prog_convert_ctx_access, }; -static struct bpf_prog_type_list perf_event_tl = { +static struct bpf_prog_type_list perf_event_tl __ro_after_init = { .ops = &perf_event_prog_ops, .type = BPF_PROG_TYPE_PERF_EVENT, }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..aea6a1218c7d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -162,15 +162,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, } EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. + */ const char * -trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool concatenate) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); - + trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", + buf[i]); trace_seq_putc(p, 0); return ret; |