33 files changed, 1421 insertions, 483 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..b1f66480135b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,34 +49,64 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+static int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
-	struct bpf_array *array;
-	u64 array_size;
-	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size == 0 ||
 	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		/* if value_size is bigger, the user space won't be able to
 		 * access the elements.
 		 */
-		return ERR_PTR(-E2BIG);
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
+	u32 elem_size, index_mask, max_entries;
+	bool unpriv = !capable(CAP_SYS_ADMIN);
+	struct bpf_array *array;
+	u64 array_size, mask64;
 
 	elem_size = round_up(attr->value_size, 8);
 
+	max_entries = attr->max_entries;
+
+	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
+	 * upper most bit set in u32 space is undefined behavior due to
+	 * resulting 1U << 32, so do it manually here in u64 space.
+	 */
+	mask64 = fls_long(max_entries - 1);
+	mask64 = 1ULL << mask64;
+	mask64 -= 1;
+
+	index_mask = mask64;
+	if (unpriv) {
+		/* round up array size to nearest power of 2,
+		 * since cpu will speculate within index_mask limits
+		 */
+		max_entries = index_mask + 1;
+		/* Check for overflows. */
+		if (max_entries < attr->max_entries)
+			return ERR_PTR(-E2BIG);
+	}
+
 	array_size = sizeof(*array);
 	if (percpu)
-		array_size += (u64) attr->max_entries * sizeof(void *);
+		array_size += (u64) max_entries * sizeof(void *);
 	else
-		array_size += (u64) attr->max_entries * elem_size;
+		array_size += (u64) max_entries * elem_size;
 
 	/* make sure there is no u32 overflow later in round_up() */
 	if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,14 +116,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array = bpf_map_area_alloc(array_size, numa_node);
 	if (!array)
 		return ERR_PTR(-ENOMEM);
+	array->index_mask = index_mask;
+	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
-	array->map.map_type = attr->map_type;
-	array->map.key_size = attr->key_size;
-	array->map.value_size = attr->value_size;
-	array->map.max_entries = attr->max_entries;
-	array->map.map_flags = attr->map_flags;
-	array->map.numa_node = numa_node;
+	bpf_map_init_from_attr(&array->map, attr);
 	array->elem_size = elem_size;
 
 	if (!percpu)
@@ -121,12 +148,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
 	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
-	return array->value + array->elem_size * index;
+	return array->value + array->elem_size * (index & array->index_mask);
 }
 
 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
 static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_insn *insn = insn_buf;
 	u32 elem_size = round_up(map->value_size, 8);
 	const int ret = BPF_REG_0;
@@ -135,7 +163,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
 	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+	if (map->unpriv_array) {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+	} else {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+	}
 
 	if (is_power_of_2(elem_size)) {
 		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +190,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
 	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
-	return this_cpu_ptr(array->pptrs[index]);
+	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
 }
 
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +210,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
 	 */
 	size = round_up(map->value_size, 8);
 	rcu_read_lock();
-	pptr = array->pptrs[index];
+	pptr = array->pptrs[index & array->index_mask];
 	for_each_possible_cpu(cpu) {
 		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
 		off += size;
@@ -225,10 +258,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		return -EEXIST;
 
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
-		memcpy(this_cpu_ptr(array->pptrs[index]),
+		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value + array->elem_size * index,
+		memcpy(array->value +
+		       array->elem_size * (index & array->index_mask),
 		       value, map->value_size);
 	return 0;
 }
@@ -262,7 +296,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 	 */
 	size = round_up(map->value_size, 8);
 	rcu_read_lock();
-	pptr = array->pptrs[index];
+	pptr = array->pptrs[index & array->index_mask];
 	for_each_possible_cpu(cpu) {
 		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
 		off += size;
@@ -296,6 +330,7 @@ static void array_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -306,6 +341,7 @@ const struct bpf_map_ops array_map_ops = {
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -314,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
 {
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return array_map_alloc(attr);
+		return -EINVAL;
+	return array_map_alloc_check(attr);
 }
 
 static void fd_array_map_free(struct bpf_map *map)
@@ -443,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
 }
 
 const struct bpf_map_ops prog_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -530,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 }
 
 const struct bpf_map_ops perf_event_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops cgroup_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -579,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
 	if (IS_ERR(inner_map_meta))
 		return inner_map_meta;
 
-	map = fd_array_map_alloc(attr);
+	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		bpf_map_meta_free(inner_map_meta);
 		return map;
@@ -613,6 +652,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
 static u32 array_of_map_gen_lookup(struct bpf_map *map,
 				   struct bpf_insn *insn_buf)
 {
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 elem_size = round_up(map->value_size, 8);
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
@@ -621,7 +661,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
 	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+	if (map->unpriv_array) {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+	} else {
+		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+	}
 	if (is_power_of_2(elem_size))
 		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
 	else
@@ -636,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 }
 
 const struct bpf_map_ops array_of_maps_map_ops = {
+	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
 	.map_free = array_of_map_free,
 	.map_get_next_key = array_map_get_next_key,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 70a534549cd3..3aa0658add76 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden   __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
 			 unsigned long *symbol_start,
@@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;
 
-int bpf_jit_kallsyms __read_mostly;
-
 static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
 {
 	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }
 
-int bpf_jit_harden __read_mostly;
-
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff)
@@ -781,6 +782,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  *	__bpf_prog_run - run eBPF program on a given context
  *	@ctx: is the data we are operating on
@@ -969,7 +971,7 @@ select_insn:
 		DST = tmp;
 		CONT;
 	ALU_MOD_X:
-		if (unlikely(SRC == 0))
+		if (unlikely((u32)SRC == 0))
 			return 0;
 		tmp = (u32) DST;
 		DST = do_div(tmp, (u32) SRC);
@@ -988,7 +990,7 @@ select_insn:
 		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
-		if (unlikely(SRC == 0))
+		if (unlikely((u32)SRC == 0))
 			return 0;
 		tmp = (u32) DST;
 		do_div(tmp, (u32) SRC);
@@ -1377,6 +1379,18 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
 	insn->code = BPF_JMP | BPF_CALL_ARGS;
 }
 
+#else
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+					 const struct bpf_insn *insn)
+{
+	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+	 * is not working properly, so warn about it!
+	 */
+	WARN_ON_ONCE(1);
+	return 0;
+}
+#endif
+
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
@@ -1427,9 +1441,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
 
 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+	fp->bpf_func = __bpf_prog_ret0_warn;
+#endif
 
 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
@@ -1439,6 +1457,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 */
 	if (!bpf_prog_is_dev_bound(fp->aux)) {
 		fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+		if (!fp->jited) {
+			*err = -ENOTSUPP;
+			return fp;
+		}
+#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	if (!cmap)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	cmap->map.map_type = attr->map_type;
-	cmap->map.key_size = attr->key_size;
-	cmap->map.value_size = attr->value_size;
-	cmap->map.max_entries = attr->max_entries;
-	cmap->map.map_flags = attr->map_flags;
-	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&cmap->map, attr);
 
 	/* Pre-limit array size based on NR_CPUS, not final CPU check */
 	if (cmap->map.max_entries > NR_CPUS) {
@@ -143,7 +137,7 @@ free_cmap:
 	return ERR_PTR(err);
 }
 
-void __cpu_map_queue_destructor(void *ptr)
+static void __cpu_map_queue_destructor(void *ptr)
 {
 	/* The tear-down procedure should have made sure that queue is
 	 * empty.  See __cpu_map_entry_replace() and work-queue
@@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	return xdp_pkt;
 }
 
-struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-				  struct xdp_pkt *xdp_pkt)
+static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+					 struct xdp_pkt *xdp_pkt)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
-struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
+						       int map_id)
 {
 	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
@@ -395,7 +390,7 @@ free_rcu:
 	return NULL;
 }
 
-void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_cpu_map_entry *rcpu;
 	int cpu;
@@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu)
  * cpu_map_kthread_stop, which waits for an RCU graze period before
  * stopping kthread, emptying the queue.
  */
-void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
-			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
 {
 	struct bpf_cpu_map_entry *old_rcpu;
 
@@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 	}
 }
 
-int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static int cpu_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	u32 key_cpu = *(u32 *)key;
@@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpu_map_entry *rcpu;
@@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
-void cpu_map_free(struct bpf_map *map)
+static void cpu_map_free(struct bpf_map *map)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (!dtab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	dtab->map.map_type = attr->map_type;
-	dtab->map.key_size = attr->key_size;
-	dtab->map.value_size = attr->value_size;
-	dtab->map.max_entries = attr->max_entries;
-	dtab->map.map_flags = attr->map_flags;
-	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&dtab->map, attr);
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e0857d016f89..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8];
 
 const char *func_id_name(int id);
 
-typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
-				 const char *, ...);
+typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+						const char *, ...);
 typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
 					      const struct bpf_insn *insn);
 typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
 }
 
 /* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static int htab_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	int numa_node = bpf_map_attr_numa_node(attr);
-	struct bpf_htab *htab;
-	int err, i;
-	u64 cost;
 
 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
@@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		/* LRU implementation is much complicated than other
 		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
 		 */
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 
 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (!lru && percpu_lru)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (lru && !prealloc)
-		return ERR_PTR(-ENOTSUPP);
+		return -ENOTSUPP;
 
 	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	if (attr->max_entries == 0 || attr->key_size == 0 ||
+	    attr->value_size == 0)
+		return -EINVAL;
+
+	if (attr->key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		return -E2BIG;
+
+	if (attr->value_size >= KMALLOC_MAX_SIZE -
+	    MAX_BPF_STACK - sizeof(struct htab_elem))
+		/* if value_size is bigger, the user space won't be able to
+		 * access the elements via bpf syscall. This check also makes
+		 * sure that the elem_size doesn't overflow and it's
+		 * kmalloc-able later in htab_map_update_elem()
+		 */
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	/* percpu_lru means each cpu has its own LRU list.
+	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+	 * the map's value itself is percpu.  percpu_lru has
+	 * nothing to do with the map's value.
+	 */
+	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	struct bpf_htab *htab;
+	int err, i;
+	u64 cost;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	htab->map.map_type = attr->map_type;
-	htab->map.key_size = attr->key_size;
-	htab->map.value_size = attr->value_size;
-	htab->map.max_entries = attr->max_entries;
-	htab->map.map_flags = attr->map_flags;
-	htab->map.numa_node = numa_node;
-
-	/* check sanity of attributes.
-	 * value_size == 0 may be allowed in the future to use map as a set
-	 */
-	err = -EINVAL;
-	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
-	    htab->map.value_size == 0)
-		goto free_htab;
+	bpf_map_init_from_attr(&htab->map, attr);
 
 	if (percpu_lru) {
 		/* ensure each CPU's lru list has >=1 elements.
@@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	/* hash table size must be power of 2 */
 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 
-	err = -E2BIG;
-	if (htab->map.key_size > MAX_BPF_STACK)
-		/* eBPF programs initialize keys on stack, so they cannot be
-		 * larger than max stack size
-		 */
-		goto free_htab;
-
-	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
-	    MAX_BPF_STACK - sizeof(struct htab_elem))
-		/* if value_size is bigger, the user space won't be able to
-		 * access the elements via bpf syscall. This check also makes
-		 * sure that the elem_size doesn't overflow and it's
-		 * kmalloc-able later in htab_map_update_elem()
-		 */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
@@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 		htab->elem_size += round_up(htab->map.value_size, 8);
 
+	err = -E2BIG;
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops htab_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = {
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 }
 
 const struct bpf_map_ops htab_percpu_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 };
 
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 };
 
-static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return htab_map_alloc(attr);
+		return -EINVAL;
+	return htab_map_alloc_check(attr);
 }
 
 static void fd_htab_map_free(struct bpf_map *map)
@@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
 	if (IS_ERR(inner_map_meta))
 		return inner_map_meta;
 
-	map = fd_htab_map_alloc(attr);
+	map = htab_map_alloc(attr);
 	if (IS_ERR(map)) {
 		bpf_map_meta_free(inner_map_meta);
 		return map;
@@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops htab_of_maps_map_ops = {
+	.map_alloc_check = fd_htab_map_alloc_check,
 	.map_alloc = htab_of_map_alloc,
 	.map_free = htab_of_map_free,
 	.map_get_next_key = htab_map_get_next_key,
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..d7ea96218516 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	/* copy mandatory map attributes */
-	trie->map.map_type = attr->map_type;
-	trie->map.key_size = attr->key_size;
-	trie->map.value_size = attr->value_size;
-	trie->map.max_entries = attr->max_entries;
-	trie->map.map_flags = attr->map_flags;
-	trie->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&trie->map, attr);
 	trie->data_size = attr->key_size -
 			  offsetof(struct bpf_lpm_trie_key, data);
 	trie->max_prefixlen = trie->data_size * 8;
@@ -596,9 +591,100 @@ unlock:
 	raw_spin_unlock(&trie->lock);
 }
 
-static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
-	return -ENOTSUPP;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+	struct lpm_trie_node *node, *next_node = NULL, *parent;
+	struct lpm_trie_node **node_stack = NULL;
+	struct lpm_trie_node __rcu **root;
+	int err = 0, stack_ptr = -1;
+	unsigned int next_bit;
+	size_t matchlen;
+
+	/* The get_next_key follows postorder. For the 4 node example in
+	 * the top of this file, the trie_get_next_key() returns the following
+	 * one after another:
+	 *   192.168.0.0/24
+	 *   192.168.1.0/24
+	 *   192.168.128.0/24
+	 *   192.168.0.0/16
+	 *
+	 * The idea is to return more specific keys before less specific ones.
+	 */
+
+	/* Empty trie */
+	if (!rcu_dereference(trie->root))
+		return -ENOENT;
+
+	/* For invalid key, find the leftmost node in the trie */
+	if (!key || key->prefixlen > trie->max_prefixlen) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
+			     GFP_USER | __GFP_NOWARN);
+	if (!node_stack)
+		return -ENOMEM;
+
+	/* Try to find the exact node for the given key */
+	for (node = rcu_dereference(trie->root); node;) {
+		node_stack[++stack_ptr] = node;
+		matchlen = longest_prefix_match(trie, node, key);
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	/* The node with the exactly-matching key has been found,
+	 * find the first node in postorder after the matched node.
+	 */
+	node = node_stack[stack_ptr];
+	while (stack_ptr > 0) {
+		parent = node_stack[stack_ptr - 1];
+		if (rcu_dereference(parent->child[0]) == node &&
+		    rcu_dereference(parent->child[1])) {
+			root = &parent->child[1];
+			goto find_leftmost;
+		}
+		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
+			next_node = parent;
+			goto do_copy;
+		}
+
+		node = parent;
+		stack_ptr--;
+	}
+
+	/* did not find anything */
+	err = -ENOENT;
+	goto free_stack;
+
+find_leftmost:
+	/* Find the leftmost non-intermediate node, all intermediate nodes
+	 * have exact two children, so this function will never return NULL.
+	 */
+	for (node = rcu_dereference(*root); node;) {
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			next_node = node;
+		node = rcu_dereference(node->child[0]);
+	}
+do_copy:
+	next_key->prefixlen = next_node->prefixlen;
+	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+	       next_node->data, trie->data_size);
+free_stack:
+	kfree(node_stack);
+	return err;
 }
 
 const struct bpf_map_ops trie_map_ops = {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 040d4e0edf3f..c9401075b58c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,15 +24,27 @@
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
  * RTNL lock cannot be taken when holding this lock.
  */
 static DECLARE_RWSEM(bpf_devs_lock);
 static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
+
+static int bpf_dev_offload_check(struct net_device *netdev)
+{
+	if (!netdev)
+		return -EINVAL;
+	if (!netdev->netdev_ops->ndo_bpf)
+		return -EOPNOTSUPP;
+	return 0;
+}
 
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 {
-	struct bpf_dev_offload *offload;
+	struct bpf_prog_offload *offload;
+	int err;
 
 	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
 	    attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -49,12 +61,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 
 	offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
 					   attr->prog_ifindex);
-	if (!offload->netdev)
-		goto err_free;
+	err = bpf_dev_offload_check(offload->netdev);
+	if (err)
+		goto err_maybe_put;
 
 	down_write(&bpf_devs_lock);
-	if (offload->netdev->reg_state != NETREG_REGISTERED)
+	if (offload->netdev->reg_state != NETREG_REGISTERED) {
+		err = -EINVAL;
 		goto err_unlock;
+	}
 	prog->aux->offload = offload;
 	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
 	dev_put(offload->netdev);
@@ -63,16 +78,17 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	return 0;
 err_unlock:
 	up_write(&bpf_devs_lock);
-	dev_put(offload->netdev);
-err_free:
+err_maybe_put:
+	if (offload->netdev)
+		dev_put(offload->netdev);
 	kfree(offload);
-	return -EINVAL;
+	return err;
 }
 
 static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
 			     struct netdev_bpf *data)
 {
-	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct bpf_prog_offload *offload = prog->aux->offload;
 	struct net_device *netdev;
 
 	ASSERT_RTNL();
@@ -80,8 +96,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
 	if (!offload)
 		return -ENODEV;
 	netdev = offload->netdev;
-	if (!netdev->netdev_ops->ndo_bpf)
-		return -EOPNOTSUPP;
 
 	data->command = cmd;
 
@@ -110,7 +124,7 @@ exit_unlock:
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 				 int insn_idx, int prev_insn_idx)
 {
-	struct bpf_dev_offload *offload;
+	struct bpf_prog_offload *offload;
 	int ret = -ENODEV;
 
 	down_read(&bpf_devs_lock);
@@ -124,7 +138,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
-	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct bpf_prog_offload *offload = prog->aux->offload;
 	struct netdev_bpf data = {};
 
 	data.offload.prog = prog;
@@ -216,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 		.prog	= prog,
 		.info	= info,
 	};
+	struct bpf_prog_aux *aux = prog->aux;
 	struct inode *ns_inode;
 	struct path ns_path;
+	char __user *uinsns;
 	void *res;
+	u32 ulen;
 
 	res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
 	if (IS_ERR(res)) {
@@ -227,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 		return PTR_ERR(res);
 	}
 
+	down_read(&bpf_devs_lock);
+
+	if (!aux->offload) {
+		up_read(&bpf_devs_lock);
+		return -ENODEV;
+	}
+
+	ulen = info->jited_prog_len;
+	info->jited_prog_len = aux->offload->jited_len;
+	if (info->jited_prog_len & ulen) {
+		uinsns = u64_to_user_ptr(info->jited_prog_insns);
+		ulen = min_t(u32, info->jited_prog_len, ulen);
+		if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
+			up_read(&bpf_devs_lock);
+			return -EFAULT;
+		}
+	}
+
+	up_read(&bpf_devs_lock);
+
 	ns_inode = ns_path.dentry->d_inode;
 	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
 	info->netns_ino = ns_inode->i_ino;
@@ -238,11 +275,240 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+			       enum bpf_netdev_command cmd)
+{
+	struct netdev_bpf data = {};
+	struct net_device *netdev;
+
+	ASSERT_RTNL();
+
+	data.command = cmd;
+	data.offmap = offmap;
+	/* Caller must make sure netdev is valid */
+	netdev = offmap->netdev;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_offloaded_map *offmap;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+	    attr->map_type != BPF_MAP_TYPE_HASH)
+		return ERR_PTR(-EINVAL);
+
+	offmap = kzalloc(sizeof(*offmap), GFP_USER);
+	if (!offmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&offmap->map, attr);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+	err = bpf_dev_offload_check(offmap->netdev);
+	if (err)
+		goto err_unlock;
+
+	err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return &offmap->map;
+
+err_unlock:
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+	kfree(offmap);
+	return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+	WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+	bpf_map_free_id(&offmap->map, true);
+	list_del_init(&offmap->offloads);
+	offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	if (offmap->netdev)
+		__bpf_map_offload_destroy(offmap);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+						       flags);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_delete_elem(offmap, key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+struct ns_get_path_bpf_map_args {
+	struct bpf_offloaded_map *offmap;
+	struct bpf_map_info *info;
+};
+
+static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_map_args *args = private_data;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	if (args->offmap->netdev) {
+		args->info->ifindex = args->offmap->netdev->ifindex;
+		net = dev_net(args->offmap->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+	struct ns_get_path_bpf_map_args args = {
+		.offmap	= map_to_offmap(map),
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap;
+	struct bpf_prog_offload *offload;
+	bool ret;
+
+	if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+		return false;
+
+	down_read(&bpf_devs_lock);
+	offload = prog->aux->offload;
+	offmap = map_to_offmap(map);
+
+	ret = offload && offload->netdev == offmap->netdev;
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+	struct bpf_prog_offload *offload, *tmp;
+
+	list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+		if (offload->netdev == netdev)
+			__bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+	struct bpf_offloaded_map *offmap, *tmp;
+
+	list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+		if (offmap->netdev == netdev)
+			__bpf_map_offload_destroy(offmap);
+}
+
 static int bpf_offload_notification(struct notifier_block *notifier,
 				    ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
-	struct bpf_dev_offload *offload, *tmp;
 
 	ASSERT_RTNL();
 
@@ -253,11 +519,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
 			break;
 
 		down_write(&bpf_devs_lock);
-		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
-					 offloads) {
-			if (offload->netdev == netdev)
-				__bpf_prog_offload_destroy(offload->prog);
-		}
+		bpf_offload_orphan_all_progs(netdev);
+		bpf_offload_orphan_all_maps(netdev);
 		up_write(&bpf_devs_lock);
 		break;
 	default:
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 3f662ee23a34..0314d1783d77 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -513,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	stab->map.map_type = attr->map_type;
-	stab->map.key_size = attr->key_size;
-	stab->map.value_size = attr->value_size;
-	stab->map.max_entries = attr->max_entries;
-	stab->map.map_flags = attr->map_flags;
-	stab->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&stab->map, attr);
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
@@ -583,8 +577,15 @@ static void sock_map_free(struct bpf_map *map)
 
 		write_lock_bh(&sock->sk_callback_lock);
 		psock = smap_psock_sk(sock);
-		smap_list_remove(psock, &stab->sock_map[i]);
-		smap_release_sock(psock, sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_release_sock(psock, sock);
+		}
 		write_unlock_bh(&sock->sk_callback_lock);
 	}
 	rcu_read_unlock();
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6c63c2222ea8..b0ecf43f5894 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_smap;
 
-	smap->map.map_type = attr->map_type;
-	smap->map.key_size = attr->key_size;
+	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
-	smap->map.max_entries = attr->max_entries;
-	smap->map.map_flags = attr->map_flags;
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	smap->map.numa_node = bpf_map_attr_numa_node(attr);
 
 	err = bpf_map_precharge_memlock(smap->map.pages);
 	if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2bac0dc8baba..5bdb0cc84ad2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr,
 	return 0;
 }
 
+const struct bpf_map_ops bpf_map_offload_ops = {
+	.map_alloc = bpf_map_offload_map_alloc,
+	.map_free = bpf_map_offload_map_free,
+};
+
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 {
+	const struct bpf_map_ops *ops;
 	struct bpf_map *map;
+	int err;
 
-	if (attr->map_type >= ARRAY_SIZE(bpf_map_types) ||
-	    !bpf_map_types[attr->map_type])
+	if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+		return ERR_PTR(-EINVAL);
+	ops = bpf_map_types[attr->map_type];
+	if (!ops)
 		return ERR_PTR(-EINVAL);
 
-	map = bpf_map_types[attr->map_type]->map_alloc(attr);
+	if (ops->map_alloc_check) {
+		err = ops->map_alloc_check(attr);
+		if (err)
+			return ERR_PTR(err);
+	}
+	if (attr->map_ifindex)
+		ops = &bpf_map_offload_ops;
+	map = ops->map_alloc(attr);
 	if (IS_ERR(map))
 		return map;
-	map->ops = bpf_map_types[attr->map_type];
+	map->ops = ops;
 	map->map_type = attr->map_type;
 	return map;
 }
@@ -134,6 +150,16 @@ void bpf_map_area_free(void *area)
 	kvfree(area);
 }
 
+void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
+{
+	map->map_type = attr->map_type;
+	map->key_size = attr->key_size;
+	map->value_size = attr->value_size;
+	map->max_entries = attr->max_entries;
+	map->map_flags = attr->map_flags;
+	map->numa_node = bpf_map_attr_numa_node(attr);
+}
+
 int bpf_map_precharge_memlock(u32 pages)
 {
 	struct user_struct *user = get_current_user();
@@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
 	unsigned long flags;
 
+	/* Offloaded maps are removed from the IDR store when their device
+	 * disappears - even if someone holds an fd to them they are unusable,
+	 * the memory is gone, all ops will fail; they are simply waiting for
+	 * refcnt to drop to be freed.
+	 */
+	if (!map->id)
+		return;
+
 	if (do_idr_lock)
 		spin_lock_irqsave(&map_idr_lock, flags);
 	else
 		__acquire(&map_idr_lock);
 
 	idr_remove(&map_idr, map->id);
+	map->id = 0;
 
 	if (do_idr_lock)
 		spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!value)
 		goto free_key;
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
@@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
 		goto free_value;
 
 	/* Need to create a kthread, thus must support schedule */
-	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+		goto out;
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
 	}
@@ -731,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_delete_elem(map, key);
+		goto out;
+	}
+
 	preempt_disable();
 	__this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
@@ -738,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
 	rcu_read_unlock();
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_delete_elem(map, ufd, key);
 	kfree(key);
@@ -788,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_get_next_key(map, key, next_key);
+		goto out;
+	}
+
 	rcu_read_lock();
 	err = map->ops->map_get_next_key(map, key, next_key);
 	rcu_read_unlock();
+out:
 	if (err)
 		goto free_next_key;
 
@@ -1453,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
 
@@ -1673,19 +1726,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		goto done;
 	}
 
-	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
-	if (info.jited_prog_len && ulen) {
-		if (bpf_dump_raw_ok()) {
-			uinsns = u64_to_user_ptr(info.jited_prog_insns);
-			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
-		} else {
-			info.jited_prog_insns = 0;
-		}
-	}
-
 	ulen = info.xlated_prog_len;
 	info.xlated_prog_len = bpf_prog_insn_size(prog);
 	if (info.xlated_prog_len && ulen) {
@@ -1711,6 +1751,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		err = bpf_prog_offload_info_fill(&info, prog);
 		if (err)
 			return err;
+		goto done;
+	}
+
+	/* NOTE: the following code is supposed to be skipped for offload.
+	 * bpf_prog_offload_info_fill() is the place to fill similar fields
+	 * for offload.
+	 */
+	ulen = info.jited_prog_len;
+	info.jited_prog_len = prog->jited_len;
+	if (info.jited_prog_len && ulen) {
+		if (bpf_dump_raw_ok()) {
+			uinsns = u64_to_user_ptr(info.jited_prog_insns);
+			ulen = min_t(u32, info.jited_prog_len, ulen);
+			if (copy_to_user(uinsns, prog->bpf_func, ulen))
+				return -EFAULT;
+		} else {
+			info.jited_prog_insns = 0;
+		}
 	}
 
 done:
@@ -1743,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_info_fill(&info, map);
+		if (err)
+			return err;
+	}
+
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
 		return -EFAULT;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2b211262c25..dfb138b46488 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -169,11 +169,11 @@ struct bpf_call_arg_meta {
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 /* log_level controls verbosity level of eBPF verifier.
- * verbose() is used to dump the verification trace to the log, so the user
- * can figure out what's wrong with the program
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
  */
-static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
-				   const char *fmt, ...)
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+					   const char *fmt, ...)
 {
 	struct bpf_verifer_log *log = &env->log;
 	unsigned int n;
@@ -197,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
 	else
 		log->ubuf = NULL;
 }
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+/* Historically bpf_verifier_log_write was called verbose, but the name was too
+ * generic for symbol export. The function was renamed, but not the calls in
+ * the verifier to avoid complicating backports. Hence the alias below.
+ */
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+				   const char *fmt, ...)
+	__attribute__((alias("bpf_verifier_log_write")));
 
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
 {
@@ -375,6 +383,8 @@ static int realloc_func_state(struct bpf_func_state *state, int size,
 
 static void free_func_state(struct bpf_func_state *state)
 {
+	if (!state)
+		return;
 	kfree(state->stack);
 	kfree(state);
 }
@@ -487,6 +497,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	}
 	return &elem->st;
 err:
+	free_verifier_state(env->cur_state, true);
+	env->cur_state = NULL;
 	/* pop all elements and return */
 	while (!pop_stack(env, NULL, NULL));
 	return NULL;
@@ -1337,6 +1349,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
 }
 
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+	const struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+	return reg->type == PTR_TO_CTX;
+}
+
 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
@@ -1511,6 +1530,7 @@ continue_func:
 	goto continue_func;
 }
 
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 static int get_callee_stack_depth(struct bpf_verifier_env *env,
 				  const struct bpf_insn *insn, int idx)
 {
@@ -1525,6 +1545,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 	subprog++;
 	return env->subprog_stack_depth[subprog];
 }
+#endif
 
 /* truncate register to smaller size (in bytes)
  * must be called with size < BPF_REG_SIZE
@@ -1714,6 +1735,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 		return -EACCES;
 	}
 
+	if (is_ctx_reg(env, insn->dst_reg)) {
+		verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
+			insn->dst_reg);
+		return -EACCES;
+	}
+
 	/* check whether atomic_add can read the memory */
 	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1823,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+{
+	return type == ARG_PTR_TO_MEM ||
+	       type == ARG_PTR_TO_MEM_OR_NULL ||
+	       type == ARG_PTR_TO_UNINIT_MEM;
+}
+
+static bool arg_type_is_mem_size(enum bpf_arg_type type)
+{
+	return type == ARG_CONST_SIZE ||
+	       type == ARG_CONST_SIZE_OR_ZERO;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
@@ -1872,9 +1912,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_MEM ||
-		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||
-		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
+	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a SCALAR_VALUE type. Final test
@@ -1935,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
 						   false, NULL);
-	} else if (arg_type == ARG_CONST_SIZE ||
-		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
+	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-		/* bpf_xxx(..., buf, len) call will access 'len' bytes
-		 * from stack pointer 'buf'. Check it
-		 * note: regno == len, regno - 1 == buf
-		 */
-		if (regno == 0) {
-			/* kernel subsystem misconfigured verifier */
-			verbose(env,
-				"ARG_CONST_SIZE cannot be first argument\n");
-			return -EACCES;
-		}
-
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
-
 		if (!tnum_is_const(reg->var_off))
 			/* For unprivileged variable accesses, disable raw
 			 * mode so that the program is required to
@@ -2097,7 +2122,7 @@ error:
 	return -EINVAL;
 }
 
-static int check_raw_mode(const struct bpf_func_proto *fn)
+static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
 {
 	int count = 0;
 
@@ -2112,7 +2137,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
 
-	return count > 1 ? -EINVAL : 0;
+	/* We only support one arg being in raw mode at the moment,
+	 * which is sufficient for the helper functions we have
+	 * right now.
+	 */
+	return count <= 1;
+}
+
+static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
+				    enum bpf_arg_type arg_next)
+{
+	return (arg_type_is_mem_ptr(arg_curr) &&
+	        !arg_type_is_mem_size(arg_next)) ||
+	       (!arg_type_is_mem_ptr(arg_curr) &&
+		arg_type_is_mem_size(arg_next));
+}
+
+static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
+{
+	/* bpf_xxx(..., buf, len) call will access 'len'
+	 * bytes from memory 'buf'. Both arg types need
+	 * to be paired, so make sure there's no buggy
+	 * helper function specification.
+	 */
+	if (arg_type_is_mem_size(fn->arg1_type) ||
+	    arg_type_is_mem_ptr(fn->arg5_type)  ||
+	    check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
+	    check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
+	    check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
+	    check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+		return false;
+
+	return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn)
+{
+	return check_raw_mode_ok(fn) &&
+	       check_arg_pair_ok(fn) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2268,7 +2330,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	if (env->ops->get_func_proto)
 		fn = env->ops->get_func_proto(func_id);
-
 	if (!fn) {
 		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
 			func_id);
@@ -2292,10 +2353,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
 
-	/* We only support one arg being in raw mode at the moment, which
-	 * is sufficient for the helper functions we have right now.
-	 */
-	err = check_raw_mode(fn);
+	err = check_func_proto(fn);
 	if (err) {
 		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
@@ -2309,6 +2367,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
 	if (err)
 		return err;
+	if (func_id == BPF_FUNC_tail_call) {
+		if (meta.map_ptr == NULL) {
+			verbose(env, "verifier bug\n");
+			return -EINVAL;
+		}
+		env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+	}
 	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
 	if (err)
 		return err;
@@ -2457,17 +2522,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	dst_reg = &regs[dst];
 
-	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(env, state);
-		verbose(env,
-			"verifier internal error: known but bad sbounds\n");
-		return -EINVAL;
-	}
-	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(env, state);
-		verbose(env,
-			"verifier internal error: known but bad ubounds\n");
-		return -EINVAL;
+	if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+	    smin_val > smax_val || umin_val > umax_val) {
+		/* Taint dst register if offset had invalid bounds derived from
+		 * e.g. dead branches.
+		 */
+		__mark_reg_unknown(dst_reg);
+		return 0;
 	}
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
@@ -2659,6 +2720,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 	src_known = tnum_is_const(src_reg.var_off);
 	dst_known = tnum_is_const(dst_reg->var_off);
 
+	if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
+	    smin_val > smax_val || umin_val > umax_val) {
+		/* Taint dst register if offset had invalid bounds derived from
+		 * e.g. dead branches.
+		 */
+		__mark_reg_unknown(dst_reg);
+		return 0;
+	}
+
 	if (!src_known &&
 	    opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
 		__mark_reg_unknown(dst_reg);
@@ -3070,6 +3140,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return -EINVAL;
 		}
 
+		if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+			verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
+			return -EINVAL;
+		}
+
 		if ((opcode == BPF_LSH || opcode == BPF_RSH ||
 		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
 			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -4635,6 +4710,12 @@ static int do_check(struct bpf_verifier_env *env)
 			if (err)
 				return err;
 
+			if (is_ctx_reg(env, insn->dst_reg)) {
+				verbose(env, "BPF_ST stores into R%d context is not allowed\n",
+					insn->dst_reg);
+				return -EACCES;
+			}
+
 			/* check that memory (dst_reg + off) is writeable */
 			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
@@ -4753,7 +4834,8 @@ process_bpf_exit:
 		insn_idx++;
 	}
 
-	verbose(env, "processed %d insns, stack depth ", insn_processed);
+	verbose(env, "processed %d insns (limit %d), stack depth ",
+		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
 	for (i = 0; i < env->subprog_cnt + 1; i++) {
 		u32 depth = env->subprog_stack_depth[i];
 
@@ -4795,6 +4877,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 	}
+
+	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+	    !bpf_offload_dev_match(prog, map)) {
+		verbose(env, "offload device mismatch between prog and map\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -5252,14 +5341,20 @@ out_free:
 
 static int fixup_call_args(struct bpf_verifier_env *env)
 {
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = prog->insnsi;
 	int i, depth;
+#endif
+	int err;
 
-	if (env->prog->jit_requested)
-		if (jit_subprogs(env) == 0)
+	err = 0;
+	if (env->prog->jit_requested) {
+		err = jit_subprogs(env);
+		if (err == 0)
 			return 0;
-
+	}
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	for (i = 0; i < prog->len; i++, insn++) {
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
@@ -5269,7 +5364,9 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 			return depth;
 		bpf_patch_call_args(insn, depth);
 	}
-	return 0;
+	err = 0;
+#endif
+	return err;
 }
 
 /* fixup insn->imm field of bpf_call instructions
@@ -5289,6 +5386,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	int i, cnt, delta = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+			/* due to JIT bugs clear upper 32-bits of src register
+			 * before div/mod operation
+			 */
+			insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+			insn_buf[1] = *insn;
+			cnt = 2;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -5316,6 +5431,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 */
 			insn->imm = 0;
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+			/* instead of changing every JIT dealing with tail_call
+			 * emit two extra insns:
+			 * if (index >= max_entries) goto out;
+			 * index &= array->index_mask;
+			 * to avoid out-of-bounds cpu speculation
+			 */
+			map_ptr = env->insn_aux_data[i + delta].map_ptr;
+			if (map_ptr == BPF_MAP_PTR_POISON) {
+				verbose(env, "tail_call abusing map_ptr\n");
+				return -EINVAL;
+			}
+			if (!map_ptr->unpriv_array)
+				continue;
+			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+						  map_ptr->max_entries, 2);
+			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+						    container_of(map_ptr,
+								 struct bpf_array,
+								 map)->index_mask);
+			insn_buf[2] = *insn;
+			cnt = 3;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
 			continue;
 		}
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2cf06c274e4c..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4447,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cgroup.threads",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.release = cgroup_procs_release,
 		.seq_start = cgroup_threads_start,
 		.seq_next = cgroup_procs_next,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL(contig_page_data);
 #endif
 #ifdef CONFIG_SPARSEMEM
-	VMCOREINFO_SYMBOL(mem_section);
+	VMCOREINFO_SYMBOL_ARRAY(mem_section);
 	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
 	VMCOREINFO_STRUCT_SIZE(mem_section);
 	VMCOREINFO_OFFSET(mem_section, section_mem_map);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumalator (@total) and @count
  */
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
 {
 	s64 ns = ktime_get_ns() - *start;
 	unsigned long flags;
 
 	if (ns > 0) {
-		spin_lock_irqsave(&current->delays->lock, flags);
+		spin_lock_irqsave(lock, flags);
 		*total += ns;
 		(*count)++;
-		spin_unlock_irqrestore(&current->delays->lock, flags);
+		spin_unlock_irqrestore(lock, flags);
 	}
 }
 
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
 	current->delays->blkio_start = ktime_get_ns();
 }
 
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
 {
-	if (current->delays->flags & DELAYACCT_PF_SWAPIN)
-		/* Swapin block I/O */
-		delayacct_end(&current->delays->blkio_start,
-			&current->delays->swapin_delay,
-			&current->delays->swapin_count);
-	else	/* Other block I/O */
-		delayacct_end(&current->delays->blkio_start,
-			&current->delays->blkio_delay,
-			&current->delays->blkio_count);
+	struct task_delay_info *delays = p->delays;
+	u64 *total;
+	u32 *count;
+
+	if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+		total = &delays->swapin_delay;
+		count = &delays->swapin_count;
+	} else {
+		total = &delays->blkio_delay;
+		count = &delays->blkio_count;
+	}
+
+	delayacct_end(&delays->lock, &delays->blkio_start, total, count);
 }
 
 int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
 
 void __delayacct_freepages_end(void)
 {
-	delayacct_end(&current->delays->freepages_start,
-			&current->delays->freepages_delay,
-			&current->delays->freepages_count);
+	delayacct_end(
+		&current->delays->lock,
+		&current->delays->freepages_start,
+		&current->delays->freepages_delay,
+		&current->delays->freepages_count);
 }
 
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..21b0122cb39c
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fail_function.c: Function-based error injection
+ */
+#include <linux/error-injection.h>
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+
+struct fei_attr {
+	struct list_head list;
+	struct kprobe kp;
+	unsigned long retval;
+};
+static DEFINE_MUTEX(fei_lock);
+static LIST_HEAD(fei_attr_list);
+static DECLARE_FAULT_ATTR(fei_fault_attr);
+static struct dentry *fei_debugfs_dir;
+
+static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
+{
+	switch (get_injectable_error_type(addr)) {
+	case EI_ETYPE_NULL:
+		if (retv != 0)
+			return 0;
+		break;
+	case EI_ETYPE_ERRNO:
+		if (retv < (unsigned long)-MAX_ERRNO)
+			return (unsigned long)-EINVAL;
+		break;
+	case EI_ETYPE_ERRNO_NULL:
+		if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
+			return (unsigned long)-EINVAL;
+		break;
+	}
+
+	return retv;
+}
+
+static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
+{
+	struct fei_attr *attr;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	if (attr) {
+		attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
+		if (!attr->kp.symbol_name) {
+			kfree(attr);
+			return NULL;
+		}
+		attr->kp.pre_handler = fei_kprobe_handler;
+		attr->retval = adjust_error_retval(addr, 0);
+		INIT_LIST_HEAD(&attr->list);
+	}
+	return attr;
+}
+
+static void fei_attr_free(struct fei_attr *attr)
+{
+	if (attr) {
+		kfree(attr->kp.symbol_name);
+		kfree(attr);
+	}
+}
+
+static struct fei_attr *fei_attr_lookup(const char *sym)
+{
+	struct fei_attr *attr;
+
+	list_for_each_entry(attr, &fei_attr_list, list) {
+		if (!strcmp(attr->kp.symbol_name, sym))
+			return attr;
+	}
+
+	return NULL;
+}
+
+static bool fei_attr_is_valid(struct fei_attr *_attr)
+{
+	struct fei_attr *attr;
+
+	list_for_each_entry(attr, &fei_attr_list, list) {
+		if (attr == _attr)
+			return true;
+	}
+
+	return false;
+}
+
+static int fei_retval_set(void *data, u64 val)
+{
+	struct fei_attr *attr = data;
+	unsigned long retv = (unsigned long)val;
+	int err = 0;
+
+	mutex_lock(&fei_lock);
+	/*
+	 * Since this operation can be done after retval file is removed,
+	 * It is safer to check the attr is still valid before accessing
+	 * its member.
+	 */
+	if (!fei_attr_is_valid(attr)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	if (attr->kp.addr) {
+		if (adjust_error_retval((unsigned long)attr->kp.addr,
+					val) != retv)
+			err = -EINVAL;
+	}
+	if (!err)
+		attr->retval = val;
+out:
+	mutex_unlock(&fei_lock);
+
+	return err;
+}
+
+static int fei_retval_get(void *data, u64 *val)
+{
+	struct fei_attr *attr = data;
+	int err = 0;
+
+	mutex_lock(&fei_lock);
+	/* Here we also validate @attr to ensure it still exists. */
+	if (!fei_attr_is_valid(attr))
+		err = -ENOENT;
+	else
+		*val = attr->retval;
+	mutex_unlock(&fei_lock);
+
+	return err;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
+			 "%llx\n");
+
+static int fei_debugfs_add_attr(struct fei_attr *attr)
+{
+	struct dentry *dir;
+
+	dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
+	if (!dir)
+		return -ENOMEM;
+
+	if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
+		debugfs_remove_recursive(dir);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void fei_debugfs_remove_attr(struct fei_attr *attr)
+{
+	struct dentry *dir;
+
+	dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
+	if (dir)
+		debugfs_remove_recursive(dir);
+}
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+	struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
+
+	if (should_fail(&fei_fault_attr, 1)) {
+		regs_set_return_value(regs, attr->retval);
+		override_function_with_return(regs);
+		/* Kprobe specific fixup */
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+		return 1;
+	}
+
+	return 0;
+}
+NOKPROBE_SYMBOL(fei_kprobe_handler)
+
+static void *fei_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&fei_lock);
+	return seq_list_start(&fei_attr_list, *pos);
+}
+
+static void fei_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&fei_lock);
+}
+
+static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &fei_attr_list, pos);
+}
+
+static int fei_seq_show(struct seq_file *m, void *v)
+{
+	struct fei_attr *attr = list_entry(v, struct fei_attr, list);
+
+	seq_printf(m, "%pf\n", attr->kp.addr);
+	return 0;
+}
+
+static const struct seq_operations fei_seq_ops = {
+	.start	= fei_seq_start,
+	.next	= fei_seq_next,
+	.stop	= fei_seq_stop,
+	.show	= fei_seq_show,
+};
+
+static int fei_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &fei_seq_ops);
+}
+
+static void fei_attr_remove(struct fei_attr *attr)
+{
+	fei_debugfs_remove_attr(attr);
+	unregister_kprobe(&attr->kp);
+	list_del(&attr->list);
+	fei_attr_free(attr);
+}
+
+static void fei_attr_remove_all(void)
+{
+	struct fei_attr *attr, *n;
+
+	list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
+		fei_attr_remove(attr);
+	}
+}
+
+static ssize_t fei_write(struct file *file, const char __user *buffer,
+			 size_t count, loff_t *ppos)
+{
+	struct fei_attr *attr;
+	unsigned long addr;
+	char *buf, *sym;
+	int ret;
+
+	/* cut off if it is too long */
+	if (count > KSYM_NAME_LEN)
+		count = KSYM_NAME_LEN;
+	buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, buffer, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	buf[count] = '\0';
+	sym = strstrip(buf);
+
+	mutex_lock(&fei_lock);
+
+	/* Writing just spaces will remove all injection points */
+	if (sym[0] == '\0') {
+		fei_attr_remove_all();
+		ret = count;
+		goto out;
+	}
+	/* Writing !function will remove one injection point */
+	if (sym[0] == '!') {
+		attr = fei_attr_lookup(sym + 1);
+		if (!attr) {
+			ret = -ENOENT;
+			goto out;
+		}
+		fei_attr_remove(attr);
+		ret = count;
+		goto out;
+	}
+
+	addr = kallsyms_lookup_name(sym);
+	if (!addr) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!within_error_injection_list(addr)) {
+		ret = -ERANGE;
+		goto out;
+	}
+	if (fei_attr_lookup(sym)) {
+		ret = -EBUSY;
+		goto out;
+	}
+	attr = fei_attr_new(sym, addr);
+	if (!attr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = register_kprobe(&attr->kp);
+	if (!ret)
+		ret = fei_debugfs_add_attr(attr);
+	if (ret < 0)
+		fei_attr_remove(attr);
+	else {
+		list_add_tail(&attr->list, &fei_attr_list);
+		ret = count;
+	}
+out:
+	kfree(buf);
+	mutex_unlock(&fei_lock);
+	return ret;
+}
+
+static const struct file_operations fei_ops = {
+	.open =		fei_open,
+	.read =		seq_read,
+	.write =	fei_write,
+	.llseek =	seq_lseek,
+	.release =	seq_release,
+};
+
+static int __init fei_debugfs_init(void)
+{
+	struct dentry *dir;
+
+	dir = fault_create_debugfs_attr("fail_function", NULL,
+					&fei_fault_attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+
+	/* injectable attribute is just a symlink of error_inject/list */
+	if (!debugfs_create_symlink("injectable", dir,
+				    "../error_injection/list"))
+		goto error;
+
+	if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
+		goto error;
+
+	fei_debugfs_dir = dir;
+
+	return 0;
+error:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
+}
+
+late_initcall(fei_debugfs_init);
diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16..8c5424dd5924 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_q *this, *next;
 	DEFINE_WAKE_Q(wake_q);
 
+	if (nr_wake < 0 || nr_requeue < 0)
+		return -EINVAL;
+
 	/*
 	 * When PI not supported: return -ENOSYS if requeue_pi is true,
 	 * consequently the compiler knows requeue_pi is always false past
@@ -2294,21 +2297,17 @@ static void unqueue_me_pi(struct futex_q *q)
 	spin_unlock(q->lock_ptr);
 }
 
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner)
+				struct task_struct *argowner)
 {
-	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
 	u32 uval, uninitialized_var(curval), newval;
-	struct task_struct *oldowner;
+	struct task_struct *oldowner, *newowner;
+	u32 newtid;
 	int ret;
 
+	lockdep_assert_held(q->lock_ptr);
+
 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
 	oldowner = pi_state->owner;
@@ -2317,11 +2316,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 		newtid |= FUTEX_OWNER_DIED;
 
 	/*
-	 * We are here either because we stole the rtmutex from the
-	 * previous highest priority waiter or we are the highest priority
-	 * waiter but have failed to get the rtmutex the first time.
+	 * We are here because either:
+	 *
+	 *  - we stole the lock and pi_state->owner needs updating to reflect
+	 *    that (@argowner == current),
+	 *
+	 * or:
+	 *
+	 *  - someone stole our lock and we need to fix things to point to the
+	 *    new owner (@argowner == NULL).
 	 *
-	 * We have to replace the newowner TID in the user space variable.
+	 * Either way, we have to replace the TID in the user space variable.
 	 * This must be atomic as we have to preserve the owner died bit here.
 	 *
 	 * Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2339,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	 * in the PID check in lookup_pi_state.
 	 */
 retry:
+	if (!argowner) {
+		if (oldowner != current) {
+			/*
+			 * We raced against a concurrent self; things are
+			 * already fixed up. Nothing to do.
+			 */
+			ret = 0;
+			goto out_unlock;
+		}
+
+		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+			/* We got the lock after all, nothing to fix. */
+			ret = 0;
+			goto out_unlock;
+		}
+
+		/*
+		 * Since we just failed the trylock; there must be an owner.
+		 */
+		newowner = rt_mutex_owner(&pi_state->pi_mutex);
+		BUG_ON(!newowner);
+	} else {
+		WARN_ON_ONCE(argowner != current);
+		if (oldowner == current) {
+			/*
+			 * We raced against a concurrent self; things are
+			 * already fixed up. Nothing to do.
+			 */
+			ret = 0;
+			goto out_unlock;
+		}
+		newowner = argowner;
+	}
+
+	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+
 	if (get_futex_value_locked(&uval, uaddr))
 		goto handle_fault;
 
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		 * Got the lock. We might not be the anticipated owner if we
 		 * did a lock-steal - fix up the PI-state in that case:
 		 *
-		 * We can safely read pi_state->owner without holding wait_lock
-		 * because we now own the rt_mutex, only the owner will attempt
-		 * to change it.
+		 * Speculative pi_state->owner read (we don't hold wait_lock);
+		 * since we own the lock pi_state->owner == current is the
+		 * stable state, anything else needs more attention.
 		 */
 		if (q->pi_state->owner != current)
 			ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 	}
 
 	/*
+	 * If we didn't get the lock; check if anybody stole it from us. In
+	 * that case, we need to fix up the uval to point to them instead of
+	 * us, otherwise bad things happen. [10]
+	 *
+	 * Another speculative read; pi_state->owner == current is unstable
+	 * but needs our attention.
+	 */
+	if (q->pi_state->owner == current) {
+		ret = fixup_pi_state_owner(uaddr, q, NULL);
+		goto out;
+	}
+
+	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
 	 * the owner of the rt_mutex.
 	 */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b4aab48ad258..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 	return &(kretprobe_table_locks[hash].lock);
 }
 
-/* List of symbols that can be overriden for error injection. */
-static LIST_HEAD(kprobe_error_injection_list);
-static DEFINE_MUTEX(kprobe_ei_mutex);
-struct kprobe_ei_entry {
-	struct list_head list;
-	unsigned long start_addr;
-	unsigned long end_addr;
-	void *priv;
-};
-
 /* Blacklist -- list of struct kprobe_blacklist_entry */
 static LIST_HEAD(kprobe_blacklist);
 
@@ -1404,17 +1394,6 @@ bool within_kprobe_blacklist(unsigned long addr)
 	return false;
 }
 
-bool within_kprobe_error_injection_list(unsigned long addr)
-{
-	struct kprobe_ei_entry *ent;
-
-	list_for_each_entry(ent, &kprobe_error_injection_list, list) {
-		if (addr >= ent->start_addr && addr < ent->end_addr)
-			return true;
-	}
-	return false;
-}
-
 /*
  * If we have a symbol_name argument, look it up and add the offset field
  * to it. This way, we can specify a relative address to a symbol.
@@ -2189,86 +2168,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
 	return 0;
 }
 
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
-/* Markers of the _kprobe_error_inject_list section */
-extern unsigned long __start_kprobe_error_inject_list[];
-extern unsigned long __stop_kprobe_error_inject_list[];
-
-/*
- * Lookup and populate the kprobe_error_injection_list.
- *
- * For safety reasons we only allow certain functions to be overriden with
- * bpf_error_injection, so we need to populate the list of the symbols that have
- * been marked as safe for overriding.
- */
-static void populate_kprobe_error_injection_list(unsigned long *start,
-						 unsigned long *end,
-						 void *priv)
-{
-	unsigned long *iter;
-	struct kprobe_ei_entry *ent;
-	unsigned long entry, offset = 0, size = 0;
-
-	mutex_lock(&kprobe_ei_mutex);
-	for (iter = start; iter < end; iter++) {
-		entry = arch_deref_entry_point((void *)*iter);
-
-		if (!kernel_text_address(entry) ||
-		    !kallsyms_lookup_size_offset(entry, &size, &offset)) {
-			pr_err("Failed to find error inject entry at %p\n",
-				(void *)entry);
-			continue;
-		}
-
-		ent = kmalloc(sizeof(*ent), GFP_KERNEL);
-		if (!ent)
-			break;
-		ent->start_addr = entry;
-		ent->end_addr = entry + size;
-		ent->priv = priv;
-		INIT_LIST_HEAD(&ent->list);
-		list_add_tail(&ent->list, &kprobe_error_injection_list);
-	}
-	mutex_unlock(&kprobe_ei_mutex);
-}
-
-static void __init populate_kernel_kprobe_ei_list(void)
-{
-	populate_kprobe_error_injection_list(__start_kprobe_error_inject_list,
-					     __stop_kprobe_error_inject_list,
-					     NULL);
-}
-
-static void module_load_kprobe_ei_list(struct module *mod)
-{
-	if (!mod->num_kprobe_ei_funcs)
-		return;
-	populate_kprobe_error_injection_list(mod->kprobe_ei_funcs,
-					     mod->kprobe_ei_funcs +
-					     mod->num_kprobe_ei_funcs, mod);
-}
-
-static void module_unload_kprobe_ei_list(struct module *mod)
-{
-	struct kprobe_ei_entry *ent, *n;
-	if (!mod->num_kprobe_ei_funcs)
-		return;
-
-	mutex_lock(&kprobe_ei_mutex);
-	list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) {
-		if (ent->priv == mod) {
-			list_del_init(&ent->list);
-			kfree(ent);
-		}
-	}
-	mutex_unlock(&kprobe_ei_mutex);
-}
-#else
-static inline void __init populate_kernel_kprobe_ei_list(void) {}
-static inline void module_load_kprobe_ei_list(struct module *m) {}
-static inline void module_unload_kprobe_ei_list(struct module *m) {}
-#endif
-
 /* Module notifier call back, checking kprobes on the module */
 static int kprobes_module_callback(struct notifier_block *nb,
 				   unsigned long val, void *data)
@@ -2279,11 +2178,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
 	unsigned int i;
 	int checkcore = (val == MODULE_STATE_GOING);
 
-	if (val == MODULE_STATE_COMING)
-		module_load_kprobe_ei_list(mod);
-	else if (val == MODULE_STATE_GOING)
-		module_unload_kprobe_ei_list(mod);
-
 	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
 		return NOTIFY_DONE;
 
@@ -2346,8 +2240,6 @@ static int __init init_kprobes(void)
 		pr_err("Please take care of using kprobes.\n");
 	}
 
-	populate_kernel_kprobe_ei_list();
-
 	if (kretprobe_blacklist_size) {
 		/* lookup the function address from its name */
 		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -2515,56 +2407,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
 	.release        = seq_release,
 };
 
-/*
- * kprobes/error_injection_list -- shows which functions can be overriden for
- * error injection.
- * */
-static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&kprobe_ei_mutex);
-	return seq_list_start(&kprobe_error_injection_list, *pos);
-}
-
-static void kprobe_ei_seq_stop(struct seq_file *m, void *v)
-{
-	mutex_unlock(&kprobe_ei_mutex);
-}
-
-static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	return seq_list_next(v, &kprobe_error_injection_list, pos);
-}
-
-static int kprobe_ei_seq_show(struct seq_file *m, void *v)
-{
-	char buffer[KSYM_SYMBOL_LEN];
-	struct kprobe_ei_entry *ent =
-		list_entry(v, struct kprobe_ei_entry, list);
-
-	sprint_symbol(buffer, ent->start_addr);
-	seq_printf(m, "%s\n", buffer);
-	return 0;
-}
-
-static const struct seq_operations kprobe_ei_seq_ops = {
-	.start = kprobe_ei_seq_start,
-	.next  = kprobe_ei_seq_next,
-	.stop  = kprobe_ei_seq_stop,
-	.show  = kprobe_ei_seq_show,
-};
-
-static int kprobe_ei_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &kprobe_ei_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobe_ei_ops = {
-	.open           = kprobe_ei_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = seq_release,
-};
-
 static void arm_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -2706,11 +2548,6 @@ static int __init debugfs_kprobe_init(void)
 	if (!file)
 		goto error;
 
-	file = debugfs_create_file("error_injection_list", 0444, dir, NULL,
-				  &debugfs_kprobe_ei_ops);
-	if (!file)
-		goto error;
-
 	return 0;
 
 error:
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	return ret;
 }
 
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+	int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+	/*
+	 * try_to_take_rt_mutex() sets the lock waiters bit
+	 * unconditionally. Clean this up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	return ret;
+}
+
 /*
  * Slow path try-lock function:
  */
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 	 */
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
-	ret = try_to_take_rt_mutex(lock, current, NULL);
-
-	/*
-	 * try_to_take_rt_mutex() sets the lock waiters bit
-	 * unconditionally. Clean this up.
-	 */
-	fixup_rt_mutex_waiters(lock);
+	ret = __rt_mutex_slowtrylock(lock);
 
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
 	return rt_mutex_slowtrylock(lock);
 }
 
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+	return __rt_mutex_slowtrylock(lock);
+}
+
 /**
  * rt_mutex_timed_lock - lock a rt_mutex interruptible
  *			the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
 				 struct rt_mutex_waiter *waiter);
 
 extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
 
 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/module.c b/kernel/module.c
index bd695bfdc5c4..601494d4b7ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3118,10 +3118,10 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					     sizeof(*mod->ftrace_callsites),
 					     &mod->num_ftrace_callsites);
 #endif
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
-	mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list",
-					    sizeof(*mod->kprobe_ei_funcs),
-					    &mod->num_kprobe_ei_funcs);
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+	mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+					    sizeof(*mod->ei_funcs),
+					    &mod->num_ei_funcs);
 #endif
 	mod->extable = section_objs(info, "__ex_table",
 				    sizeof(*mod->extable), &mod->num_exentries);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 
-	/*
-	 * Perform commit of crossrelease here.
-	 */
-	complete_release_commit(x);
-
 	if (x->done != UINT_MAX)
 		x->done++;
 	__wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e3d993..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	p->state = TASK_WAKING;
 
 	if (p->in_iowait) {
-		delayacct_blkio_end();
+		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
 	}
 
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 #else /* CONFIG_SMP */
 
 	if (p->in_iowait) {
-		delayacct_blkio_end();
+		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
 	}
 
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
 
 	if (!task_on_rq_queued(p)) {
 		if (p->in_iowait) {
-			delayacct_blkio_end();
+			delayacct_blkio_end(p);
 			atomic_dec(&rq->nr_iowait);
 		}
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
 		rcu_read_unlock();
 	}
 	if (!fallback) {
+		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		preempt_enable();
 		free_cpumask_var(tmpmask);
 	}
 	cpus_read_unlock();
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 89a9e1b4264a..0bcf00e3ce48 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1696,7 +1696,7 @@ void run_local_timers(void)
 	hrtimer_run_queues();
 	/* Raise the softirq only if required. */
 	if (time_before(jiffies, base->clk)) {
-		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
 			return;
 		/* CPU is awake, so check the deferrable base. */
 		base++;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ae3a2d519e50..0b249e2f0c3c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
 	  on if you need to profile the system's use of these macros.
 
 config PROFILE_ALL_BRANCHES
-	bool "Profile all if conditionals"
+	bool "Profile all if conditionals" if !FORTIFY_SOURCE
 	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all branch conditions. Every if ()
@@ -533,9 +533,7 @@ config FUNCTION_PROFILER
 config BPF_KPROBE_OVERRIDE
 	bool "Enable BPF programs to override a kprobed function"
 	depends on BPF_EVENTS
-	depends on KPROBES_ON_FTRACE
-	depends on HAVE_KPROBE_OVERRIDE
-	depends on DYNAMIC_FTRACE_WITH_REGS
+	depends on FUNCTION_ERROR_INJECTION
 	default n
 	help
 	 Allows BPF to override the execution of a probed function and
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6d2327ecb59..fc2838ac8b78 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,7 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
-#include <asm/kprobes.h>
+#include <linux/error-injection.h>
 
 #include "trace_probe.h"
 #include "trace.h"
@@ -83,9 +83,8 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
-	__this_cpu_write(bpf_kprobe_override, 1);
 	regs_set_return_value(regs, rc);
-	arch_ftrace_kprobe_override_function(regs);
+	override_function_with_return(regs);
 	return 0;
 }
 
@@ -246,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
  */
 #define __BPF_TP_EMIT()	__BPF_ARG3_TP()
 #define __BPF_TP(...)							\
-	__trace_printk(1 /* Fake ip will not be printed. */,		\
+	__trace_printk(0 /* Fake ip */,					\
 		       fmt, ##__VA_ARGS__)
 
 #define __BPF_ARG1_TP(...)						\
@@ -800,11 +799,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	int ret = -EEXIST;
 
 	/*
-	 * Kprobe override only works for ftrace based kprobes, and only if they
-	 * are on the opt-in list.
+	 * Kprobe override only works if they are on the function entry,
+	 * and only if they are on the opt-in list.
 	 */
 	if (prog->kprobe_override &&
-	    (!trace_kprobe_ftrace(event->tp_event) ||
+	    (!trace_kprobe_on_func_entry(event->tp_event) ||
 	     !trace_kprobe_error_injectable(event->tp_event)))
 		return -EINVAL;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ab18995ff1e..5af2842dea96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2534,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The lock and unlock are done within a preempt disable section.
  * The current_context per_cpu variable can only be modified
  * by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
  *
- *  Normal context.
- *  SoftIRQ context
- *  IRQ context
- *  NMI context
+ *  bit 0 =  NMI context
+ *  bit 1 =  IRQ context
+ *  bit 2 =  SoftIRQ context
+ *  bit 3 =  normal context.
  *
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ *  101 - 1 = 100
+ *  101 & 100 = 100 (clearing bit zero)
+ *
+ *  1010 - 1 = 1001
+ *  1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
  */
 
 static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	if (cpu_buffer->current_context >= 4)
+	unsigned int val = cpu_buffer->current_context;
+	unsigned long pc = preempt_count();
+	int bit;
+
+	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+		bit = RB_CTX_NORMAL;
+	else
+		bit = pc & NMI_MASK ? RB_CTX_NMI :
+			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+
+	if (unlikely(val & (1 << bit)))
 		return 1;
 
-	cpu_buffer->current_context++;
-	/* Interrupts must see this update */
-	barrier();
+	val |= (1 << bit);
+	cpu_buffer->current_context = val;
 
 	return 0;
 }
@@ -2564,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	/* Don't let the dec leak out */
-	barrier();
-	cpu_buffer->current_context--;
+	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
 }
 
 /**
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 {
 	struct trace_event_call *call, *p;
 	const char *last_system = NULL;
+	bool first = false;
 	int last_i;
 	int i;
 
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		/* events are usually grouped together with systems */
 		if (!last_system || call->class->system != last_system) {
+			first = true;
 			last_i = 0;
 			last_system = call->class->system;
 		}
 
+		/*
+		 * Since calls are grouped by systems, the likelyhood that the
+		 * next call in the iteration belongs to the same system as the
+		 * previous call is high. As an optimization, we skip seaching
+		 * for a map[] that matches the call's system if the last call
+		 * was from the same system. That's what last_i is for. If the
+		 * call has the same system as the previous call, then last_i
+		 * will be the index of the first map[] that has a matching
+		 * system.
+		 */
 		for (i = last_i; i < len; i++) {
 			if (call->class->system == map[i]->system) {
 				/* Save the first system if need be */
-				if (!last_i)
+				if (first) {
 					last_i = i;
+					first = false;
+				}
 				update_event_printk(call, map[i]);
 			}
 		}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 91f4b57dab82..1fad24acd444 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
 #include <linux/rculist.h>
+#include <linux/error-injection.h>
 
 #include "trace_probe.h"
 
@@ -42,8 +43,6 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
-DEFINE_PER_CPU(int, bpf_kprobe_override);
-
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
 	return tk->rp.handler != NULL;
@@ -88,13 +87,16 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
-int trace_kprobe_ftrace(struct trace_event_call *call)
+bool trace_kprobe_on_func_entry(struct trace_event_call *call)
 {
 	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
-	return kprobe_ftrace(&tk->rp.kp);
+
+	return kprobe_on_func_entry(tk->rp.kp.addr,
+			tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
+			tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
 }
 
-int trace_kprobe_error_injectable(struct trace_event_call *call)
+bool trace_kprobe_error_injectable(struct trace_event_call *call)
 {
 	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
 	unsigned long addr;
@@ -106,7 +108,7 @@ int trace_kprobe_error_injectable(struct trace_event_call *call)
 	} else {
 		addr = (unsigned long)tk->rp.kp.addr;
 	}
-	return within_kprobe_error_injection_list(addr);
+	return within_error_injection_list(addr);
 }
 
 static int register_kprobe_event(struct trace_kprobe *tk);
@@ -1202,6 +1204,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int rctx;
 
 	if (bpf_prog_array_valid(call)) {
+		unsigned long orig_ip = instruction_pointer(regs);
 		int ret;
 
 		ret = trace_call_bpf(call, regs);
@@ -1209,12 +1212,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 		/*
 		 * We need to check and see if we modified the pc of the
 		 * pt_regs, and if so clear the kprobe and return 1 so that we
-		 * don't do the instruction skipping.  Also reset our state so
-		 * we are clean the next pass through.
+		 * don't do the single stepping.
+		 * The ftrace kprobe handler leaves it up to us to re-enable
+		 * preemption here before returning if we've modified the ip.
 		 */
-		if (__this_cpu_read(bpf_kprobe_override)) {
-			__this_cpu_write(bpf_kprobe_override, 0);
+		if (orig_ip != instruction_pointer(regs)) {
 			reset_current_kprobe();
+			preempt_enable_no_resched();
 			return 1;
 		}
 		if (!ret)
@@ -1322,15 +1326,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 	if (tk->tp.flags & TP_FLAG_TRACE)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
-	if (tk->tp.flags & TP_FLAG_PROFILE) {
+	if (tk->tp.flags & TP_FLAG_PROFILE)
 		ret = kprobe_perf_func(tk, regs);
-		/*
-		 * The ftrace kprobe handler leaves it up to us to re-enable
-		 * preemption here before returning if we've modified the ip.
-		 */
-		if (ret)
-			preempt_enable_no_resched();
-	}
 #endif
 	return ret;
 }
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5e54d748c84c..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,8 +252,8 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
-int trace_kprobe_ftrace(struct trace_event_call *call);
-int trace_kprobe_error_injectable(struct trace_event_call *call);
+bool trace_kprobe_on_func_entry(struct trace_event_call *call);
+bool trace_kprobe_error_injectable(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset)
 	return NULL;
 }
 
-static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
 {
-	return 0;
+	return false;
 }
 
-static inline int trace_kprobe_error_injectable(struct trace_event_call *call)
+static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
 {
-	return 0;
+	return false;
 }
 #endif /* CONFIG_KPROBE_EVENTS */
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 43d18cb46308..f699122dab32 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
 #include <linux/moduleparam.h>
 #include <linux/uaccess.h>
 #include <linux/sched/isolation.h>
+#include <linux/nmi.h>
 
 #include "workqueue_internal.h"
 
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void)
 			if (pwq->nr_active || !list_empty(&pwq->delayed_works))
 				show_pwq(pwq);
 			spin_unlock_irqrestore(&pwq->pool->lock, flags);
+			/*
+			 * We could be printing a lot from atomic context, e.g.
+			 * sysrq-t -> show_workqueue_state(). Avoid triggering
+			 * hard lockup.
+			 */
+			touch_nmi_watchdog();
 		}
 	}
 
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void)
 		pr_cont("\n");
 	next_pool:
 		spin_unlock_irqrestore(&pool->lock, flags);
+		/*
+		 * We could be printing a lot from atomic context, e.g.
+		 * sysrq-t -> show_workqueue_state(). Avoid triggering
+		 * hard lockup.
+		 */
+		touch_nmi_watchdog();
 	}
 
 	rcu_read_unlock_sched();