Diffstat (limited to 'kernel/bpf/memalloc.c')
-rw-r--r--   kernel/bpf/memalloc.c   64
1 file changed, 53 insertions, 11 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5f83be1d2018..ebcc3dd0fa19 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 	memcg = get_memcg(c);
 	old_memcg = set_active_memcg(memcg);
 	for (i = 0; i < cnt; i++) {
-		obj = __alloc(c, node);
-		if (!obj)
-			break;
+		/*
+		 * free_by_rcu is only manipulated by irq work refill_work().
+		 * IRQ works on the same CPU are called sequentially, so it is
+		 * safe to use __llist_del_first() here. If alloc_bulk() is
+		 * invoked by the initial prefill, there will be no running
+		 * refill_work(), so __llist_del_first() is fine as well.
+		 *
+		 * In most cases, objects on free_by_rcu are from the same CPU.
+		 * If some objects come from other CPUs, it doesn't incur any
+		 * harm because NUMA_NO_NODE means the preference for current
+		 * numa node and it is not a guarantee.
+		 */
+		obj = __llist_del_first(&c->free_by_rcu);
+		if (!obj) {
+			obj = __alloc(c, node);
+			if (!obj)
+				break;
+		}
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			/* In RT irq_work runs in per-cpu kthread, so disable
 			 * interrupts to avoid preemption and interrupts and
@@ -222,9 +237,13 @@ static void __free_rcu(struct rcu_head *head)
 
 static void __free_rcu_tasks_trace(struct rcu_head *head)
 {
-	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
-
-	call_rcu(&c->rcu, __free_rcu);
+	/* If RCU Tasks Trace grace period implies RCU grace period,
+	 * there is no need to invoke call_rcu().
+	 */
+	if (rcu_trace_implies_rcu_gp())
+		__free_rcu(head);
+	else
+		call_rcu(head, __free_rcu);
 }
 
 static void enque_to_free(struct bpf_mem_cache *c, void *obj)
@@ -253,8 +272,9 @@ static void do_call_rcu(struct bpf_mem_cache *c)
 		 */
 		__llist_add(llnode, &c->waiting_for_gp);
 	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
-	 * Then use call_rcu() to wait for normal progs to finish
-	 * and finally do free_one() on each element.
+	 * If RCU Tasks Trace grace period implies RCU grace period, free
+	 * these elements directly, else use call_rcu() to wait for normal
+	 * progs to finish and finally do free_one() on each element.
 	 */
 	call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
 }
@@ -418,14 +438,17 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 	/* No progs are using this bpf_mem_cache, but htab_map_free() called
 	 * bpf_mem_cache_free() for all remaining elements and they can be in
 	 * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+	 *
+	 * Except for waiting_for_gp list, there are no concurrent operations
+	 * on these lists, so it is safe to use __llist_del_all().
 	 */
 	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
 		free_one(c, llnode);
 	llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
 		free_one(c, llnode);
-	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
+	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
 		free_one(c, llnode);
-	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
 		free_one(c, llnode);
 }
 
@@ -441,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
 	/* waiting_for_gp lists was drained, but __free_rcu might
 	 * still execute. Wait for it now before we freeing percpu caches.
+	 *
+	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+	 * so if call_rcu(head, __free_rcu) is skipped due to
+	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+	 * using rcu_trace_implies_rcu_gp() as well.
 	 */
 	rcu_barrier_tasks_trace();
-	rcu_barrier();
+	if (!rcu_trace_implies_rcu_gp())
+		rcu_barrier();
 	free_mem_alloc_no_barrier(ma);
 }
 
@@ -493,6 +524,16 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 		rcu_in_progress = 0;
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(ma->cache, cpu);
+			/*
+			 * refill_work may be unfinished for PREEMPT_RT kernel
+			 * in which irq work is invoked in a per-CPU RT thread.
+			 * It is also possible for kernel with
+			 * arch_irq_work_has_interrupt() being false and irq
+			 * work is invoked in timer interrupt. So waiting for
+			 * the completion of irq work to ease the handling of
+			 * concurrency.
+			 */
+			irq_work_sync(&c->refill_work);
 			drain_mem_cache(c);
 			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
@@ -507,6 +548,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			cc = per_cpu_ptr(ma->caches, cpu);
 			for (i = 0; i < NUM_CACHES; i++) {
 				c = &cc->cache[i];
+				irq_work_sync(&c->refill_work);
 				drain_mem_cache(c);
 				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
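Note: the __free_rcu_tasks_trace() change above relies on a chained-callback pattern that may be easier to read outside the diff. The following is a minimal sketch of the same idea, not part of this patch; struct my_obj and the my_obj_* helpers are invented for illustration, while call_rcu_tasks_trace(), call_rcu() and rcu_trace_implies_rcu_gp() are the real kernel APIs used here.

/* Sketch: free an object only after both a RCU Tasks Trace grace period
 * and a normal RCU grace period, skipping the second callback when the
 * Tasks Trace grace period already implies a regular RCU grace period.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {				/* hypothetical example type */
	struct rcu_head rcu;
	char payload[64];
};

static void my_obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct my_obj, rcu));
}

static void my_obj_free_rcu_tasks_trace(struct rcu_head *head)
{
	/* Same decision as __free_rcu_tasks_trace() in the patch: free
	 * directly when chaining another call_rcu() would be redundant.
	 */
	if (rcu_trace_implies_rcu_gp())
		my_obj_free_rcu(head);
	else
		call_rcu(head, my_obj_free_rcu);
}

static void my_obj_defer_free(struct my_obj *obj)
{
	/* First wait for sleepable (rcu_read_lock_trace) users to finish. */
	call_rcu_tasks_trace(&obj->rcu, my_obj_free_rcu_tasks_trace);
}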
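Likewise, the irq_work_sync() calls added to bpf_mem_alloc_destroy() follow a general teardown order: wait for any in-flight irq work first, then drain the per-CPU lists with the lock-less __llist_del_all(). A self-contained sketch under assumed names (struct my_cache and my_cache_destroy() are hypothetical, not taken from memalloc.c):

#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/slab.h>

/* Hypothetical cache: each cached object starts with its llist_node, as in
 * bpf_mem_cache, so the node pointer is also the allocation to free.
 */
struct my_cache {
	struct irq_work refill_work;
	struct llist_head free_llist;
};

static void my_cache_destroy(struct my_cache *c)
{
	struct llist_node *llnode, *t;

	/* The irq work may still be running on PREEMPT_RT (per-CPU kthread)
	 * or when arch_irq_work_has_interrupt() is false (timer-interrupt
	 * fallback), so wait for its completion before touching free_llist.
	 */
	irq_work_sync(&c->refill_work);

	/* After irq_work_sync() no concurrent users remain, so the
	 * lock-less __llist_del_all() is enough, as in drain_mem_cache().
	 */
	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
		kfree(llnode);
}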