author     Linus Torvalds <torvalds@linux-foundation.org>   2024-07-11 09:29:49 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-07-11 09:29:49 -0700
commit     51df8e0cbaefd432f7029dde94e6c7e4e5b19465 (patch)
tree       0bf175e0de508a8b493be2506dc6ccec1f188322 /kernel/bpf/helpers.c
parent     83ab4b461eb7bdf90984eb56d4954dbe11e926d4 (diff)
parent     d7c199e77ef2fe259ad5b1beca5ddd6c951fcba2 (diff)
Merge tag 'net-6.10-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Paolo Abeni:
"Including fixes from bpf and netfilter.

  Current release - regressions:
   - core: fix rc7's __skb_datagram_iter() regression

  Current release - new code bugs:
   - eth: bnxt: fix crashes when reducing ring count with active RSS
     contexts

  Previous releases - regressions:
   - sched: fix UAF when resolving a clash
   - skmsg: skip zero length skb in sk_msg_recvmsg2
   - sunrpc: fix kernel free on connection failure in
     xs_tcp_setup_socket
   - tcp: avoid too many retransmit packets
   - tcp: fix incorrect undo caused by DSACK of TLP retransmit
   - udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().
   - eth: ks8851: fix deadlock with the SPI chip variant
   - eth: i40e: fix XDP program unloading while removing the driver

  Previous releases - always broken:
   - bpf:
      - fix too early release of tcx_entry
      - fail bpf_timer_cancel when callback is being cancelled
      - fix order of args in call to bpf_map_kvcalloc
   - netfilter: nf_tables: prefer nft_chain_validate
   - ppp: reject claimed-as-LCP but actually malformed packets
   - wireguard: avoid unaligned 64-bit memory accesses"
* tag 'net-6.10-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (33 commits)
net, sunrpc: Remap EPERM in case of connection failure in xs_tcp_setup_socket
net/sched: Fix UAF when resolving a clash
net: ks8851: Fix potential TX stall after interface reopen
udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().
netfilter: nf_tables: prefer nft_chain_validate
netfilter: nfnetlink_queue: drop bogus WARN_ON
ethtool: netlink: do not return SQI value if link is down
ppp: reject claimed-as-LCP but actually malformed packets
selftests/bpf: Add timer lockup selftest
net: ethernet: mtk-star-emac: set mac_managed_pm when probing
e1000e: fix force smbus during suspend flow
tcp: avoid too many retransmit packets
bpf: Defer work in bpf_timer_cancel_and_free
bpf: Fail bpf_timer_cancel when callback is being cancelled
bpf: fix order of args in call to bpf_map_kvcalloc
net: ethernet: lantiq_etop: fix double free in detach
i40e: Fix XDP program unloading while removing the driver
net: fix rc7's __skb_datagram_iter()
net: ks8851: Fix deadlock with the SPI chip variant
octeontx2-af: Fix incorrect value output on error path in rvu_check_rsrc_availability()
...
Diffstat (limited to 'kernel/bpf/helpers.c')
-rw-r--r--	kernel/bpf/helpers.c	99
1 file changed, 82 insertions(+), 17 deletions(-)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2a69a9a36c0f..3243c83ef3e3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
 	struct bpf_prog *prog;
 	void __rcu *callback_fn;
 	void *value;
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct work_struct delete_work;
+	};
 	u64 flags;
 };
 
@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
 struct bpf_hrtimer {
 	struct bpf_async_cb cb;
 	struct hrtimer timer;
+	atomic_t cancelling;
 };
 
 struct bpf_work {
@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
 	kfree_rcu(w, cb.rcu);
 }
 
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+	struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+	/* Cancel the timer and wait for callback to complete if it was running.
+	 * If hrtimer_cancel() can be safely called it's safe to call
+	 * kfree_rcu(t) right after for both preallocated and non-preallocated
+	 * maps. The async->cb = NULL was already done and no code path can see
+	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+	 * bpf_timer_cancel_and_free will have been cancelled.
+	 */
+	hrtimer_cancel(&t->timer);
+	kfree_rcu(t, cb.rcu);
+}
+
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 			    enum bpf_async_type type)
 {
@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		clockid = flags & (MAX_CLOCKS - 1);
 		t = (struct bpf_hrtimer *)cb;
 
+		atomic_set(&t->cancelling, 0);
+		INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
 		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
 		t->timer.function = bpf_timer_cb;
 		cb->value = (void *)async - map->record->timer_off;
@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
 
 BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 {
-	struct bpf_hrtimer *t;
+	struct bpf_hrtimer *t, *cur_t;
+	bool inc = false;
 	int ret = 0;
 
 	if (in_nmi())
@@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 		ret = -EINVAL;
 		goto out;
 	}
-	if (this_cpu_read(hrtimer_running) == t) {
+
+	cur_t = this_cpu_read(hrtimer_running);
+	if (cur_t == t) {
 		/* If bpf callback_fn is trying to bpf_timer_cancel()
 		 * its own timer the hrtimer_cancel() will deadlock
-		 * since it waits for callback_fn to finish
+		 * since it waits for callback_fn to finish.
 		 */
 		ret = -EDEADLK;
 		goto out;
 	}
+
+	/* Only account in-flight cancellations when invoked from a timer
+	 * callback, since we want to avoid waiting only if other _callbacks_
+	 * are waiting on us, to avoid introducing lockups. Non-callback paths
+	 * are ok, since nobody would synchronously wait for their completion.
+	 */
+	if (!cur_t)
+		goto drop;
+	atomic_inc(&t->cancelling);
+	/* Need full barrier after relaxed atomic_inc */
+	smp_mb__after_atomic();
+	inc = true;
+	if (atomic_read(&cur_t->cancelling)) {
+		/* We're cancelling timer t, while some other timer callback is
+		 * attempting to cancel us. In such a case, it might be possible
+		 * that timer t belongs to the other callback, or some other
+		 * callback waiting upon it (creating transitive dependencies
+		 * upon us), and we will enter a deadlock if we continue
+		 * cancelling and waiting for it synchronously, since it might
+		 * do the same. Bail!
+		 */
+		ret = -EDEADLK;
+		goto out;
+	}
+drop:
 	drop_prog_refcnt(&t->cb);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
@@ -1467,6 +1516,8 @@ out:
 	 * if it was running.
 	 */
 	ret = ret ?: hrtimer_cancel(&t->timer);
+	if (inc)
+		atomic_dec(&t->cancelling);
 	rcu_read_unlock();
 	return ret;
 }
@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
 	if (!t)
 		return;
-	/* Cancel the timer and wait for callback to complete if it was running.
-	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
-	 * right after for both preallocated and non-preallocated maps.
-	 * The async->cb = NULL was already done and no code path can
-	 * see address 't' anymore.
-	 *
-	 * Check that bpf_map_delete/update_elem() wasn't called from timer
-	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
-	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
-	 * return -1). Though callback_fn is still running on this cpu it's
+	/* We check that bpf_map_delete/update_elem() was called from timer
+	 * callback_fn. In such case we don't call hrtimer_cancel() (since it
+	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+	 * just return -1). Though callback_fn is still running on this cpu it's
 	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
 	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
 	 * since async->cb = NULL was already done. The timer will be
 	 * effectively cancelled because bpf_timer_cb() will return
 	 * HRTIMER_NORESTART.
+	 *
+	 * However, it is possible the timer callback_fn calling us armed the
+	 * timer _before_ calling us, such that failing to cancel it here will
+	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+	 * Therefore, we _need_ to cancel any outstanding timers before we do
+	 * kfree_rcu, even though no more timers can be armed.
+	 *
+	 * Moreover, we need to schedule work even if timer does not belong to
+	 * the calling callback_fn, as on two different CPUs, we can end up in a
+	 * situation where both sides run in parallel, try to cancel one
+	 * another, and we end up waiting on both sides in hrtimer_cancel
+	 * without making forward progress, since timer1 depends on time2
+	 * callback to finish, and vice versa.
+	 *
+	 * CPU 1 (timer1_cb)			CPU 2 (timer2_cb)
+	 * bpf_timer_cancel_and_free(timer2)	bpf_timer_cancel_and_free(timer1)
+	 *
+	 * To avoid these issues, punt to workqueue context when we are in a
+	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running) != t)
-		hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	if (this_cpu_read(hrtimer_running))
+		queue_work(system_unbound_wq, &t->cb.delete_work);
+	else
+		bpf_timer_delete_work(&t->cb.delete_work);
 }
 
 /* This function is called by map_delete/update_elem for individual element and
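
For illustration of the lockup the cancelling counter guards against, below is a minimal, hypothetical BPF program sketch. It is not part of this patch or the selftest added by this series; the map layout, section name, and identifiers (timer_map, timer_cb, arm_timers) are assumptions made only for this example. Two map slots each hold a bpf_timer, and each callback tries to cancel the timer in the other slot. If both callbacks run concurrently on two CPUs, each hrtimer_cancel() would wait for the other callback to finish; with this fix, bpf_timer_cancel() detects the in-flight cancellation and returns -EDEADLK instead of locking up.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical sketch, not taken from the patch or kernel selftests. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

struct elem {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 2);
	__type(key, int);
	__type(value, struct elem);
} timer_map SEC(".maps");

/* Each callback tries to cancel the timer stored in the *other* map slot.
 * Run on two CPUs at once, both hrtimer_cancel() calls would previously wait
 * on each other forever; now one side bails with -EDEADLK.
 */
static int timer_cb(void *map, int *key, struct elem *val)
{
	int other = 1 - *key;
	struct elem *e = bpf_map_lookup_elem(&timer_map, &other);

	if (e)
		bpf_timer_cancel(&e->timer);
	return 0;
}

SEC("tc")
int arm_timers(struct __sk_buff *skb)
{
	int i;

	for (i = 0; i < 2; i++) {
		struct elem *e = bpf_map_lookup_elem(&timer_map, &i);

		if (!e)
			continue;
		bpf_timer_init(&e->timer, &timer_map, 1 /* CLOCK_MONOTONIC */);
		bpf_timer_set_callback(&e->timer, timer_cb);
		bpf_timer_start(&e->timer, 0, 0);
	}
	return 0;
}

The timer lockup selftest added by this series (see "selftests/bpf: Add timer lockup selftest" above) exercises the real scenario in-tree; the sketch here only shows the shape of program that can trigger it.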