diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 19 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 1 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 20 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 3 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 1 | ||||
-rw-r--r-- | net/ipv4/nexthop.c | 17 | ||||
-rw-r--r-- | net/ipv4/route.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 29 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 27 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 31 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 1 |
11 files changed, 75 insertions, 88 deletions
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index e3939f76b024..74a2ef598c31 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,23 +28,18 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static int btf_sk_storage_get_ids[5]; static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; - -static int btf_sk_storage_delete_ids[5]; static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; -static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids, - const struct bpf_func_proto *from) +static void convert_sk_func_proto(struct bpf_func_proto *to, const struct bpf_func_proto *from) { int i; *to = *from; - to->btf_id = to_btf_ids; for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->btf_id[i] = tcp_sock_id; + to->arg_btf_id[i] = &tcp_sock_id; } } } @@ -64,12 +59,8 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - convert_sk_func_proto(&btf_sk_storage_get_proto, - btf_sk_storage_get_ids, - &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, - btf_sk_storage_delete_ids, - &bpf_sk_storage_delete_proto); + convert_sk_func_proto(&btf_sk_storage_get_proto, &bpf_sk_storage_get_proto); + convert_sk_func_proto(&btf_sk_storage_delete_proto, &bpf_sk_storage_delete_proto); return 0; } @@ -185,8 +176,8 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { /* In case we want to report error later */ .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &tcp_sock_id, .arg2_type = ARG_ANYTHING, - .btf_id = &tcp_sock_id, }; static const struct bpf_func_proto * diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 41079490a118..86a23e4a6a50 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -362,6 +362,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 93816d47e55a..366a4507b5a3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -203,8 +203,8 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); -static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, - struct nlattr **req_nlas) +static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) { struct nlattr *nla; int remaining; @@ -212,9 +212,13 @@ static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { int type = nla_type(nla); + if (type == INET_DIAG_REQ_PROTOCOL && nla_len(nla) != sizeof(u32)) + return -EINVAL; + if (type < __INET_DIAG_REQ_MAX) req_nlas[type] = nla; } + return 0; } static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, @@ -591,7 +595,10 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, int err, protocol; memset(&dump_data, 0, sizeof(dump_data)); - inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + if (err) + return err; + protocol = inet_diag_get_protocol(req, &dump_data); handler = inet_diag_lock_handler(protocol); @@ -1197,8 +1204,11 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) if (!cb_data) return -ENOMEM; - inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); - + err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); + if (err) { + kfree(cb_data); + return err; + } nla = cb_data->inet_diag_nla_bc; if (nla) { err = inet_diag_bc_audit(nla, skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5fb536ff51f0..8b200252c655 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -74,6 +74,7 @@ #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> +#include <net/inet_ecn.h> #include <net/lwtunnel.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> @@ -1704,7 +1705,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos; + inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 75c6013ff9a4..b2ea1a8c5fd6 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -554,6 +554,7 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr, attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); + md->gbp &= VXLAN_GBP_MASK; info->key.tun_flags |= TUNNEL_VXLAN_OPT; } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index bf9d4cd2d6e5..8c0f17c6863c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -42,8 +42,8 @@ static int call_nexthop_notifiers(struct net *net, { int err; - err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, - event_type, nh); + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); return notifier_to_errno(err); } @@ -870,8 +870,6 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; - call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); - list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -909,6 +907,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh, static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -1959,14 +1959,15 @@ static struct notifier_block nh_netdev_notifier = { int register_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); + return blocking_notifier_chain_register(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, - nb); + return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(unregister_nexthop_notifier); @@ -1986,7 +1987,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; - ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); + BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2c05b863ae43..d15a78b26dfa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -786,8 +786,10 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { - struct fib_nh_common *nhc = FIB_RES_NHC(res); + struct fib_nh_common *nhc; + fib_select_path(net, &res, fl4, skb); + nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); @@ -1013,6 +1015,7 @@ out: kfree_skb(skb); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; + struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; u32 old_mtu; @@ -1034,9 +1037,11 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { - struct fib_nh_common *nhc = FIB_RES_NHC(res); + if (fib_lookup(net, fl4, &res, 0) == 0) { + struct fib_nh_common *nhc; + fib_select_path(net, &res, fl4, NULL); + nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } @@ -2148,6 +2153,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; @@ -2668,8 +2674,6 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); - fl4->flowi4_oif = dev_out->ifindex; - make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57a568875539..2a8bfa89a515 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -564,7 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { + if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -576,7 +576,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * pairs with the input side. */ smp_mb__after_atomic(); - if (sk_stream_is_writeable(sk)) + if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else @@ -1004,12 +1004,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, tcp_rtx_and_write_queues_empty(sk)); if (!skb) - goto wait_for_memory; + goto wait_for_space; #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); @@ -1028,7 +1028,7 @@ new_segment: goto new_segment; } if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); @@ -1069,9 +1069,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1282,7 +1281,7 @@ restart: new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; @@ -1293,7 +1292,7 @@ new_segment: skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) - goto wait_for_memory; + goto wait_for_space; process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; @@ -1326,7 +1325,7 @@ new_segment: struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) - goto wait_for_memory; + goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { @@ -1340,7 +1339,7 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, @@ -1393,9 +1392,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1527,7 +1525,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -static void tcp_cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -2698,6 +2696,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -3049,7 +3048,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9c..db47ac24d057 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, @@ -340,7 +341,7 @@ out: * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -361,28 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3658ad84f0c6..748980862a09 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4908,7 +4908,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) int eaten; if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb, &tp->rx_opt); + mptcp_incoming_options(sk, skb); if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@ -5332,12 +5332,6 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return true; } -/* When incoming ACK allowed to free some skb from write_queue, - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket - * on the exit from tcp input handler. - * - * PROBLEM: sndbuf expansion does not work well with largesend. - */ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5352,16 +5346,13 @@ static void tcp_new_space(struct sock *sk) static void tcp_check_space(struct sock *sk) { - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + /* pairs with tcp_poll() */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -5894,8 +5885,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; + icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); - tcp_init_congestion_control(sk); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -6496,7 +6489,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb, &tp->rx_opt); + mptcp_incoming_options(sk, skb); break; } fallthrough; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ab79d36ed07f..386978dcd318 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1682,7 +1682,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } /* Any change of skb->len requires recalculation of tso factor. */ |