 drivers/firewire/net.c                          |  14
 drivers/media/dvb-core/dvb_net.c                |   8
 drivers/message/fusion/mptlan.c                 |   2
 drivers/net/ethernet/cadence/macb_main.c        |   2
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c |  97
 drivers/net/phy/at803x.c                        |   2
 drivers/net/thunderbolt.c                       |   8
 drivers/staging/octeon/ethernet.c               |   2
 include/linux/bpf-cgroup.h                      |   1
 include/net/sch_generic.h                       |   2
 include/net/sock.h                              |  20
 include/net/tcp.h                               |   2
 net/core/xdp.c                                  |   2
 net/ipv4/af_inet.c                              |  18
 net/ipv4/inet_diag.c                            |   2
 net/ipv4/tcp.c                                  |  29
 net/ipv4/tcp_output.c                           |  22
 net/ipv6/af_inet6.c                             |  21
 net/ipv6/route.c                                |  24
 net/ipv6/tcp_ipv6.c                             |   1
 net/mptcp/protocol.c                            | 241
 net/mptcp/protocol.h                            |  15
 net/sched/em_meta.c                             |   2
 net/sched/sch_generic.c                         |  38
 24 files changed, 259 insertions(+), 316 deletions(-)
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 4c3fd2eed1da..dcc141068128 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1443,8 +1443,8 @@ static int fwnet_probe(struct fw_unit *unit,
     struct net_device *net;
     bool allocated_netdev = false;
     struct fwnet_device *dev;
+    union fwnet_hwaddr ha;
     int ret;
-    union fwnet_hwaddr *ha;
 
     mutex_lock(&fwnet_device_mutex);
@@ -1491,12 +1491,12 @@ static int fwnet_probe(struct fw_unit *unit,
     net->max_mtu = 4096U;
 
     /* Set our hardware address while we're at it */
-    ha = (union fwnet_hwaddr *)net->dev_addr;
-    put_unaligned_be64(card->guid, &ha->uc.uniq_id);
-    ha->uc.max_rec = dev->card->max_receive;
-    ha->uc.sspd = dev->card->link_speed;
-    put_unaligned_be16(dev->local_fifo >> 32, &ha->uc.fifo_hi);
-    put_unaligned_be32(dev->local_fifo & 0xffffffff, &ha->uc.fifo_lo);
+    ha.uc.uniq_id = cpu_to_be64(card->guid);
+    ha.uc.max_rec = dev->card->max_receive;
+    ha.uc.sspd = dev->card->link_speed;
+    ha.uc.fifo_hi = cpu_to_be16(dev->local_fifo >> 32);
+    ha.uc.fifo_lo = cpu_to_be32(dev->local_fifo & 0xffffffff);
+    dev_addr_set(net, ha.u);
 
     memset(net->broadcast, -1, net->addr_len);
diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c
index dddebea644bb..8a2febf33ce2 100644
--- a/drivers/media/dvb-core/dvb_net.c
+++ b/drivers/media/dvb-core/dvb_net.c
@@ -1008,7 +1008,7 @@ static u8 mask_promisc[6]={0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
 
 static int dvb_net_filter_sec_set(struct net_device *dev,
            struct dmx_section_filter **secfilter,
-           u8 *mac, u8 *mac_mask)
+           const u8 *mac, u8 *mac_mask)
 {
     struct dvb_net_priv *priv = netdev_priv(dev);
     int ret;
@@ -1052,7 +1052,7 @@ static int dvb_net_feed_start(struct net_device *dev)
     int ret = 0, i;
     struct dvb_net_priv *priv = netdev_priv(dev);
     struct dmx_demux *demux = priv->demux;
-    unsigned char *mac = (unsigned char *) dev->dev_addr;
+    const unsigned char *mac = (const unsigned char *) dev->dev_addr;
 
     netdev_dbg(dev, "rx_mode %i\n", priv->rx_mode);
     mutex_lock(&priv->mutex);
@@ -1272,7 +1272,7 @@ static int dvb_net_set_mac (struct net_device *dev, void *p)
     struct dvb_net_priv *priv = netdev_priv(dev);
     struct sockaddr *addr=p;
 
-    memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+    eth_hw_addr_set(dev, addr->sa_data);
 
     if (netif_running(dev))
         schedule_work(&priv->restart_net_feed_wq);
@@ -1367,7 +1367,7 @@ static int dvb_net_add_if(struct dvb_net *dvbnet, u16 pid, u8 feedtype)
          dvbnet->dvbdev->adapter->num, if_num);
 
     net->addr_len = 6;
-    memcpy(net->dev_addr, dvbnet->dvbdev->adapter->proposed_mac, 6);
+    eth_hw_addr_set(net, dvbnet->dvbdev->adapter->proposed_mac);
 
     dvbnet->device[if_num] = net;
diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c
index 3261cac762de..acdc257a900e 100644
--- a/drivers/message/fusion/mptlan.c
+++ b/drivers/message/fusion/mptlan.c
@@ -1350,7 +1350,7 @@ mpt_register_lan_device (MPT_ADAPTER *mpt_dev, int pnum)
     HWaddr[5] = a[0];
 
     dev->addr_len = FC_ALEN;
-    memcpy(dev->dev_addr, HWaddr, FC_ALEN);
+    dev_addr_set(dev, HWaddr);
     memset(dev->broadcast, 0xff, FC_ALEN);
 
     /* The Tx queue is 127 deep on the 909.
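Note: dev_addr_set() and eth_hw_addr_set() are the single choke points for writing a netdev's hardware address, in preparation for making dev->dev_addr const; dev_addr_set() takes the device's addr_len (as mptlan and firewire use for FC/FireWire addresses), while eth_hw_addr_set() is the ETH_ALEN variant. A minimal sketch of the driver-side conversion (hypothetical driver and address; not from this series):

    #include <linux/etherdevice.h>

    /* Sketch: instead of memcpy()ing into dev->dev_addr directly, build
     * the address in a local buffer and let the core copy it in.
     */
    static void example_assign_mac(struct net_device *dev)
    {
        u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };

        eth_hw_addr_set(dev, addr);    /* replaces memcpy(dev->dev_addr, ...) */
    }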
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 309371abfe23..ffce528aa00e 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -901,7 +901,7 @@ static int macb_mdiobus_register(struct macb *bp)
      * directly under the MAC node
      */
     child = of_get_child_by_name(np, "mdio");
-    if (np) {
+    if (child) {
         int ret = of_mdiobus_register(bp->mii_bus, child);
 
         of_node_put(child);
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 8ddf58f379ac..587def69a6f7 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -6261,32 +6261,13 @@ static void mvpp2_phylink_validate(struct phylink_config *config,
     struct mvpp2_port *port = mvpp2_phylink_to_port(config);
     __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
-    /* Invalid combinations */
-    switch (state->interface) {
-    case PHY_INTERFACE_MODE_10GBASER:
-    case PHY_INTERFACE_MODE_XAUI:
-        if (!mvpp2_port_supports_xlg(port))
-            goto empty_set;
-        break;
-    case PHY_INTERFACE_MODE_RGMII:
-    case PHY_INTERFACE_MODE_RGMII_ID:
-    case PHY_INTERFACE_MODE_RGMII_RXID:
-    case PHY_INTERFACE_MODE_RGMII_TXID:
-        if (!mvpp2_port_supports_rgmii(port))
-            goto empty_set;
-        break;
-    case PHY_INTERFACE_MODE_1000BASEX:
-    case PHY_INTERFACE_MODE_2500BASEX:
-        /* When in 802.3z mode, we must have AN enabled:
-         * Bit 2 Field InBandAnEn In-band Auto-Negotiation enable. ...
-         * When <PortType> = 1 (1000BASE-X) this field must be set to 1.
-         */
-        if (!phylink_test(state->advertising, Autoneg))
-            goto empty_set;
-        break;
-    default:
-        break;
-    }
+    /* When in 802.3z mode, we must have AN enabled:
+     * Bit 2 Field InBandAnEn In-band Auto-Negotiation enable. ...
+     * When <PortType> = 1 (1000BASE-X) this field must be set to 1.
+     */
+    if (phy_interface_mode_is_8023z(state->interface) &&
+        !phylink_test(state->advertising, Autoneg))
+        goto empty_set;
 
     phylink_set(mask, Autoneg);
     phylink_set_port_modes(mask);
@@ -6299,14 +6280,12 @@ static void mvpp2_phylink_validate(struct phylink_config *config,
     switch (state->interface) {
     case PHY_INTERFACE_MODE_10GBASER:
     case PHY_INTERFACE_MODE_XAUI:
-    case PHY_INTERFACE_MODE_NA:
         if (mvpp2_port_supports_xlg(port)) {
             phylink_set_10g_modes(mask);
             phylink_set(mask, 10000baseKR_Full);
         }
-        if (state->interface != PHY_INTERFACE_MODE_NA)
-            break;
-        fallthrough;
+        break;
+
     case PHY_INTERFACE_MODE_RGMII:
     case PHY_INTERFACE_MODE_RGMII_ID:
     case PHY_INTERFACE_MODE_RGMII_RXID:
@@ -6318,30 +6297,24 @@ static void mvpp2_phylink_validate(struct phylink_config *config,
         phylink_set(mask, 100baseT_Full);
         phylink_set(mask, 1000baseT_Full);
         phylink_set(mask, 1000baseX_Full);
-        if (state->interface != PHY_INTERFACE_MODE_NA)
-            break;
-        fallthrough;
+        break;
+
+    case PHY_INTERFACE_MODE_1000BASEX:
+        phylink_set(mask, 1000baseT_Full);
+        phylink_set(mask, 1000baseX_Full);
+        break;
+
     case PHY_INTERFACE_MODE_2500BASEX:
-        if (port->comphy ||
-            state->interface != PHY_INTERFACE_MODE_2500BASEX) {
-            phylink_set(mask, 1000baseT_Full);
-            phylink_set(mask, 1000baseX_Full);
-        }
-        if (port->comphy ||
-            state->interface == PHY_INTERFACE_MODE_2500BASEX) {
-            phylink_set(mask, 2500baseT_Full);
-            phylink_set(mask, 2500baseX_Full);
-        }
+        phylink_set(mask, 2500baseT_Full);
+        phylink_set(mask, 2500baseX_Full);
         break;
+
     default:
         goto empty_set;
     }
 
     linkmode_and(supported, supported, mask);
     linkmode_and(state->advertising, state->advertising, mask);
-
-    phylink_helper_basex_speed(state);
     return;
 
 empty_set:
@@ -6937,6 +6910,40 @@ static int mvpp2_port_probe(struct platform_device *pdev,
     port->phylink_config.dev = &dev->dev;
     port->phylink_config.type = PHYLINK_NETDEV;
 
+    if (mvpp2_port_supports_xlg(port)) {
+        __set_bit(PHY_INTERFACE_MODE_10GBASER,
+              port->phylink_config.supported_interfaces);
+        __set_bit(PHY_INTERFACE_MODE_XAUI,
+              port->phylink_config.supported_interfaces);
+    }
+
+    if (mvpp2_port_supports_rgmii(port))
+        phy_interface_set_rgmii(port->phylink_config.supported_interfaces);
+
+    if (comphy) {
+        /* If a COMPHY is present, we can support any of the
+         * serdes modes and switch between them.
+         */
+        __set_bit(PHY_INTERFACE_MODE_SGMII,
+              port->phylink_config.supported_interfaces);
+        __set_bit(PHY_INTERFACE_MODE_1000BASEX,
+              port->phylink_config.supported_interfaces);
+        __set_bit(PHY_INTERFACE_MODE_2500BASEX,
+              port->phylink_config.supported_interfaces);
+    } else if (phy_mode == PHY_INTERFACE_MODE_2500BASEX) {
+        /* No COMPHY, with only 2500BASE-X mode supported */
+        __set_bit(PHY_INTERFACE_MODE_2500BASEX,
+              port->phylink_config.supported_interfaces);
+    } else if (phy_mode == PHY_INTERFACE_MODE_1000BASEX ||
+           phy_mode == PHY_INTERFACE_MODE_SGMII) {
+        /* No COMPHY, we can switch between 1000BASE-X and SGMII
+         */
+        __set_bit(PHY_INTERFACE_MODE_1000BASEX,
+              port->phylink_config.supported_interfaces);
+        __set_bit(PHY_INTERFACE_MODE_SGMII,
+              port->phylink_config.supported_interfaces);
+    }
+
     phylink = phylink_create(&port->phylink_config, port_fwnode,
                  phy_mode, &mvpp2_phylink_ops);
     if (IS_ERR(phylink)) {
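For context, the mvpp2 probe hunk above follows the phylink pattern of declaring every PHY interface mode the MAC can drive in config->supported_interfaces, so phylink only ever asks the driver to validate modes inside that bitmap. A condensed sketch of the pattern for a hypothetical MAC driver (the example_* names are made up; __set_bit() and phy_interface_set_rgmii() are the real helpers used above):

    #include <linux/phy.h>
    #include <linux/phylink.h>

    /* Sketch: declare interface capabilities once at probe time. */
    static void example_fill_supported_interfaces(struct phylink_config *config,
                                                  bool has_comphy)
    {
        /* covers all four RGMII delay variants in one call */
        phy_interface_set_rgmii(config->supported_interfaces);

        if (has_comphy) {
            /* a serdes lane can be reconfigured between these modes */
            __set_bit(PHY_INTERFACE_MODE_SGMII, config->supported_interfaces);
            __set_bit(PHY_INTERFACE_MODE_1000BASEX, config->supported_interfaces);
            __set_bit(PHY_INTERFACE_MODE_2500BASEX, config->supported_interfaces);
        }
    }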
diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index f1cbe1f6ddec..dae95d9a07e8 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -484,7 +484,7 @@ static int at803x_set_wol(struct phy_device *phydev,
 static void at803x_get_wol(struct phy_device *phydev,
                struct ethtool_wolinfo *wol)
 {
-    u32 value;
+    int value;
 
     wol->supported = WAKE_MAGIC;
     wol->wolopts = 0;
diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c
index 9a6a8353e192..ff5d0e98a088 100644
--- a/drivers/net/thunderbolt.c
+++ b/drivers/net/thunderbolt.c
@@ -1202,17 +1202,19 @@ static void tbnet_generate_mac(struct net_device *dev)
 {
     const struct tbnet *net = netdev_priv(dev);
     const struct tb_xdomain *xd = net->xd;
+    u8 addr[ETH_ALEN];
     u8 phy_port;
     u32 hash;
 
     phy_port = tb_phy_port_from_link(TBNET_L0_PORT_NUM(xd->route));
 
     /* Unicast and locally administered MAC */
-    dev->dev_addr[0] = phy_port << 4 | 0x02;
+    addr[0] = phy_port << 4 | 0x02;
     hash = jhash2((u32 *)xd->local_uuid, 4, 0);
-    memcpy(dev->dev_addr + 1, &hash, sizeof(hash));
+    memcpy(addr + 1, &hash, sizeof(hash));
     hash = jhash2((u32 *)xd->local_uuid, 4, hash);
-    dev->dev_addr[5] = hash & 0xff;
+    addr[5] = hash & 0xff;
+    eth_hw_addr_set(dev, addr);
 }
 
 static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id)
diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c
index 5d24c1b6663b..d5785c0d06b0 100644
--- a/drivers/staging/octeon/ethernet.c
+++ b/drivers/staging/octeon/ethernet.c
@@ -409,7 +409,7 @@ int cvm_oct_common_init(struct net_device *dev)
     struct octeon_ethernet *priv = netdev_priv(dev);
     int ret;
 
-    ret = of_get_mac_address(priv->of_node, dev->dev_addr);
+    ret = of_get_ethdev_address(priv->of_node, dev);
     if (ret)
         eth_hw_addr_random(dev);
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 2746fd804216..3536ab432b30 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -517,6 +517,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 
 #define cgroup_bpf_enabled(atype) (0)
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ada02c4a4f51..22179b2fda72 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1302,7 +1302,7 @@ struct mini_Qdisc {
     struct tcf_block *block;
     struct gnet_stats_basic_sync __percpu *cpu_bstats;
     struct gnet_stats_queue __percpu *cpu_qstats;
-    struct rcu_head rcu;
+    unsigned long rcu_state;
 };
 
 static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
diff --git a/include/net/sock.h b/include/net/sock.h
index ff4e62aa62e5..4f0280ad6c23 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1210,6 +1210,8 @@ struct proto {
     unsigned int        inuse_idx;
 #endif
 
+    int         (*forward_alloc_get)(const struct sock *sk);
+
     bool        (*stream_memory_free)(const struct sock *sk, int wake);
     bool        (*stream_memory_read)(const struct sock *sk);
     /* Memory pressure */
@@ -1217,6 +1219,7 @@ struct proto {
     void        (*leave_memory_pressure)(struct sock *sk);
     atomic_long_t       *memory_allocated;  /* Current allocated memory. */
     struct percpu_counter   *sockets_allocated; /* Current number of sockets. */
+
     /*
      * Pressure flag: try to collapse.
      * Technical note: it is used by multiple contexts non atomically.
@@ -1294,6 +1297,14 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
 
+static inline int sk_forward_alloc_get(const struct sock *sk)
+{
+    if (!sk->sk_prot->forward_alloc_get)
+        return sk->sk_forward_alloc;
+
+    return sk->sk_prot->forward_alloc_get(sk);
+}
+
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
     if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1573,6 +1584,11 @@ static inline void sk_mem_charge(struct sock *sk, int size)
     sk->sk_forward_alloc -= size;
 }
 
+/* the following macros control memory reclaiming in sk_mem_uncharge()
+ */
+#define SK_RECLAIM_THRESHOLD    (1 << 21)
+#define SK_RECLAIM_CHUNK    (1 << 20)
+
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
     int reclaimable;
@@ -1589,8 +1605,8 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
      * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
      * no need to hold that much forward allocation anyway.
      */
-    if (unlikely(reclaimable >= 1 << 21))
-        __sk_mem_reclaim(sk, 1 << 20);
+    if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
+        __sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
 }
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
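The SK_RECLAIM_THRESHOLD/SK_RECLAIM_CHUNK names make the sk_mem_uncharge() hysteresis explicit: reclaimable forward allocation may grow to 2 MiB before 1 MiB is handed back to the global accounting, so reclaim does not run on every uncharge. A standalone user-space model of that behaviour (illustration only, not kernel code):

    #include <stdio.h>

    /* Model of the sk_mem_uncharge() hysteresis: each freed buffer grows
     * the forward allocation, and only once it crosses the 2 MiB threshold
     * is a 1 MiB chunk returned, amortizing the reclaim cost.
     */
    #define SK_RECLAIM_THRESHOLD (1 << 21) /* 2 MiB */
    #define SK_RECLAIM_CHUNK     (1 << 20) /* 1 MiB */

    int main(void)
    {
        int fwd_alloc = 0;

        for (int i = 0; i < 48; i++) {
            fwd_alloc += 64 * 1024;              /* uncharge 64 KiB */
            if (fwd_alloc >= SK_RECLAIM_THRESHOLD) {
                fwd_alloc -= SK_RECLAIM_CHUNK;   /* give back 1 MiB */
                printf("reclaim at step %d, kept %d bytes\n", i, fwd_alloc);
            }
        }
        return 0;
    }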
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 701587af6852..8e8c5922a7b0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -311,7 +311,7 @@ void tcp_shutdown(struct sock *sk, int how);
 int tcp_v4_early_demux(struct sk_buff *skb);
 int tcp_v4_rcv(struct sk_buff *skb);
 
-void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb);
+void tcp_remove_empty_skb(struct sock *sk);
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index cc92ccb38432..5ddc29f29bad 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -143,8 +143,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
     if (xdp_rxq->reg_state == REG_STATE_UNUSED)
         return;
 
-    WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
-
     xdp_rxq_info_unreg_mem_model(xdp_rxq);
 
     xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8eb428387bac..0189e3cd4a7d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -150,7 +150,7 @@ void inet_sock_destruct(struct sock *sk)
     WARN_ON(atomic_read(&sk->sk_rmem_alloc));
     WARN_ON(refcount_read(&sk->sk_wmem_alloc));
     WARN_ON(sk->sk_wmem_queued);
-    WARN_ON(sk->sk_forward_alloc);
+    WARN_ON(sk_forward_alloc_get(sk));
 
     kfree(rcu_dereference_protected(inet->inet_opt, 1));
     dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
@@ -769,26 +769,28 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
     DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
 
     sin->sin_family = AF_INET;
+    lock_sock(sk);
     if (peer) {
         if (!inet->inet_dport ||
             (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
-             peer == 1))
+             peer == 1)) {
+            release_sock(sk);
             return -ENOTCONN;
+        }
         sin->sin_port = inet->inet_dport;
         sin->sin_addr.s_addr = inet->inet_daddr;
-        BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                        CGROUP_INET4_GETPEERNAME,
-                        NULL);
+        BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+                       CGROUP_INET4_GETPEERNAME);
     } else {
         __be32 addr = inet->inet_rcv_saddr;
         if (!addr)
             addr = inet->inet_saddr;
         sin->sin_port = inet->inet_sport;
         sin->sin_addr.s_addr = addr;
-        BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                        CGROUP_INET4_GETSOCKNAME,
-                        NULL);
+        BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+                       CGROUP_INET4_GETSOCKNAME);
     }
+    release_sock(sk);
     memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
     return sizeof(*sin);
 }
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ef7897226f08..c8fa6e7f7d12 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -271,7 +271,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
         struct inet_diag_meminfo minfo = {
             .idiag_rmem = sk_rmem_alloc_get(sk),
             .idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
-            .idiag_fmem = sk->sk_forward_alloc,
+            .idiag_fmem = sk_forward_alloc_get(sk),
             .idiag_tmem = sk_wmem_alloc_get(sk),
         };
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d0b848ff5c0f..7a7b9aa8f19a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -658,10 +658,8 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
     struct tcp_sock *tp = tcp_sk(sk);
     struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
-    skb->csum = 0;
     tcb->seq = tcb->end_seq = tp->write_seq;
     tcb->tcp_flags = TCPHDR_ACK;
-    tcb->sacked = 0;
     __skb_header_release(skb);
     tcp_add_write_queue_tail(sk, skb);
     sk_wmem_queued_add(sk, skb->truesize);
@@ -876,11 +874,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
     }
     if (likely(mem_scheduled)) {
         skb_reserve(skb, MAX_TCP_HEADER);
-        /*
-         * Make sure that we have exactly size bytes
-         * available to the caller, no more, no less.
-         */
-        skb->reserved_tailroom = skb->end - skb->tail - size;
+        skb->ip_summed = CHECKSUM_PARTIAL;
         INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
         return skb;
     }
@@ -933,8 +927,10 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
 * users.
 */
-void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk)
 {
+    struct sk_buff *skb = tcp_write_queue_tail(sk);
+
     if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
         tcp_unlink_write_queue(skb, sk);
         if (tcp_write_queue_empty(sk))
@@ -996,7 +992,6 @@ new_segment:
         skb->truesize += copy;
         sk_wmem_queued_add(sk, copy);
         sk_mem_charge(sk, copy);
-        skb->ip_summed = CHECKSUM_PARTIAL;
         WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
         TCP_SKB_CB(skb)->end_seq += copy;
         tcp_skb_pcount_set(skb, 0);
@@ -1087,7 +1082,7 @@ out:
     return copied;
 
 do_error:
-    tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
+    tcp_remove_empty_skb(sk);
     if (copied)
         goto out;
 out_err:
@@ -1292,7 +1287,6 @@ new_segment:
                 goto wait_for_space;
 
             process_backlog++;
-            skb->ip_summed = CHECKSUM_PARTIAL;
 
             tcp_skb_entail(sk, skb);
             copy = size_goal;
@@ -1309,14 +1303,7 @@ new_segment:
         if (copy > msg_data_left(msg))
             copy = msg_data_left(msg);
 
-        /* Where to copy to? */
-        if (skb_availroom(skb) > 0 && !zc) {
-            /* We have some space in skb head. Superb! */
-            copy = min_t(int, copy, skb_availroom(skb));
-            err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
-            if (err)
-                goto do_fault;
-        } else if (!zc) {
+        if (!zc) {
             bool merge = true;
             int i = skb_shinfo(skb)->nr_frags;
             struct page_frag *pfrag = sk_page_frag(sk);
@@ -1415,9 +1402,7 @@ out_nopush:
     return copied + copied_syn;
 
 do_error:
-    skb = tcp_write_queue_tail(sk);
-do_fault:
-    tcp_remove_empty_skb(sk, skb);
+    tcp_remove_empty_skb(sk);
 
     if (copied + copied_syn)
         goto out;
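The tcp.c hunks hang together around one invariant: tcp_stream_alloc_skb() now marks every TX skb CHECKSUM_PARTIAL at birth, so the scattered per-path assignments become redundant, and tcp_remove_empty_skb() looks up the write-queue tail itself instead of trusting callers to pass it. A condensed sketch of the allocation-time invariant (simplified from the real function; error handling and memory scheduling omitted):

    /* Sketch: centralize checksum setup so every TX path inherits it. */
    static struct sk_buff *example_alloc_tx_skb(struct sock *sk, gfp_t gfp)
    {
        struct sk_buff *skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);

        if (!skb)
            return NULL;

        skb_reserve(skb, MAX_TCP_HEADER);
        skb->ip_summed = CHECKSUM_PARTIAL;  /* set once, never per call site */
        return skb;
    }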
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c0c55a8be8f7..6867e5db3e35 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -394,7 +394,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
     skb->ip_summed = CHECKSUM_PARTIAL;
 
     TCP_SKB_CB(skb)->tcp_flags = flags;
-    TCP_SKB_CB(skb)->sacked = 0;
 
     tcp_skb_pcount_set(skb, 1);
 
@@ -1590,8 +1589,6 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 
     skb_split(skb, buff, len);
 
-    buff->ip_summed = CHECKSUM_PARTIAL;
-
     buff->tstamp = skb->tstamp;
     tcp_fragment_tstamp(skb, buff);
 
@@ -1676,7 +1673,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
         delta_truesize = __pskb_trim_head(skb, len);
 
     TCP_SKB_CB(skb)->seq += len;
-    skb->ip_summed = CHECKSUM_PARTIAL;
 
     if (delta_truesize) {
         skb->truesize -= delta_truesize;
@@ -2142,12 +2138,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
     TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
     TCP_SKB_CB(buff)->tcp_flags = flags;
 
-    /* This packet was never sent out yet, so no SACK bits. */
-    TCP_SKB_CB(buff)->sacked = 0;
-
     tcp_skb_fragment_eor(skb, buff);
 
-    buff->ip_summed = CHECKSUM_PARTIAL;
     skb_split(skb, buff, len);
     tcp_fragment_tstamp(skb, buff);
 
@@ -2401,9 +2393,6 @@ static int tcp_mtu_probe(struct sock *sk)
     TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
     TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
     TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
-    TCP_SKB_CB(nskb)->sacked = 0;
-    nskb->csum = 0;
-    nskb->ip_summed = CHECKSUM_PARTIAL;
 
     tcp_insert_write_queue_before(nskb, skb, sk);
     tcp_highest_sack_replace(sk, skb, nskb);
@@ -3045,13 +3034,9 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
     BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
 
-    if (next_skb_size) {
-        if (next_skb_size <= skb_availroom(skb))
-            skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
-                      next_skb_size);
-        else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
-            return false;
-    }
+    if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
+        return false;
+
     tcp_highest_sack_replace(sk, next_skb, skb);
 
     /* Update sequence range on original skb. */
@@ -3757,7 +3742,6 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
     syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false);
     if (!syn_data)
         goto fallback;
-    syn_data->ip_summed = CHECKSUM_PARTIAL;
     memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
     if (space) {
         int copied = copy_from_iter(skb_put(syn_data, space), space,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b5878bb8e419..0c4da163535a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -521,31 +521,32 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
     sin->sin6_family = AF_INET6;
     sin->sin6_flowinfo = 0;
     sin->sin6_scope_id = 0;
+    lock_sock(sk);
     if (peer) {
-        if (!inet->inet_dport)
-            return -ENOTCONN;
-        if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
-            peer == 1)
+        if (!inet->inet_dport ||
+            (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+             peer == 1)) {
+            release_sock(sk);
             return -ENOTCONN;
+        }
         sin->sin6_port = inet->inet_dport;
         sin->sin6_addr = sk->sk_v6_daddr;
         if (np->sndflow)
             sin->sin6_flowinfo = np->flow_label;
-        BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                        CGROUP_INET6_GETPEERNAME,
-                        NULL);
+        BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+                       CGROUP_INET6_GETPEERNAME);
     } else {
         if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
             sin->sin6_addr = np->saddr;
         else
             sin->sin6_addr = sk->sk_v6_rcv_saddr;
         sin->sin6_port = inet->inet_sport;
-        BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-                        CGROUP_INET6_GETSOCKNAME,
-                        NULL);
+        BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+                       CGROUP_INET6_GETSOCKNAME);
     }
     sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
                          sk->sk_bound_dev_if);
+    release_sock(sk);
     return sizeof(*sin);
 }
 EXPORT_SYMBOL(inet6_getname);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9b9ef09382ab..3ae25b8ffbd6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6306,11 +6306,11 @@ static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 
 static struct ctl_table ipv6_route_table_template[] = {
     {
-        .procname   =   "flush",
-        .data       =   &init_net.ipv6.sysctl.flush_delay,
+        .procname   =   "max_size",
+        .data       =   &init_net.ipv6.sysctl.ip6_rt_max_size,
         .maxlen     =   sizeof(int),
-        .mode       =   0200,
-        .proc_handler   =   ipv6_sysctl_rtcache_flush
+        .mode       =   0644,
+        .proc_handler   =   proc_dointvec,
     },
     {
         .procname   =   "gc_thresh",
@@ -6320,11 +6320,11 @@ static struct ctl_table ipv6_route_table_template[] = {
         .proc_handler   =   proc_dointvec,
     },
     {
-        .procname   =   "max_size",
-        .data       =   &init_net.ipv6.sysctl.ip6_rt_max_size,
+        .procname   =   "flush",
+        .data       =   &init_net.ipv6.sysctl.flush_delay,
         .maxlen     =   sizeof(int),
-        .mode       =   0644,
-        .proc_handler   =   proc_dointvec,
+        .mode       =   0200,
+        .proc_handler   =   ipv6_sysctl_rtcache_flush
     },
     {
         .procname   =   "gc_min_interval",
@@ -6396,10 +6396,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
             GFP_KERNEL);
 
     if (table) {
-        table[0].data = &net->ipv6.sysctl.flush_delay;
-        table[0].extra1 = net;
+        table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
         table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
-        table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
+        table[2].data = &net->ipv6.sysctl.flush_delay;
+        table[2].extra1 = net;
         table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
         table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
         table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
@@ -6411,7 +6411,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 
         /* Don't export sysctls to unprivileged users */
         if (net->user_ns != &init_user_ns)
-            table[0].procname = NULL;
+            table[1].procname = NULL;
     }
 
     return table;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c678e778c1fb..2cc9b0e53ad1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -969,7 +969,6 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
     fl6.flowlabel = label;
 
     buff->ip_summed = CHECKSUM_PARTIAL;
-    buff->csum = 0;
 
     __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
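The route.c hunks only reorder entries inside ipv6_route_table_template, but ipv6_route_sysctl_init() patches the duplicated per-netns copy by array index, so the fixups must change in lockstep: table[0] is now "max_size" and table[2] is "flush". Roughly, the pattern looks like this (condensed sketch of the function, not its full body; the example_ name is made up):

    /* Sketch: per-netns sysctl setup duplicates the template, then rewires
     * .data by array index -- any template reordering must track these
     * indices, which is exactly what the hunk above does.
     */
    static struct ctl_table *example_route_sysctl_init(struct net *net)
    {
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template), GFP_KERNEL);
        if (!table)
            return NULL;

        table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;  /* "max_size" */
        table[2].data = &net->ipv6.sysctl.flush_delay;      /* "flush" */
        table[2].extra1 = net;  /* the flush handler needs its netns */
        return table;
    }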
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index cd6b11c9b54d..b7e32e316738 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
     __kfree_skb(skb);
 }
 
+static void mptcp_rmem_charge(struct sock *sk, int size)
+{
+    mptcp_sk(sk)->rmem_fwd_alloc -= size;
+}
+
 static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
                    struct sk_buff *from)
 {
@@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
     MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
     kfree_skb_partial(from, fragstolen);
     atomic_add(delta, &sk->sk_rmem_alloc);
-    sk_mem_charge(sk, delta);
+    mptcp_rmem_charge(sk, delta);
     return true;
 }
 
@@ -155,6 +160,44 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
     return mptcp_try_coalesce((struct sock *)msk, to, from);
 }
 
+static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
+{
+    amount >>= SK_MEM_QUANTUM_SHIFT;
+    mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+    __sk_mem_reduce_allocated(sk, amount);
+}
+
+static void mptcp_rmem_uncharge(struct sock *sk, int size)
+{
+    struct mptcp_sock *msk = mptcp_sk(sk);
+    int reclaimable;
+
+    msk->rmem_fwd_alloc += size;
+    reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
+
+    /* see sk_mem_uncharge() for the rationale behind the following schema */
+    if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
+        __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
+}
+
+static void mptcp_rfree(struct sk_buff *skb)
+{
+    unsigned int len = skb->truesize;
+    struct sock *sk = skb->sk;
+
+    atomic_sub(len, &sk->sk_rmem_alloc);
+    mptcp_rmem_uncharge(sk, len);
+}
+
+static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+    skb_orphan(skb);
+    skb->sk = sk;
+    skb->destructor = mptcp_rfree;
+    atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+    mptcp_rmem_charge(sk, skb->truesize);
+}
+
 /* "inspired" by tcp_data_queue_ofo(), main differences:
  * - use mptcp seqs
  * - don't cope with sacks
@@ -267,7 +310,29 @@ merge_right:
 
 end:
     skb_condense(skb);
-    skb_set_owner_r(skb, sk);
+    mptcp_set_owner_r(skb, sk);
+}
+
+static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
+{
+    struct mptcp_sock *msk = mptcp_sk(sk);
+    int amt, amount;
+
+    if (size < msk->rmem_fwd_alloc)
+        return true;
+
+    amt = sk_mem_pages(size);
+    amount = amt << SK_MEM_QUANTUM_SHIFT;
+    msk->rmem_fwd_alloc += amount;
+    if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
+        if (ssk->sk_forward_alloc < amount) {
+            msk->rmem_fwd_alloc -= amount;
+            return false;
+        }
+
+        ssk->sk_forward_alloc -= amount;
+    }
+    return true;
 }
 
 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -285,15 +350,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
     skb_orphan(skb);
 
     /* try to fetch required memory from subflow */
-    if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-        int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
-
-        if (ssk->sk_forward_alloc < amount)
-            goto drop;
-
-        ssk->sk_forward_alloc -= amount;
-        sk->sk_forward_alloc += amount;
-    }
+    if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
+        goto drop;
 
     has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
 
@@ -313,7 +371,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
         if (tail && mptcp_try_coalesce(sk, tail, skb))
             return true;
 
-        skb_set_owner_r(skb, sk);
+        mptcp_set_owner_r(skb, sk);
         __skb_queue_tail(&sk->sk_receive_queue, skb);
         return true;
     } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
@@ -908,122 +966,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
            df->data_seq + df->data_len == msk->write_seq;
 }
 
-static int mptcp_wmem_with_overhead(int size)
-{
-    return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
-}
-
-static void __mptcp_wmem_reserve(struct sock *sk, int size)
-{
-    int amount = mptcp_wmem_with_overhead(size);
-    struct mptcp_sock *msk = mptcp_sk(sk);
-
-    WARN_ON_ONCE(msk->wmem_reserved);
-    if (WARN_ON_ONCE(amount < 0))
-        amount = 0;
-
-    if (amount <= sk->sk_forward_alloc)
-        goto reserve;
-
-    /* under memory pressure try to reserve at most a single page
-     * otherwise try to reserve the full estimate and fallback
-     * to a single page before entering the error path
-     */
-    if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
-        !sk_wmem_schedule(sk, amount)) {
-        if (amount <= PAGE_SIZE)
-            goto nomem;
-
-        amount = PAGE_SIZE;
-        if (!sk_wmem_schedule(sk, amount))
-            goto nomem;
-    }
-
-reserve:
-    msk->wmem_reserved = amount;
-    sk->sk_forward_alloc -= amount;
-    return;
-
-nomem:
-    /* we will wait for memory on next allocation */
-    msk->wmem_reserved = -1;
-}
-
-static void __mptcp_update_wmem(struct sock *sk)
+static void __mptcp_mem_reclaim_partial(struct sock *sk)
 {
-    struct mptcp_sock *msk = mptcp_sk(sk);
+    int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
 
     lockdep_assert_held_once(&sk->sk_lock.slock);
 
-    if (!msk->wmem_reserved)
-        return;
-
-    if (msk->wmem_reserved < 0)
-        msk->wmem_reserved = 0;
-    if (msk->wmem_reserved > 0) {
-        sk->sk_forward_alloc += msk->wmem_reserved;
-        msk->wmem_reserved = 0;
-    }
-}
-
-static bool mptcp_wmem_alloc(struct sock *sk, int size)
-{
-    struct mptcp_sock *msk = mptcp_sk(sk);
-
-    /* check for pre-existing error condition */
-    if (msk->wmem_reserved < 0)
-        return false;
-
-    if (msk->wmem_reserved >= size)
-        goto account;
-
-    mptcp_data_lock(sk);
-    if (!sk_wmem_schedule(sk, size)) {
-        mptcp_data_unlock(sk);
-        return false;
-    }
-
-    sk->sk_forward_alloc -= size;
-    msk->wmem_reserved += size;
-    mptcp_data_unlock(sk);
-
-account:
-    msk->wmem_reserved -= size;
-    return true;
-}
-
-static void mptcp_wmem_uncharge(struct sock *sk, int size)
-{
-    struct mptcp_sock *msk = mptcp_sk(sk);
-
-    if (msk->wmem_reserved < 0)
-        msk->wmem_reserved = 0;
-    msk->wmem_reserved += size;
-}
-
-static void __mptcp_mem_reclaim_partial(struct sock *sk)
-{
-    lockdep_assert_held_once(&sk->sk_lock.slock);
-    __mptcp_update_wmem(sk);
+    __mptcp_rmem_reclaim(sk, reclaimable - 1);
     sk_mem_reclaim_partial(sk);
 }
 
 static void mptcp_mem_reclaim_partial(struct sock *sk)
 {
-    struct mptcp_sock *msk = mptcp_sk(sk);
-
-    /* if we are experiencing a transint allocation error,
-     * the forward allocation memory has been already
-     * released
-     */
-    if (msk->wmem_reserved < 0)
-        return;
-
     mptcp_data_lock(sk);
-    sk->sk_forward_alloc += msk->wmem_reserved;
-    sk_mem_reclaim_partial(sk);
-    msk->wmem_reserved = sk->sk_forward_alloc;
-    sk->sk_forward_alloc = 0;
+    __mptcp_mem_reclaim_partial(sk);
     mptcp_data_unlock(sk);
 }
 
@@ -1218,7 +1174,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
     if (likely(skb)) {
         if (likely(__mptcp_add_ext(skb, gfp))) {
             skb_reserve(skb, MAX_TCP_HEADER);
-            skb->reserved_tailroom = skb->end - skb->tail;
+            skb->ip_summed = CHECKSUM_PARTIAL;
             INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
             return skb;
         }
@@ -1335,7 +1291,7 @@ alloc_skb:
         u64 snd_una = READ_ONCE(msk->snd_una);
 
         if (snd_una != msk->snd_nxt) {
-            tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
+            tcp_remove_empty_skb(ssk);
             return 0;
         }
 
@@ -1351,7 +1307,7 @@ alloc_skb:
     copy = min_t(size_t, copy, info->limit - info->sent);
     if (!sk_wmem_schedule(ssk, copy)) {
-        tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
+        tcp_remove_empty_skb(ssk);
         return -ENOMEM;
     }
 
@@ -1367,7 +1323,6 @@ alloc_skb:
     skb->truesize += copy;
     sk_wmem_queued_add(ssk, copy);
     sk_mem_charge(ssk, copy);
-    skb->ip_summed = CHECKSUM_PARTIAL;
     WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
     TCP_SKB_CB(skb)->end_seq += copy;
     tcp_skb_pcount_set(skb, 0);
@@ -1513,8 +1468,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
     return NULL;
 }
 
-static void mptcp_push_release(struct sock *sk, struct sock *ssk,
-                   struct mptcp_sendmsg_info *info)
+static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
 {
     tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
     release_sock(ssk);
@@ -1577,7 +1531,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
              * the last round, release prev_ssk
              */
            if (ssk != prev_ssk && prev_ssk)
-                mptcp_push_release(sk, prev_ssk, &info);
+                mptcp_push_release(prev_ssk, &info);
            if (!ssk)
                goto out;
 
@@ -1590,7 +1544,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
            ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
            if (ret <= 0) {
-                mptcp_push_release(sk, ssk, &info);
+                mptcp_push_release(ssk, &info);
                goto out;
            }
 
@@ -1605,7 +1559,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 
     /* at this point we held the socket lock for the last subflow we used */
     if (ssk)
-        mptcp_push_release(sk, ssk, &info);
+        mptcp_push_release(ssk, &info);
 
 out:
     /* ensure the rtx timer is running */
@@ -1664,7 +1618,6 @@ out:
     /* __mptcp_alloc_tx_skb could have released some wmem and we are
      * not going to flush it via release_sock()
      */
-    __mptcp_update_wmem(sk);
     if (copied) {
         tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
              info.size_goal);
@@ -1701,7 +1654,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
     /* silently ignore everything else */
     msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
 
-    mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
+    lock_sock(sk);
 
     timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
@@ -1749,17 +1702,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
         psize = min_t(size_t, psize, msg_data_left(msg));
         total_ts = psize + frag_truesize;
 
-        if (!mptcp_wmem_alloc(sk, total_ts))
+        if (!sk_wmem_schedule(sk, total_ts))
             goto wait_for_memory;
 
         if (copy_page_from_iter(dfrag->page, offset, psize,
                     &msg->msg_iter) != psize) {
-            mptcp_wmem_uncharge(sk, psize + frag_truesize);
             ret = -EFAULT;
             goto out;
         }
 
         /* data successfully copied into the write queue */
+        sk->sk_forward_alloc -= total_ts;
         copied += psize;
         dfrag->data_len += psize;
         frag_truesize += psize;
@@ -1956,7 +1909,7 @@ static void __mptcp_update_rmem(struct sock *sk)
         return;
 
     atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
-    sk_mem_uncharge(sk, msk->rmem_released);
+    mptcp_rmem_uncharge(sk, msk->rmem_released);
     WRITE_ONCE(msk->rmem_released, 0);
 }
 
@@ -2024,7 +1977,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
     if (unlikely(flags & MSG_ERRQUEUE))
         return inet_recv_error(sk, msg, len, addr_len);
 
-    mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
+    lock_sock(sk);
     if (unlikely(sk->sk_state == TCP_LISTEN)) {
         copied = -ENOTCONN;
         goto out_err;
@@ -2504,7 +2457,7 @@ static int __mptcp_init_sock(struct sock *sk)
     __skb_queue_head_init(&msk->receive_queue);
     msk->out_of_order_queue = RB_ROOT;
     msk->first_pending = NULL;
-    msk->wmem_reserved = 0;
+    msk->rmem_fwd_alloc = 0;
     WRITE_ONCE(msk->rmem_released, 0);
     msk->timer_ival = TCP_RTO_MIN;
 
@@ -2715,7 +2668,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 
     sk->sk_prot->destroy(sk);
 
-    WARN_ON_ONCE(msk->wmem_reserved);
+    WARN_ON_ONCE(msk->rmem_fwd_alloc);
     WARN_ON_ONCE(msk->rmem_released);
     sk_stream_kill_queues(sk);
     xfrm_sk_free_policy(sk);
@@ -2948,8 +2901,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
 
     /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
     skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
-
+    __skb_queue_purge(&sk->sk_receive_queue);
     skb_rbtree_purge(&msk->out_of_order_queue);
+
+    /* move all the rx fwd alloc into the sk_mem_reclaim_final in
+     * inet_sock_destruct() will dispose it
+     */
+    sk->sk_forward_alloc += msk->rmem_fwd_alloc;
+    msk->rmem_fwd_alloc = 0;
     mptcp_token_destroy(msk);
     mptcp_pm_free_anno_list(msk);
 }
@@ -3031,10 +2990,6 @@ static void mptcp_release_cb(struct sock *sk)
     if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
         __mptcp_error_report(sk);
 
-    /* push_pending may touch wmem_reserved, ensure we do the cleanup
-     * later
-     */
-    __mptcp_update_wmem(sk);
     __mptcp_update_rmem(sk);
 }
 
@@ -3184,6 +3139,11 @@ static void mptcp_shutdown(struct sock *sk, int how)
         __mptcp_wr_shutdown(sk);
 }
 
+static int mptcp_forward_alloc_get(const struct sock *sk)
+{
+    return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
+}
+
 static struct proto mptcp_prot = {
     .name       = "MPTCP",
     .owner      = THIS_MODULE,
@@ -3201,6 +3161,7 @@ static struct proto mptcp_prot = {
     .hash       = mptcp_hash,
     .unhash     = mptcp_unhash,
     .get_port   = mptcp_get_port,
+    .forward_alloc_get  = mptcp_forward_alloc_get,
     .sockets_allocated  = &mptcp_sockets_allocated,
     .memory_allocated   = &tcp_memory_allocated,
     .memory_pressure    = &tcp_memory_pressure,
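The core of the MPTCP change is that receive forward-allocation now lives in msk->rmem_fwd_alloc and is maintained through a private skb destructor instead of sk->sk_forward_alloc, which is why mptcp_prot must expose forward_alloc_get so inet_diag and em_meta still see the full amount. Stripped to its skeleton, the ownership pattern is (a sketch condensed from the hunks above; reclaim, scheduling and locking elided):

    /* Sketch: account rmem against a protocol-private counter via a
     * custom destructor, mirroring mptcp_rfree()/mptcp_set_owner_r().
     */
    static void example_rfree(struct sk_buff *skb)
    {
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
        mptcp_sk(sk)->rmem_fwd_alloc += skb->truesize;  /* uncharge */
    }

    static void example_set_owner_r(struct sk_buff *skb, struct sock *sk)
    {
        skb_orphan(skb);                  /* detach any previous owner */
        skb->sk = sk;
        skb->destructor = example_rfree;  /* runs when the skb is freed */
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        mptcp_sk(sk)->rmem_fwd_alloc -= skb->truesize;  /* charge */
    }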
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 284fdcec067e..67a61ac48b20 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -227,7 +227,7 @@ struct mptcp_sock {
     u64     ack_seq;
     u64     rcv_wnd_sent;
     u64     rcv_data_fin_seq;
-    int     wmem_reserved;
+    int     rmem_fwd_alloc;
     struct sock *last_snd;
     int     snd_burst;
     int     old_wspace;
@@ -272,19 +272,6 @@ struct mptcp_sock {
     char    ca_name[TCP_CA_NAME_MAX];
 };
 
-#define mptcp_lock_sock(___sk, cb)                      \
-    do {                                    \
-        struct sock *__sk = (___sk); /* silence macro reuse warning */ \
-        might_sleep();                          \
-        spin_lock_bh(&__sk->sk_lock.slock);             \
-        if (__sk->sk_lock.owned)                    \
-            __lock_sock(__sk);                  \
-        cb;                             \
-        __sk->sk_lock.owned = 1;                    \
-        spin_unlock(&__sk->sk_lock.slock);              \
-        mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_);      \
-        local_bh_enable();                      \
-    } while (0)
-
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
 #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 46254968d390..0a04468b7314 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -457,7 +457,7 @@ META_COLLECTOR(int_sk_fwd_alloc)
         *err = -1;
         return;
     }
-    dst->value = sk->sk_forward_alloc;
+    dst->value = sk_forward_alloc_get(sk);
 }
 
 META_COLLECTOR(int_sk_sndbuf)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index b0ff0dff2773..3b0f62095803 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1487,10 +1487,6 @@ void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
 }
 EXPORT_SYMBOL(psched_ppscfg_precompute);
 
-static void mini_qdisc_rcu_func(struct rcu_head *head)
-{
-}
-
 void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
               struct tcf_proto *tp_head)
 {
@@ -1503,28 +1499,30 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
 
     if (!tp_head) {
         RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
-        /* Wait for flying RCU callback before it is freed. */
-        rcu_barrier();
-        return;
-    }
+    } else {
+        miniq = miniq_old != &miniqp->miniq1 ?
+            &miniqp->miniq1 : &miniqp->miniq2;
 
-    miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
-        &miniqp->miniq1 : &miniqp->miniq2;
+        /* We need to make sure that readers won't see the miniq
+         * we are about to modify. So ensure that at least one RCU
+         * grace period has elapsed since the miniq was made
+         * inactive.
+         */
+        if (IS_ENABLED(CONFIG_PREEMPT_RT))
+            cond_synchronize_rcu(miniq->rcu_state);
+        else if (!poll_state_synchronize_rcu(miniq->rcu_state))
+            synchronize_rcu_expedited();
 
-    /* We need to make sure that readers won't see the miniq
-     * we are about to modify. So wait until previous call_rcu callback
-     * is done.
-     */
-    rcu_barrier();
-    miniq->filter_list = tp_head;
-    rcu_assign_pointer(*miniqp->p_miniq, miniq);
+        miniq->filter_list = tp_head;
+        rcu_assign_pointer(*miniqp->p_miniq, miniq);
+    }
 
     if (miniq_old)
-        /* This is counterpart of the rcu barriers above. We need to
+        /* This is counterpart of the rcu sync above. We need to
          * block potential new user of miniq_old until all readers
          * are not seeing it.
          */
-        call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
+        miniq_old->rcu_state = start_poll_synchronize_rcu();
 }
 EXPORT_SYMBOL(mini_qdisc_pair_swap);
@@ -1543,6 +1541,8 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
     miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
     miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
     miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
+    miniqp->miniq1.rcu_state = get_state_synchronize_rcu();
+    miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state;
     miniqp->p_miniq = p_miniq;
 }
 EXPORT_SYMBOL(mini_qdisc_pair_init);
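The sch_generic.c conversion swaps rcu_barrier()/call_rcu() for the polled grace-period API: when a miniq goes inactive its grace-period cookie is recorded, and the next writer blocks only if that grace period has not yet elapsed. The idiom in isolation (a sketch under a generic pooled-object scenario; the RCU functions are the stock API, everything else is hypothetical):

    #include <linux/rcupdate.h>

    struct pooled_obj {
        unsigned long rcu_state;  /* grace-period cookie */
        /* ... payload ... */
    };

    /* Call right after the object is unlinked from all RCU-visible
     * pointers; snapshots the current grace-period state.
     */
    static void example_retire(struct pooled_obj *obj)
    {
        obj->rcu_state = start_poll_synchronize_rcu();
    }

    /* Call before modifying the object for reuse: usually a full grace
     * period has long elapsed and this returns immediately.
     */
    static void example_reuse(struct pooled_obj *obj)
    {
        if (!poll_state_synchronize_rcu(obj->rcu_state))
            synchronize_rcu();  /* wait only if readers may remain */
        /* safe to modify: no reader can still see the old contents */
    }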