diff options
-rw-r--r-- | Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml | 11 | ||||
-rw-r--r-- | Documentation/netlink/specs/ovs_datapath.yaml | 30 | ||||
-rw-r--r-- | Documentation/netlink/specs/ovs_flow.yaml | 68 | ||||
-rw-r--r-- | Documentation/netlink/specs/ovs_vport.yaml | 13 | ||||
-rw-r--r-- | Documentation/networking/ip-sysctl.rst | 15 | ||||
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | crypto/af_alg.c | 2 | ||||
-rw-r--r-- | drivers/net/phy/at803x.c | 44 | ||||
-rw-r--r-- | include/linux/net_mm.h | 17 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 1 | ||||
-rw-r--r-- | include/net/sch_generic.h | 14 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | mm/memory.c | 7 | ||||
-rw-r--r-- | net/core/gro.c | 32 | ||||
-rw-r--r-- | net/devlink/leftover.c | 5 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 45 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 60 | ||||
-rw-r--r-- | net/ipv6/route.c | 13 | ||||
-rw-r--r-- | net/kcm/kcmsock.c | 4 | ||||
-rw-r--r-- | net/mctp/route.c | 3 |
22 files changed, 274 insertions, 123 deletions
diff --git a/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml b/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml index d71fa9de2b64..8a3713abd1ca 100644 --- a/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml +++ b/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml @@ -17,11 +17,12 @@ properties: maxlinear,use-broken-interrupts: description: | Interrupts are broken on some GPY2xx PHYs in that they keep the - interrupt line asserted even after the interrupt status register is - cleared. Thus it is blocking the interrupt line which is usually bad - for shared lines. By default interrupts are disabled for this PHY and - polling mode is used. If one can live with the consequences, this - property can be used to enable interrupt handling. + interrupt line asserted for a random amount of time even after the + interrupt status register is cleared. Thus it is blocking the + interrupt line which is usually bad for shared lines. By default, + interrupts are disabled for this PHY and polling mode is used. If one + can live with the consequences, this property can be used to enable + interrupt handling. Affected PHYs (as far as known) are GPY215B and GPY215C. type: boolean diff --git a/Documentation/netlink/specs/ovs_datapath.yaml b/Documentation/netlink/specs/ovs_datapath.yaml index 6d71db8c4416..f709c26c3e92 100644 --- a/Documentation/netlink/specs/ovs_datapath.yaml +++ b/Documentation/netlink/specs/ovs_datapath.yaml @@ -3,6 +3,7 @@ name: ovs_datapath version: 2 protocol: genetlink-legacy +uapi-header: linux/openvswitch.h doc: OVS datapath configuration over generic netlink. @@ -18,6 +19,7 @@ definitions: - name: user-features type: flags + name-prefix: ovs-dp-f- entries: - name: unaligned @@ -33,35 +35,37 @@ definitions: doc: Allow per-cpu dispatch of upcalls - name: datapath-stats + enum-name: ovs-dp-stats type: struct members: - - name: hit + name: n-hit type: u64 - - name: missed + name: n-missed type: u64 - - name: lost + name: n-lost type: u64 - - name: flows + name: n-flows type: u64 - name: megaflow-stats + enum-name: ovs-dp-megaflow-stats type: struct members: - - name: mask-hit + name: n-mask-hit type: u64 - - name: masks + name: n-masks type: u32 - name: padding type: u32 - - name: cache-hits + name: n-cache-hit type: u64 - name: pad1 @@ -70,6 +74,8 @@ definitions: attribute-sets: - name: datapath + name-prefix: ovs-dp-attr- + enum-name: ovs-datapath-attrs attributes: - name: name @@ -101,12 +107,16 @@ attribute-sets: name: per-cpu-pids type: binary sub-type: u32 + - + name: ifindex + type: u32 operations: fixed-header: ovs-header + name-prefix: ovs-dp-cmd- list: - - name: dp-get + name: get doc: Get / dump OVS data path configuration and state value: 3 attribute-set: datapath @@ -125,7 +135,7 @@ operations: - per-cpu-pids dump: *dp-get-op - - name: dp-new + name: new doc: Create new OVS data path value: 1 attribute-set: datapath @@ -137,7 +147,7 @@ operations: - upcall-pid - user-features - - name: dp-del + name: del doc: Delete existing OVS data path value: 2 attribute-set: datapath diff --git a/Documentation/netlink/specs/ovs_flow.yaml b/Documentation/netlink/specs/ovs_flow.yaml index 3b0624c87074..1ecbcd117385 100644 --- a/Documentation/netlink/specs/ovs_flow.yaml +++ b/Documentation/netlink/specs/ovs_flow.yaml @@ -3,6 +3,7 @@ name: ovs_flow version: 1 protocol: genetlink-legacy +uapi-header: linux/openvswitch.h doc: OVS flow configuration over generic netlink. @@ -67,6 +68,7 @@ definitions: enum: ovs-frag-type - name: ovs-frag-type + name-prefix: ovs-frag-type- type: enum entries: - @@ -166,6 +168,7 @@ definitions: doc: Tag control identifier (TCI) to push. - name: ovs-ufid-flags + name-prefix: ovs-ufid-f- type: flags entries: - omit-key @@ -176,7 +179,7 @@ definitions: type: struct members: - - name: hash-algorithm + name: hash-alg type: u32 doc: Algorithm used to compute hash prior to recirculation. - @@ -198,13 +201,13 @@ definitions: type: struct members: - - name: lse + name: mpls-lse type: u32 byte-order: big-endian doc: | MPLS label stack entry to push - - name: ethertype + name: mpls-ethertype type: u32 byte-order: big-endian doc: | @@ -216,13 +219,13 @@ definitions: type: struct members: - - name: lse + name: mpls-lse type: u32 byte-order: big-endian doc: | MPLS label stack entry to push - - name: ethertype + name: mpls-ethertype type: u32 byte-order: big-endian doc: | @@ -237,6 +240,7 @@ definitions: - name: ct-state-flags type: flags + name-prefix: ovs-cs-f- entries: - name: new @@ -266,6 +270,8 @@ definitions: attribute-sets: - name: flow-attrs + enum-name: ovs-flow-attr + name-prefix: ovs-flow-attr- attributes: - name: key @@ -352,6 +358,8 @@ attribute-sets: - name: key-attrs + enum-name: ovs-key-attr + name-prefix: ovs-key-attr- attributes: - name: encap @@ -481,6 +489,8 @@ attribute-sets: doc: struct ovs_key_ipv6_exthdr - name: action-attrs + enum-name: ovs-action-attr + name-prefix: ovs-action-attr- attributes: - name: output @@ -608,6 +618,8 @@ attribute-sets: nested-attributes: dec-ttl-attrs - name: tunnel-key-attrs + enum-name: ovs-tunnel-key-attr + name-prefix: ovs-tunnel-key-attr- attributes: - name: id @@ -676,6 +688,8 @@ attribute-sets: type: flag - name: check-pkt-len-attrs + enum-name: ovs-check-pkt-len-attr + name-prefix: ovs-check-pkt-len-attr- attributes: - name: pkt-len @@ -690,6 +704,8 @@ attribute-sets: nested-attributes: action-attrs - name: sample-attrs + enum-name: ovs-sample-attr + name-prefix: ovs-sample-attr- attributes: - name: probability @@ -700,6 +716,8 @@ attribute-sets: nested-attributes: action-attrs - name: userspace-attrs + enum-name: ovs-userspace-attr + name-prefix: ovs-userspace-attr- attributes: - name: pid @@ -715,6 +733,8 @@ attribute-sets: type: flag - name: ovs-nsh-key-attrs + enum-name: ovs-nsh-key-attr + name-prefix: ovs-nsh-key-attr- attributes: - name: base @@ -727,6 +747,8 @@ attribute-sets: type: binary - name: ct-attrs + enum-name: ovs-ct-attr + name-prefix: ovs-ct-attr- attributes: - name: commit @@ -758,13 +780,15 @@ attribute-sets: type: string - name: nat-attrs + enum-name: ovs-nat-attr + name-prefix: ovs-nat-attr- attributes: - name: src - type: binary + type: flag - name: dst - type: binary + type: flag - name: ip-min type: binary @@ -773,21 +797,23 @@ attribute-sets: type: binary - name: proto-min - type: binary + type: u16 - name: proto-max - type: binary + type: u16 - name: persistent - type: binary + type: flag - name: proto-hash - type: binary + type: flag - name: proto-random - type: binary + type: flag - name: dec-ttl-attrs + enum-name: ovs-dec-ttl-attr + name-prefix: ovs-dec-ttl-attr- attributes: - name: action @@ -795,16 +821,19 @@ attribute-sets: nested-attributes: action-attrs - name: vxlan-ext-attrs + enum-name: ovs-vxlan-ext- + name-prefix: ovs-vxlan-ext- attributes: - name: gbp type: u32 operations: + name-prefix: ovs-flow-cmd- fixed-header: ovs-header list: - - name: flow-get + name: get doc: Get / dump OVS flow configuration and state value: 3 attribute-set: flow-attrs @@ -824,6 +853,19 @@ operations: - stats - actions dump: *flow-get-op + - + name: new + doc: Create OVS flow configuration in a data path + value: 1 + attribute-set: flow-attrs + do: + request: + attributes: + - dp-ifindex + - key + - ufid + - mask + - actions mcast-groups: list: diff --git a/Documentation/netlink/specs/ovs_vport.yaml b/Documentation/netlink/specs/ovs_vport.yaml index 8e55622ddf11..17336455bec1 100644 --- a/Documentation/netlink/specs/ovs_vport.yaml +++ b/Documentation/netlink/specs/ovs_vport.yaml @@ -3,6 +3,7 @@ name: ovs_vport version: 2 protocol: genetlink-legacy +uapi-header: linux/openvswitch.h doc: OVS vport configuration over generic netlink. @@ -18,10 +19,13 @@ definitions: - name: vport-type type: enum + enum-name: ovs-vport-type + name-prefix: ovs-vport-type- entries: [ unspec, netdev, internal, gre, vxlan, geneve ] - name: vport-stats type: struct + enum-name: ovs-vport-stats members: - name: rx-packets @@ -51,6 +55,8 @@ definitions: attribute-sets: - name: vport-options + enum-name: ovs-vport-options + name-prefix: ovs-tunnel-attr- attributes: - name: dst-port @@ -60,6 +66,8 @@ attribute-sets: type: u32 - name: upcall-stats + enum-name: ovs-vport-upcall-attr + name-prefix: ovs-vport-upcall-attr- attributes: - name: success @@ -70,6 +78,8 @@ attribute-sets: type: u64 - name: vport + name-prefix: ovs-vport-attr- + enum-name: ovs-vport-attr attributes: - name: port-no @@ -108,9 +118,10 @@ attribute-sets: nested-attributes: upcall-stats operations: + name-prefix: ovs-vport-cmd- list: - - name: vport-get + name: get doc: Get / dump OVS vport configuration and state value: 3 attribute-set: vport diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 366e2a5097d9..4a010a7cde7f 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -981,6 +981,21 @@ tcp_tw_reuse - INTEGER tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. +tcp_shrink_window - BOOLEAN + This changes how the TCP receive window is calculated. + + RFC 7323, section 2.4, says there are instances when a retracted + window can be offered, and that TCP implementations MUST ensure + that they handle a shrinking window, as specified in RFC 1122. + + - 0 - Disabled. The window is never shrunk. + - 1 - Enabled. The window is shrunk when necessary to remain within + the memory limit set by autotuning (sk_rcvbuf). + This only occurs if a non-zero receive window + scaling factor is also in effect. + + Default: 0 + tcp_wmem - vector of 3 INTEGERs: min, default, max min: Amount of memory reserved for send buffers for TCP sockets. Each TCP socket has rights to use it due to fact of its birth. diff --git a/MAINTAINERS b/MAINTAINERS index 7322963b0670..cb14589d14ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14743,6 +14743,7 @@ NETWORKING [TCP] M: Eric Dumazet <[email protected]> S: Maintained +F: include/linux/net_mm.h F: include/linux/tcp.h F: include/net/tcp.h F: include/trace/events/tcp.h diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 7d4b6016b83d..cdb1dcc5dd1a 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1043,7 +1043,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, }; plen = extract_iter_to_sg(&msg->msg_iter, len, &sgtable, - MAX_SGL_ENTS, 0); + MAX_SGL_ENTS - sgl->cur, 0); if (plen < 0) { err = plen; goto unlock; diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 656136628ffd..c1f307d90518 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -304,7 +304,6 @@ struct at803x_priv { bool is_1000basex; struct regulator_dev *vddio_rdev; struct regulator_dev *vddh_rdev; - struct regulator *vddio; u64 stats[ARRAY_SIZE(at803x_hw_stats)]; }; @@ -824,11 +823,11 @@ static int at803x_parse_dt(struct phy_device *phydev) if (ret < 0) return ret; - priv->vddio = devm_regulator_get_optional(&phydev->mdio.dev, - "vddio"); - if (IS_ERR(priv->vddio)) { + ret = devm_regulator_get_enable_optional(&phydev->mdio.dev, + "vddio"); + if (ret) { phydev_err(phydev, "failed to get VDDIO regulator\n"); - return PTR_ERR(priv->vddio); + return ret; } /* Only AR8031/8033 support 1000Base-X for SFP modules */ @@ -856,12 +855,6 @@ static int at803x_probe(struct phy_device *phydev) if (ret) return ret; - if (priv->vddio) { - ret = regulator_enable(priv->vddio); - if (ret < 0) - return ret; - } - if (phydev->drv->phy_id == ATH8031_PHY_ID) { int ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG); int mode_cfg; @@ -869,10 +862,8 @@ static int at803x_probe(struct phy_device *phydev) .wolopts = 0, }; - if (ccr < 0) { - ret = ccr; - goto err; - } + if (ccr < 0) + return ccr; mode_cfg = ccr & AT803X_MODE_CFG_MASK; switch (mode_cfg) { @@ -890,25 +881,11 @@ static int at803x_probe(struct phy_device *phydev) ret = at803x_set_wol(phydev, &wol); if (ret < 0) { phydev_err(phydev, "failed to disable WOL on probe: %d\n", ret); - goto err; + return ret; } } return 0; - -err: - if (priv->vddio) - regulator_disable(priv->vddio); - - return ret; -} - -static void at803x_remove(struct phy_device *phydev) -{ - struct at803x_priv *priv = phydev->priv; - - if (priv->vddio) - regulator_disable(priv->vddio); } static int at803x_get_features(struct phy_device *phydev) @@ -2021,7 +1998,6 @@ static struct phy_driver at803x_driver[] = { .name = "Qualcomm Atheros AR8035", .flags = PHY_POLL_CABLE_TEST, .probe = at803x_probe, - .remove = at803x_remove, .config_aneg = at803x_config_aneg, .config_init = at803x_config_init, .soft_reset = genphy_soft_reset, @@ -2043,7 +2019,6 @@ static struct phy_driver at803x_driver[] = { .name = "Qualcomm Atheros AR8030", .phy_id_mask = AT8030_PHY_ID_MASK, .probe = at803x_probe, - .remove = at803x_remove, .config_init = at803x_config_init, .link_change_notify = at803x_link_change_notify, .set_wol = at803x_set_wol, @@ -2059,7 +2034,6 @@ static struct phy_driver at803x_driver[] = { .name = "Qualcomm Atheros AR8031/AR8033", .flags = PHY_POLL_CABLE_TEST, .probe = at803x_probe, - .remove = at803x_remove, .config_init = at803x_config_init, .config_aneg = at803x_config_aneg, .soft_reset = genphy_soft_reset, @@ -2082,7 +2056,6 @@ static struct phy_driver at803x_driver[] = { PHY_ID_MATCH_EXACT(ATH8032_PHY_ID), .name = "Qualcomm Atheros AR8032", .probe = at803x_probe, - .remove = at803x_remove, .flags = PHY_POLL_CABLE_TEST, .config_init = at803x_config_init, .link_change_notify = at803x_link_change_notify, @@ -2100,7 +2073,6 @@ static struct phy_driver at803x_driver[] = { PHY_ID_MATCH_EXACT(ATH9331_PHY_ID), .name = "Qualcomm Atheros AR9331 built-in PHY", .probe = at803x_probe, - .remove = at803x_remove, .suspend = at803x_suspend, .resume = at803x_resume, .flags = PHY_POLL_CABLE_TEST, @@ -2117,7 +2089,6 @@ static struct phy_driver at803x_driver[] = { PHY_ID_MATCH_EXACT(QCA9561_PHY_ID), .name = "Qualcomm Atheros QCA9561 built-in PHY", .probe = at803x_probe, - .remove = at803x_remove, .suspend = at803x_suspend, .resume = at803x_resume, .flags = PHY_POLL_CABLE_TEST, @@ -2183,7 +2154,6 @@ static struct phy_driver at803x_driver[] = { .name = "Qualcomm QCA8081", .flags = PHY_POLL_CABLE_TEST, .probe = at803x_probe, - .remove = at803x_remove, .config_intr = at803x_config_intr, .handle_interrupt = at803x_handle_interrupt, .get_tunable = at803x_get_tunable, diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h new file mode 100644 index 000000000000..b298998bd5a0 --- /dev/null +++ b/include/linux/net_mm.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifdef CONFIG_MMU + +#ifdef CONFIG_INET +extern const struct vm_operations_struct tcp_vm_ops; +static inline bool vma_is_tcp(const struct vm_area_struct *vma) +{ + return vma->vm_ops == &tcp_vm_ops; +} +#else +static inline bool vma_is_tcp(const struct vm_area_struct *vma) +{ + return false; +} +#endif /* CONFIG_INET*/ + +#endif /* CONFIG_MMU */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a4efb7a2796c..f00374718159 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -65,6 +65,7 @@ struct netns_ipv4 { #endif bool fib_has_custom_local_routes; bool fib_offload_disabled; + u8 sysctl_tcp_shrink_window; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_t fib_num_tclassid_users; #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 12eadecf8cd0..e92f73bb3198 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1190,20 +1190,6 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } -/* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how - long it will take to send a packet given its size. - */ -static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen) -{ - int slot = pktlen + rtab->rate.cell_align + rtab->rate.overhead; - if (slot < 0) - slot = 0; - slot >>= rtab->rate.cell_log; - if (slot > 255) - return rtab->data[255]*(slot >> 8) + rtab->data[slot & 0xFF]; - return rtab->data[slot]; -} - struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; diff --git a/include/net/tcp.h b/include/net/tcp.h index 9c08eab647a2..31b534370787 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -45,6 +45,7 @@ #include <linux/memcontrol.h> #include <linux/bpf-cgroup.h> #include <linux/siphash.h> +#include <linux/net_mm.h> extern struct inet_hashinfo tcp_hashinfo; diff --git a/mm/memory.c b/mm/memory.c index f69fbc251198..3e46b4d881dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,7 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/net_mm.h> #include <trace/events/kmem.h> @@ -5280,12 +5281,12 @@ retry: if (!vma) goto inval; - /* Only anonymous vmas are supported for now */ - if (!vma_is_anonymous(vma)) + /* Only anonymous and tcp vmas are supported for now */ + if (!vma_is_anonymous(vma) && !vma_is_tcp(vma)) goto inval; /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ - if (!vma->anon_vma) + if (!vma->anon_vma && !vma_is_tcp(vma)) goto inval; if (!vma_start_read(vma)) diff --git a/net/core/gro.c b/net/core/gro.c index dca800068e41..0759277dc14e 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -304,6 +304,24 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); +static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, + const struct sk_buff *p, + unsigned long diffs) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; + + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + return diffs; +} + static void gro_list_prepare(const struct list_head *head, const struct sk_buff *skb) { @@ -338,23 +356,11 @@ static void gro_list_prepare(const struct list_head *head, * avoid trying too hard to skip each of them individually */ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *skb_ext; - struct tc_skb_ext *p_ext; -#endif - diffs |= p->sk != skb->sk; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext = skb_ext_find(skb, TC_SKB_EXT); - p_ext = skb_ext_find(p, TC_SKB_EXT); - - diffs |= (!!p_ext) ^ (!!skb_ext); - if (!diffs && unlikely(skb_ext)) - diffs |= p_ext->chain ^ skb_ext->chain; -#endif + diffs |= gro_list_prepare_tc_ext(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 649a9701eb6a..1f00f874471f 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6737,7 +6737,10 @@ void devlink_notify_unregister(struct devlink *devlink) static void devlink_port_type_warn(struct work_struct *work) { - WARN(true, "Type was not set for devlink port."); + struct devlink_port *port = container_of(to_delayed_work(work), + struct devlink_port, + type_warn_dw); + dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 356afe54951c..2afb0870648b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1480,6 +1480,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_syn_linear_timeouts_max, }, + { + .procname = "tcp_shrink_window", + .data = &init_net.ipv4.sysctl_tcp_shrink_window, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0e21ea92dc1d..71b42eef9dbf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1774,7 +1774,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb, } #ifdef CONFIG_MMU -static const struct vm_operations_struct tcp_vm_ops = { +const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, @@ -2073,6 +2073,34 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, } } +static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, + unsigned long address, + bool *mmap_locked) +{ + struct vm_area_struct *vma = NULL; + +#ifdef CONFIG_PER_VMA_LOCK + vma = lock_vma_under_rcu(mm, address); +#endif + if (vma) { + if (!vma_is_tcp(vma)) { + vma_end_read(vma); + return NULL; + } + *mmap_locked = false; + return vma; + } + + mmap_read_lock(mm); + vma = vma_lookup(mm, address); + if (!vma || !vma_is_tcp(vma)) { + mmap_read_unlock(mm); + return NULL; + } + *mmap_locked = true; + return vma; +} + #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, @@ -2090,6 +2118,7 @@ static int tcp_zerocopy_receive(struct sock *sk, u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); + bool mmap_locked; int ret; zc->copybuf_len = 0; @@ -2114,13 +2143,10 @@ static int tcp_zerocopy_receive(struct sock *sk, return 0; } - mmap_read_lock(current->mm); - - vma = vma_lookup(current->mm, address); - if (!vma || vma->vm_ops != &tcp_vm_ops) { - mmap_read_unlock(current->mm); + vma = find_tcp_vma(current->mm, address, &mmap_locked); + if (!vma) return -EINVAL; - } + vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); @@ -2194,7 +2220,10 @@ static int tcp_zerocopy_receive(struct sock *sk, zc, total_bytes_to_map); } out: - mmap_read_unlock(current->mm); + if (mmap_locked) + mmap_read_unlock(current->mm); + else + vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84a5d557dc1a..9213804b034f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3281,6 +3281,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_congestion_control = &tcp_reno; net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; + net->ipv4.sysctl_tcp_shrink_window = 0; + return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 660eac4bf2a7..2cb39b6dad02 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk) u32 old_win = tp->rcv_wnd; u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); + struct net *net = sock_net(sk); - /* Never shrink the offered window */ if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else @@ -270,11 +270,14 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ - if (new_win == 0) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPWANTZEROWINDOWADV); - new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { + /* Never shrink the offered window */ + if (new_win == 0) + NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } } + tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; @@ -282,7 +285,7 @@ static u16 tcp_select_window(struct sock *sk) * scaled window. */ if (!tp->rx_opt.rcv_wscale && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) + READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -294,10 +297,9 @@ static u16 tcp_select_window(struct sock *sk) if (new_win == 0) { tp->pred_flags = 0; if (old_win) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTOZEROWINDOWADV); + NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; @@ -2987,6 +2989,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct @@ -3008,6 +3011,15 @@ u32 __tcp_select_window(struct sock *sk) if (mss <= 0) return 0; } + + /* Only allow window shrink if the sysctl is enabled and we have + * a non-zero scaling factor in effect. + */ + if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) + goto shrink_window_allowed; + + /* do not allow window to shrink */ + if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; @@ -3062,6 +3074,36 @@ u32 __tcp_select_window(struct sock *sk) } return window; + +shrink_window_allowed: + /* new window should always be an exact multiple of scaling factor */ + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + if (tcp_under_memory_pressure(sk)) + tcp_adjust_rcv_ssthresh(sk); + + /* if free space is too low, return a zero window */ + if (free_space < (allowed_space >> 4) || free_space < mss || + free_space < (1 << tp->rx_opt.rcv_wscale)) + return 0; + } + + if (free_space > tp->rcv_ssthresh) { + free_space = tp->rcv_ssthresh; + /* new window should always be an exact multiple of scaling factor + * + * For this case, we ALIGN "up" (increase free_space) because + * we know free_space is not zero here, it has been reduced from + * the memory-based limit, and rcv_ssthresh is not a hard limit + * (unlike sk_rcvbuf). + */ + free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); + } + + return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e510a4162ef8..64e873f5895f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3360,6 +3360,7 @@ static int ip6_route_check_nh_onlink(struct net *net, static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, + netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; @@ -3404,7 +3405,7 @@ static int ip6_route_check_nh(struct net *net, err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; - dev_hold(dev); + netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } @@ -3412,7 +3413,9 @@ static int ip6_route_check_nh(struct net *net, } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, - struct net_device **_dev, struct inet6_dev **idev, + struct net_device **_dev, + netdevice_tracker *dev_tracker, + struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; @@ -3453,7 +3456,8 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else - err = ip6_route_check_nh(net, cfg, _dev, idev); + err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, + idev); rcu_read_unlock(); @@ -3571,7 +3575,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, } if (cfg->fc_flags & RTF_GATEWAY) { - err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + err = ip6_validate_gw(net, cfg, &dev, dev_tracker, + &idev, extack); if (err) goto out; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d75d775e9462..d0537c1c8cd7 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -661,6 +661,7 @@ retry: kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, true); unreserve_psock(kcm); + psock = NULL; txm->started_tx = false; kcm_report_tx_retry(kcm); @@ -696,7 +697,8 @@ out: if (!head) { /* Done with all queued messages. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); - unreserve_psock(kcm); + if (psock) + unreserve_psock(kcm); } /* Check if write space is available */ diff --git a/net/mctp/route.c b/net/mctp/route.c index f51a05ec7162..ab62fe447038 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -1249,9 +1249,6 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, mtu = nla_get_u32(tbx[RTAX_MTU]); } - if (rtm->rtm_type != RTN_UNICAST) - return -EINVAL; - rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, rtm->rtm_type); return rc; |