about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml | 11
-rw-r--r-- Documentation/netlink/specs/ovs_datapath.yaml | 30
-rw-r--r-- Documentation/netlink/specs/ovs_flow.yaml | 68
-rw-r--r-- Documentation/netlink/specs/ovs_vport.yaml | 13
-rw-r--r-- Documentation/networking/ip-sysctl.rst | 15
-rw-r--r-- MAINTAINERS | 1
-rw-r--r-- crypto/af_alg.c | 2
-rw-r--r-- drivers/net/phy/at803x.c | 44
-rw-r--r-- include/linux/net_mm.h | 17
-rw-r--r-- include/net/netns/ipv4.h | 1
-rw-r--r-- include/net/sch_generic.h | 14
-rw-r--r-- include/net/tcp.h | 1
-rw-r--r-- mm/memory.c | 7
-rw-r--r-- net/core/gro.c | 32
-rw-r--r-- net/devlink/leftover.c | 5
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c | 9
-rw-r--r-- net/ipv4/tcp.c | 45
-rw-r--r-- net/ipv4/tcp_ipv4.c | 2
-rw-r--r-- net/ipv4/tcp_output.c | 60
-rw-r--r-- net/ipv6/route.c | 13
-rw-r--r-- net/kcm/kcmsock.c | 4
-rw-r--r-- net/mctp/route.c | 3
22 files changed, 274 insertions, 123 deletions
diff --git a/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml b/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml
index d71fa9de2b64..8a3713abd1ca 100644
--- a/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml
+++ b/Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml
@@ -17,11 +17,12 @@ properties:
maxlinear,use-broken-interrupts:
description: |
Interrupts are broken on some GPY2xx PHYs in that they keep the
- interrupt line asserted even after the interrupt status register is
- cleared. Thus it is blocking the interrupt line which is usually bad
- for shared lines. By default interrupts are disabled for this PHY and
- polling mode is used. If one can live with the consequences, this
- property can be used to enable interrupt handling.
+ interrupt line asserted for a random amount of time even after the
+ interrupt status register is cleared. Thus it is blocking the
+ interrupt line which is usually bad for shared lines. By default,
+ interrupts are disabled for this PHY and polling mode is used. If one
+ can live with the consequences, this property can be used to enable
+ interrupt handling.
Affected PHYs (as far as known) are GPY215B and GPY215C.
type: boolean
diff --git a/Documentation/netlink/specs/ovs_datapath.yaml b/Documentation/netlink/specs/ovs_datapath.yaml
index 6d71db8c4416..f709c26c3e92 100644
--- a/Documentation/netlink/specs/ovs_datapath.yaml
+++ b/Documentation/netlink/specs/ovs_datapath.yaml
@@ -3,6 +3,7 @@
name: ovs_datapath
version: 2
protocol: genetlink-legacy
+uapi-header: linux/openvswitch.h
doc:
OVS datapath configuration over generic netlink.
@@ -18,6 +19,7 @@ definitions:
-
name: user-features
type: flags
+ name-prefix: ovs-dp-f-
entries:
-
name: unaligned
@@ -33,35 +35,37 @@ definitions:
doc: Allow per-cpu dispatch of upcalls
-
name: datapath-stats
+ enum-name: ovs-dp-stats
type: struct
members:
-
- name: hit
+ name: n-hit
type: u64
-
- name: missed
+ name: n-missed
type: u64
-
- name: lost
+ name: n-lost
type: u64
-
- name: flows
+ name: n-flows
type: u64
-
name: megaflow-stats
+ enum-name: ovs-dp-megaflow-stats
type: struct
members:
-
- name: mask-hit
+ name: n-mask-hit
type: u64
-
- name: masks
+ name: n-masks
type: u32
-
name: padding
type: u32
-
- name: cache-hits
+ name: n-cache-hit
type: u64
-
name: pad1
@@ -70,6 +74,8 @@ definitions:
attribute-sets:
-
name: datapath
+ name-prefix: ovs-dp-attr-
+ enum-name: ovs-datapath-attrs
attributes:
-
name: name
@@ -101,12 +107,16 @@ attribute-sets:
name: per-cpu-pids
type: binary
sub-type: u32
+ -
+ name: ifindex
+ type: u32
operations:
fixed-header: ovs-header
+ name-prefix: ovs-dp-cmd-
list:
-
- name: dp-get
+ name: get
doc: Get / dump OVS data path configuration and state
value: 3
attribute-set: datapath
@@ -125,7 +135,7 @@ operations:
- per-cpu-pids
dump: *dp-get-op
-
- name: dp-new
+ name: new
doc: Create new OVS data path
value: 1
attribute-set: datapath
@@ -137,7 +147,7 @@ operations:
- upcall-pid
- user-features
-
- name: dp-del
+ name: del
doc: Delete existing OVS data path
value: 2
attribute-set: datapath
diff --git a/Documentation/netlink/specs/ovs_flow.yaml b/Documentation/netlink/specs/ovs_flow.yaml
index 3b0624c87074..1ecbcd117385 100644
--- a/Documentation/netlink/specs/ovs_flow.yaml
+++ b/Documentation/netlink/specs/ovs_flow.yaml
@@ -3,6 +3,7 @@
name: ovs_flow
version: 1
protocol: genetlink-legacy
+uapi-header: linux/openvswitch.h
doc:
OVS flow configuration over generic netlink.
@@ -67,6 +68,7 @@ definitions:
enum: ovs-frag-type
-
name: ovs-frag-type
+ name-prefix: ovs-frag-type-
type: enum
entries:
-
@@ -166,6 +168,7 @@ definitions:
doc: Tag control identifier (TCI) to push.
-
name: ovs-ufid-flags
+ name-prefix: ovs-ufid-f-
type: flags
entries:
- omit-key
@@ -176,7 +179,7 @@ definitions:
type: struct
members:
-
- name: hash-algorithm
+ name: hash-alg
type: u32
doc: Algorithm used to compute hash prior to recirculation.
-
@@ -198,13 +201,13 @@ definitions:
type: struct
members:
-
- name: lse
+ name: mpls-lse
type: u32
byte-order: big-endian
doc: |
MPLS label stack entry to push
-
- name: ethertype
+ name: mpls-ethertype
type: u32
byte-order: big-endian
doc: |
@@ -216,13 +219,13 @@ definitions:
type: struct
members:
-
- name: lse
+ name: mpls-lse
type: u32
byte-order: big-endian
doc: |
MPLS label stack entry to push
-
- name: ethertype
+ name: mpls-ethertype
type: u32
byte-order: big-endian
doc: |
@@ -237,6 +240,7 @@ definitions:
-
name: ct-state-flags
type: flags
+ name-prefix: ovs-cs-f-
entries:
-
name: new
@@ -266,6 +270,8 @@ definitions:
attribute-sets:
-
name: flow-attrs
+ enum-name: ovs-flow-attr
+ name-prefix: ovs-flow-attr-
attributes:
-
name: key
@@ -352,6 +358,8 @@ attribute-sets:
-
name: key-attrs
+ enum-name: ovs-key-attr
+ name-prefix: ovs-key-attr-
attributes:
-
name: encap
@@ -481,6 +489,8 @@ attribute-sets:
doc: struct ovs_key_ipv6_exthdr
-
name: action-attrs
+ enum-name: ovs-action-attr
+ name-prefix: ovs-action-attr-
attributes:
-
name: output
@@ -608,6 +618,8 @@ attribute-sets:
nested-attributes: dec-ttl-attrs
-
name: tunnel-key-attrs
+ enum-name: ovs-tunnel-key-attr
+ name-prefix: ovs-tunnel-key-attr-
attributes:
-
name: id
@@ -676,6 +688,8 @@ attribute-sets:
type: flag
-
name: check-pkt-len-attrs
+ enum-name: ovs-check-pkt-len-attr
+ name-prefix: ovs-check-pkt-len-attr-
attributes:
-
name: pkt-len
@@ -690,6 +704,8 @@ attribute-sets:
nested-attributes: action-attrs
-
name: sample-attrs
+ enum-name: ovs-sample-attr
+ name-prefix: ovs-sample-attr-
attributes:
-
name: probability
@@ -700,6 +716,8 @@ attribute-sets:
nested-attributes: action-attrs
-
name: userspace-attrs
+ enum-name: ovs-userspace-attr
+ name-prefix: ovs-userspace-attr-
attributes:
-
name: pid
@@ -715,6 +733,8 @@ attribute-sets:
type: flag
-
name: ovs-nsh-key-attrs
+ enum-name: ovs-nsh-key-attr
+ name-prefix: ovs-nsh-key-attr-
attributes:
-
name: base
@@ -727,6 +747,8 @@ attribute-sets:
type: binary
-
name: ct-attrs
+ enum-name: ovs-ct-attr
+ name-prefix: ovs-ct-attr-
attributes:
-
name: commit
@@ -758,13 +780,15 @@ attribute-sets:
type: string
-
name: nat-attrs
+ enum-name: ovs-nat-attr
+ name-prefix: ovs-nat-attr-
attributes:
-
name: src
- type: binary
+ type: flag
-
name: dst
- type: binary
+ type: flag
-
name: ip-min
type: binary
@@ -773,21 +797,23 @@ attribute-sets:
type: binary
-
name: proto-min
- type: binary
+ type: u16
-
name: proto-max
- type: binary
+ type: u16
-
name: persistent
- type: binary
+ type: flag
-
name: proto-hash
- type: binary
+ type: flag
-
name: proto-random
- type: binary
+ type: flag
-
name: dec-ttl-attrs
+ enum-name: ovs-dec-ttl-attr
+ name-prefix: ovs-dec-ttl-attr-
attributes:
-
name: action
@@ -795,16 +821,19 @@ attribute-sets:
nested-attributes: action-attrs
-
name: vxlan-ext-attrs
+ enum-name: ovs-vxlan-ext-
+ name-prefix: ovs-vxlan-ext-
attributes:
-
name: gbp
type: u32
operations:
+ name-prefix: ovs-flow-cmd-
fixed-header: ovs-header
list:
-
- name: flow-get
+ name: get
doc: Get / dump OVS flow configuration and state
value: 3
attribute-set: flow-attrs
@@ -824,6 +853,19 @@ operations:
- stats
- actions
dump: *flow-get-op
+ -
+ name: new
+ doc: Create OVS flow configuration in a data path
+ value: 1
+ attribute-set: flow-attrs
+ do:
+ request:
+ attributes:
+ - dp-ifindex
+ - key
+ - ufid
+ - mask
+ - actions
mcast-groups:
list:
diff --git a/Documentation/netlink/specs/ovs_vport.yaml b/Documentation/netlink/specs/ovs_vport.yaml
index 8e55622ddf11..17336455bec1 100644
--- a/Documentation/netlink/specs/ovs_vport.yaml
+++ b/Documentation/netlink/specs/ovs_vport.yaml
@@ -3,6 +3,7 @@
name: ovs_vport
version: 2
protocol: genetlink-legacy
+uapi-header: linux/openvswitch.h
doc:
OVS vport configuration over generic netlink.
@@ -18,10 +19,13 @@ definitions:
-
name: vport-type
type: enum
+ enum-name: ovs-vport-type
+ name-prefix: ovs-vport-type-
entries: [ unspec, netdev, internal, gre, vxlan, geneve ]
-
name: vport-stats
type: struct
+ enum-name: ovs-vport-stats
members:
-
name: rx-packets
@@ -51,6 +55,8 @@ definitions:
attribute-sets:
-
name: vport-options
+ enum-name: ovs-vport-options
+ name-prefix: ovs-tunnel-attr-
attributes:
-
name: dst-port
@@ -60,6 +66,8 @@ attribute-sets:
type: u32
-
name: upcall-stats
+ enum-name: ovs-vport-upcall-attr
+ name-prefix: ovs-vport-upcall-attr-
attributes:
-
name: success
@@ -70,6 +78,8 @@ attribute-sets:
type: u64
-
name: vport
+ name-prefix: ovs-vport-attr-
+ enum-name: ovs-vport-attr
attributes:
-
name: port-no
@@ -108,9 +118,10 @@ attribute-sets:
nested-attributes: upcall-stats
operations:
+ name-prefix: ovs-vport-cmd-
list:
-
- name: vport-get
+ name: get
doc: Get / dump OVS vport configuration and state
value: 3
attribute-set: vport
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 366e2a5097d9..4a010a7cde7f 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -981,6 +981,21 @@ tcp_tw_reuse - INTEGER
tcp_window_scaling - BOOLEAN
Enable window scaling as defined in RFC1323.
+tcp_shrink_window - BOOLEAN
+ This changes how the TCP receive window is calculated.
+
+ RFC 7323, section 2.4, says there are instances when a retracted
+ window can be offered, and that TCP implementations MUST ensure
+ that they handle a shrinking window, as specified in RFC 1122.
+
+ - 0 - Disabled. The window is never shrunk.
+ - 1 - Enabled. The window is shrunk when necessary to remain within
+ the memory limit set by autotuning (sk_rcvbuf).
+ This only occurs if a non-zero receive window
+ scaling factor is also in effect.
+
+ Default: 0
+
tcp_wmem - vector of 3 INTEGERs: min, default, max
min: Amount of memory reserved for send buffers for TCP sockets.
Each TCP socket has rights to use it due to fact of its birth.
diff --git a/MAINTAINERS b/MAINTAINERS
index 7322963b0670..cb14589d14ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14743,6 +14743,7 @@ NETWORKING [TCP]
M: Eric Dumazet <[email protected]>
S: Maintained
+F: include/linux/net_mm.h
F: include/linux/tcp.h
F: include/net/tcp.h
F: include/trace/events/tcp.h
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 7d4b6016b83d..cdb1dcc5dd1a 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -1043,7 +1043,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
};
plen = extract_iter_to_sg(&msg->msg_iter, len, &sgtable,
- MAX_SGL_ENTS, 0);
+ MAX_SGL_ENTS - sgl->cur, 0);
if (plen < 0) {
err = plen;
goto unlock;
diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 656136628ffd..c1f307d90518 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -304,7 +304,6 @@ struct at803x_priv {
bool is_1000basex;
struct regulator_dev *vddio_rdev;
struct regulator_dev *vddh_rdev;
- struct regulator *vddio;
u64 stats[ARRAY_SIZE(at803x_hw_stats)];
};
@@ -824,11 +823,11 @@ static int at803x_parse_dt(struct phy_device *phydev)
if (ret < 0)
return ret;
- priv->vddio = devm_regulator_get_optional(&phydev->mdio.dev,
- "vddio");
- if (IS_ERR(priv->vddio)) {
+ ret = devm_regulator_get_enable_optional(&phydev->mdio.dev,
+ "vddio");
+ if (ret) {
phydev_err(phydev, "failed to get VDDIO regulator\n");
- return PTR_ERR(priv->vddio);
+ return ret;
}
/* Only AR8031/8033 support 1000Base-X for SFP modules */
@@ -856,12 +855,6 @@ static int at803x_probe(struct phy_device *phydev)
if (ret)
return ret;
- if (priv->vddio) {
- ret = regulator_enable(priv->vddio);
- if (ret < 0)
- return ret;
- }
-
if (phydev->drv->phy_id == ATH8031_PHY_ID) {
int ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG);
int mode_cfg;
@@ -869,10 +862,8 @@ static int at803x_probe(struct phy_device *phydev)
.wolopts = 0,
};
- if (ccr < 0) {
- ret = ccr;
- goto err;
- }
+ if (ccr < 0)
+ return ccr;
mode_cfg = ccr & AT803X_MODE_CFG_MASK;
switch (mode_cfg) {
@@ -890,25 +881,11 @@ static int at803x_probe(struct phy_device *phydev)
ret = at803x_set_wol(phydev, &wol);
if (ret < 0) {
phydev_err(phydev, "failed to disable WOL on probe: %d\n", ret);
- goto err;
+ return ret;
}
}
return 0;
-
-err:
- if (priv->vddio)
- regulator_disable(priv->vddio);
-
- return ret;
-}
-
-static void at803x_remove(struct phy_device *phydev)
-{
- struct at803x_priv *priv = phydev->priv;
-
- if (priv->vddio)
- regulator_disable(priv->vddio);
}
static int at803x_get_features(struct phy_device *phydev)
@@ -2021,7 +1998,6 @@ static struct phy_driver at803x_driver[] = {
.name = "Qualcomm Atheros AR8035",
.flags = PHY_POLL_CABLE_TEST,
.probe = at803x_probe,
- .remove = at803x_remove,
.config_aneg = at803x_config_aneg,
.config_init = at803x_config_init,
.soft_reset = genphy_soft_reset,
@@ -2043,7 +2019,6 @@ static struct phy_driver at803x_driver[] = {
.name = "Qualcomm Atheros AR8030",
.phy_id_mask = AT8030_PHY_ID_MASK,
.probe = at803x_probe,
- .remove = at803x_remove,
.config_init = at803x_config_init,
.link_change_notify = at803x_link_change_notify,
.set_wol = at803x_set_wol,
@@ -2059,7 +2034,6 @@ static struct phy_driver at803x_driver[] = {
.name = "Qualcomm Atheros AR8031/AR8033",
.flags = PHY_POLL_CABLE_TEST,
.probe = at803x_probe,
- .remove = at803x_remove,
.config_init = at803x_config_init,
.config_aneg = at803x_config_aneg,
.soft_reset = genphy_soft_reset,
@@ -2082,7 +2056,6 @@ static struct phy_driver at803x_driver[] = {
PHY_ID_MATCH_EXACT(ATH8032_PHY_ID),
.name = "Qualcomm Atheros AR8032",
.probe = at803x_probe,
- .remove = at803x_remove,
.flags = PHY_POLL_CABLE_TEST,
.config_init = at803x_config_init,
.link_change_notify = at803x_link_change_notify,
@@ -2100,7 +2073,6 @@ static struct phy_driver at803x_driver[] = {
PHY_ID_MATCH_EXACT(ATH9331_PHY_ID),
.name = "Qualcomm Atheros AR9331 built-in PHY",
.probe = at803x_probe,
- .remove = at803x_remove,
.suspend = at803x_suspend,
.resume = at803x_resume,
.flags = PHY_POLL_CABLE_TEST,
@@ -2117,7 +2089,6 @@ static struct phy_driver at803x_driver[] = {
PHY_ID_MATCH_EXACT(QCA9561_PHY_ID),
.name = "Qualcomm Atheros QCA9561 built-in PHY",
.probe = at803x_probe,
- .remove = at803x_remove,
.suspend = at803x_suspend,
.resume = at803x_resume,
.flags = PHY_POLL_CABLE_TEST,
@@ -2183,7 +2154,6 @@ static struct phy_driver at803x_driver[] = {
.name = "Qualcomm QCA8081",
.flags = PHY_POLL_CABLE_TEST,
.probe = at803x_probe,
- .remove = at803x_remove,
.config_intr = at803x_config_intr,
.handle_interrupt = at803x_handle_interrupt,
.get_tunable = at803x_get_tunable,
diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h
new file mode 100644
index 000000000000..b298998bd5a0
--- /dev/null
+++ b/include/linux/net_mm.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifdef CONFIG_MMU
+
+#ifdef CONFIG_INET
+extern const struct vm_operations_struct tcp_vm_ops;
+static inline bool vma_is_tcp(const struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &tcp_vm_ops;
+}
+#else
+static inline bool vma_is_tcp(const struct vm_area_struct *vma)
+{
+ return false;
+}
+#endif /* CONFIG_INET */
+
+#endif /* CONFIG_MMU */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index a4efb7a2796c..f00374718159 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -65,6 +65,7 @@ struct netns_ipv4 {
#endif
bool fib_has_custom_local_routes;
bool fib_offload_disabled;
+ u8 sysctl_tcp_shrink_window;
#ifdef CONFIG_IP_ROUTE_CLASSID
atomic_t fib_num_tclassid_users;
#endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 12eadecf8cd0..e92f73bb3198 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1190,20 +1190,6 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
return NET_XMIT_DROP;
}
-/* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
- long it will take to send a packet given its size.
- */
-static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen)
-{
- int slot = pktlen + rtab->rate.cell_align + rtab->rate.overhead;
- if (slot < 0)
- slot = 0;
- slot >>= rtab->rate.cell_log;
- if (slot > 255)
- return rtab->data[255]*(slot >> 8) + rtab->data[slot & 0xFF];
- return rtab->data[slot];
-}
-
struct psched_ratecfg {
u64 rate_bytes_ps; /* bytes per second */
u32 mult;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9c08eab647a2..31b534370787 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,6 +45,7 @@
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>
+#include <linux/net_mm.h>
extern struct inet_hashinfo tcp_hashinfo;
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..3e46b4d881dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,6 +77,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -5280,12 +5281,12 @@ retry:
if (!vma)
goto inval;
- /* Only anonymous vmas are supported for now */
- if (!vma_is_anonymous(vma))
+ /* Only anonymous and TCP vmas are supported for now */
+ if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
goto inval;
/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
- if (!vma->anon_vma)
+ if (!vma->anon_vma && !vma_is_tcp(vma))
goto inval;
if (!vma_start_read(vma))
diff --git a/net/core/gro.c b/net/core/gro.c
index dca800068e41..0759277dc14e 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -304,6 +304,24 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
}
EXPORT_SYMBOL(napi_gro_flush);
+static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
+ const struct sk_buff *p,
+ unsigned long diffs)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ return diffs;
+}
+
static void gro_list_prepare(const struct list_head *head,
const struct sk_buff *skb)
{
@@ -338,23 +356,11 @@ static void gro_list_prepare(const struct list_head *head,
* avoid trying too hard to skip each of them individually
*/
if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- struct tc_skb_ext *skb_ext;
- struct tc_skb_ext *p_ext;
-#endif
-
diffs |= p->sk != skb->sk;
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- skb_ext = skb_ext_find(skb, TC_SKB_EXT);
- p_ext = skb_ext_find(p, TC_SKB_EXT);
-
- diffs |= (!!p_ext) ^ (!!skb_ext);
- if (!diffs && unlikely(skb_ext))
- diffs |= p_ext->chain ^ skb_ext->chain;
-#endif
+ diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
}
NAPI_GRO_CB(p)->same_flow = !diffs;
diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c
index 649a9701eb6a..1f00f874471f 100644
--- a/net/devlink/leftover.c
+++ b/net/devlink/leftover.c
@@ -6737,7 +6737,10 @@ void devlink_notify_unregister(struct devlink *devlink)
static void devlink_port_type_warn(struct work_struct *work)
{
- WARN(true, "Type was not set for devlink port.");
+ struct devlink_port *port = container_of(to_delayed_work(work),
+ struct devlink_port,
+ type_warn_dw);
+ dev_warn(port->devlink->dev, "Type was not set for devlink port.");
}
static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 356afe54951c..2afb0870648b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1480,6 +1480,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &tcp_syn_linear_timeouts_max,
},
+ {
+ .procname = "tcp_shrink_window",
+ .data = &init_net.ipv4.sysctl_tcp_shrink_window,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e21ea92dc1d..71b42eef9dbf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1774,7 +1774,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb,
}
#ifdef CONFIG_MMU
-static const struct vm_operations_struct tcp_vm_ops = {
+const struct vm_operations_struct tcp_vm_ops = {
};
int tcp_mmap(struct file *file, struct socket *sock,
@@ -2073,6 +2073,34 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
}
}
+static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
+ unsigned long address,
+ bool *mmap_locked)
+{
+ struct vm_area_struct *vma = NULL;
+
+#ifdef CONFIG_PER_VMA_LOCK
+ vma = lock_vma_under_rcu(mm, address);
+#endif
+ if (vma) {
+ if (!vma_is_tcp(vma)) {
+ vma_end_read(vma);
+ return NULL;
+ }
+ *mmap_locked = false;
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, address);
+ if (!vma || !vma_is_tcp(vma)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+ *mmap_locked = true;
+ return vma;
+}
+
#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc,
@@ -2090,6 +2118,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
u32 seq = tp->copied_seq;
u32 total_bytes_to_map;
int inq = tcp_inq(sk);
+ bool mmap_locked;
int ret;
zc->copybuf_len = 0;
@@ -2114,13 +2143,10 @@ static int tcp_zerocopy_receive(struct sock *sk,
return 0;
}
- mmap_read_lock(current->mm);
-
- vma = vma_lookup(current->mm, address);
- if (!vma || vma->vm_ops != &tcp_vm_ops) {
- mmap_read_unlock(current->mm);
+ vma = find_tcp_vma(current->mm, address, &mmap_locked);
+ if (!vma)
return -EINVAL;
- }
+
vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
avail_len = min_t(u32, vma_len, inq);
total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
@@ -2194,7 +2220,10 @@ static int tcp_zerocopy_receive(struct sock *sk,
zc, total_bytes_to_map);
}
out:
- mmap_read_unlock(current->mm);
+ if (mmap_locked)
+ mmap_read_unlock(current->mm);
+ else
+ vma_end_read(vma);
/* Try to copy straggler data. */
if (!ret)
copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 84a5d557dc1a..9213804b034f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3281,6 +3281,8 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.tcp_congestion_control = &tcp_reno;
net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
+ net->ipv4.sysctl_tcp_shrink_window = 0;
+
return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 660eac4bf2a7..2cb39b6dad02 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk)
u32 old_win = tp->rcv_wnd;
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
+ struct net *net = sock_net(sk);
- /* Never shrink the offered window */
if (new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
@@ -270,11 +270,14 @@ static u16 tcp_select_window(struct sock *sk)
*
* Relax Will Robinson.
*/
- if (new_win == 0)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPWANTZEROWINDOWADV);
- new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
+ /* Never shrink the offered window */
+ if (new_win == 0)
+ NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ }
}
+
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
@@ -282,7 +285,7 @@ static u16 tcp_select_window(struct sock *sk)
* scaled window.
*/
if (!tp->rx_opt.rcv_wscale &&
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
+ READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -294,10 +297,9 @@ static u16 tcp_select_window(struct sock *sk)
if (new_win == 0) {
tp->pred_flags = 0;
if (old_win)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPTOZEROWINDOWADV);
+ NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
} else if (old_win == 0) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+ NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
}
return new_win;
@@ -2987,6 +2989,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
/* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
@@ -3008,6 +3011,15 @@ u32 __tcp_select_window(struct sock *sk)
if (mss <= 0)
return 0;
}
+
+ /* Only allow window shrink if the sysctl is enabled and we have
+ * a non-zero scaling factor in effect.
+ */
+ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
+ goto shrink_window_allowed;
+
+ /* do not allow window to shrink */
+
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
@@ -3062,6 +3074,36 @@ u32 __tcp_select_window(struct sock *sk)
}
return window;
+
+shrink_window_allowed:
+ /* new window should always be an exact multiple of scaling factor */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ if (free_space < (full_space >> 1)) {
+ icsk->icsk_ack.quick = 0;
+
+ if (tcp_under_memory_pressure(sk))
+ tcp_adjust_rcv_ssthresh(sk);
+
+ /* if free space is too low, return a zero window */
+ if (free_space < (allowed_space >> 4) || free_space < mss ||
+ free_space < (1 << tp->rx_opt.rcv_wscale))
+ return 0;
+ }
+
+ if (free_space > tp->rcv_ssthresh) {
+ free_space = tp->rcv_ssthresh;
+ /* new window should always be an exact multiple of scaling factor
+ *
+ * For this case, we ALIGN "up" (increase free_space) because
+ * we know free_space is not zero here, it has been reduced from
+ * the memory-based limit, and rcv_ssthresh is not a hard limit
+ * (unlike sk_rcvbuf).
+ */
+ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
+ }
+
+ return free_space;
}
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e510a4162ef8..64e873f5895f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3360,6 +3360,7 @@ static int ip6_route_check_nh_onlink(struct net *net,
static int ip6_route_check_nh(struct net *net,
struct fib6_config *cfg,
struct net_device **_dev,
+ netdevice_tracker *dev_tracker,
struct inet6_dev **idev)
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
@@ -3404,7 +3405,7 @@ static int ip6_route_check_nh(struct net *net,
err = -EHOSTUNREACH;
} else {
*_dev = dev = res.nh->fib_nh_dev;
- dev_hold(dev);
+ netdev_hold(dev, dev_tracker, GFP_ATOMIC);
*idev = in6_dev_get(dev);
}
@@ -3412,7 +3413,9 @@ static int ip6_route_check_nh(struct net *net,
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
- struct net_device **_dev, struct inet6_dev **idev,
+ struct net_device **_dev,
+ netdevice_tracker *dev_tracker,
+ struct inet6_dev **idev,
struct netlink_ext_ack *extack)
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
@@ -3453,7 +3456,8 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
if (cfg->fc_flags & RTNH_F_ONLINK)
err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
else
- err = ip6_route_check_nh(net, cfg, _dev, idev);
+ err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
+ idev);
rcu_read_unlock();
@@ -3571,7 +3575,8 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
}
if (cfg->fc_flags & RTF_GATEWAY) {
- err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
+ err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
+ &idev, extack);
if (err)
goto out;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d75d775e9462..d0537c1c8cd7 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -661,6 +661,7 @@ retry:
kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
true);
unreserve_psock(kcm);
+ psock = NULL;
txm->started_tx = false;
kcm_report_tx_retry(kcm);
@@ -696,7 +697,8 @@ out:
if (!head) {
/* Done with all queued messages. */
WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
- unreserve_psock(kcm);
+ if (psock)
+ unreserve_psock(kcm);
}
/* Check if write space is available */
diff --git a/net/mctp/route.c b/net/mctp/route.c
index f51a05ec7162..ab62fe447038 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -1249,9 +1249,6 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
mtu = nla_get_u32(tbx[RTAX_MTU]);
}
- if (rtm->rtm_type != RTN_UNICAST)
- return -EINVAL;
-
rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu,
rtm->rtm_type);
return rc;