From bb5ecb0c63ac88b6f39029f75c47f4be4e352e8d Mon Sep 17 00:00:00 2001 From: Rostislav Lisovy <lisovy@gmail.com> Date: Fri, 24 Jan 2014 13:17:37 +0100 Subject: can: Propagate SO_PRIORITY of raw sockets to skbs This allows controlling certain queueing disciplines by setting the socket's SO_PRIORITY option. For example, with the default pfifo_fast queueing discipline, which provides three priorities, socket priority TC_PRIO_CONTROL means higher than default and TC_PRIO_BULK means lower than default. Signed-off-by: Rostislav Lisovy <lisovy@gmail.com> Signed-off-by: Michal Sojka <sojkam1@fel.cvut.cz> Acked-by: Oliver Hartkopp <socketcan@hartkopp.net> Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- net/can/raw.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 07d72d852324..8be757cca2ec 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -715,6 +715,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = dev; skb->sk = sk; + skb->priority = sk->sk_priority; err = can_send(skb, ro->loopback); -- cgit From b5aaab12b2b4bc4acab7384c17a87f3406e5047d Mon Sep 17 00:00:00 2001 From: Or Gerlitz <ogerlitz@mellanox.com> Date: Wed, 29 Jan 2014 18:08:59 +0200 Subject: net/ipv4: Use non-atomic allocation of udp offloads structure instance Since udp_add_offload() can be called from non-sleepable context e.g under this call tree from the vxlan driver use case: vxlan_socket_create() <-- holds the spinlock -> vxlan_notify_add_rx_port() -> udp_add_offload() <-- schedules we should allocate the udp_offloads structure in atomic manner. Fixes: b582ef0 ('net: Add GRO support for UDP encapsulating protocols') Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 25f5cee3a08a..2ffea6f31efc 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -101,7 +101,7 @@ out: int udp_add_offload(struct udp_offload *uo) { struct udp_offload_priv __rcu **head = &udp_offload_base; - struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL); + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); if (!new_offload) return -ENOMEM; -- cgit From 0ae89beb283a0db5980d1d4781c7d7be2f2810d6 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp <socketcan@hartkopp.net> Date: Thu, 30 Jan 2014 10:11:28 +0100 Subject: can: add destructor for self generated skbs Self generated skbuffs in net/can/bcm.c are setting a skb->sk reference but no explicit destructor which is enforced since Linux 3.11 with commit 376c7311bdb6 (net: add a temporary sanity check in skb_orphan()). This patch adds some helper functions to make sure that a destructor is properly defined when a sock reference is assigned to a CAN related skb. To create an unshared skb owned by the original sock a common helper function has been introduced to replace open coded functions to create CAN echo skbs. Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net> Tested-by: Andre Naujoks <nautsch2@gmail.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/can/dev.c | 15 +++------------ drivers/net/can/janz-ican3.c | 18 ++++-------------- drivers/net/can/vcan.c | 9 ++++----- include/linux/can/skb.h | 38 ++++++++++++++++++++++++++++++++++++++ net/can/af_can.c | 3 ++- net/can/bcm.c | 4 ++-- 6 files changed, 53 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 13a909822e25..fc59bc6f040b 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -323,19 +323,10 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, } if (!priv->echo_skb[idx]) { - struct sock *srcsk = skb->sk; - if (atomic_read(&skb->users) != 1) { - struct sk_buff *old_skb = skb; - - skb = skb_clone(old_skb, GFP_ATOMIC); - kfree_skb(old_skb); - if (!skb) - return; - } else - skb_orphan(skb); - - skb->sk = srcsk; + skb = can_create_echo_skb(skb); + if (!skb) + return; /* make settings for echo to reduce code in irq context */ skb->protocol = htons(ETH_P_CAN); diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index e24e6690d672..2124c6790687 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -18,6 +18,7 @@ #include <linux/netdevice.h> #include <linux/can.h> #include <linux/can/dev.h> +#include <linux/can/skb.h> #include <linux/can/error.h> #include <linux/mfd/janz.h> @@ -1133,20 +1134,9 @@ static void ican3_handle_message(struct ican3_dev *mod, struct ican3_msg *msg) */ static void ican3_put_echo_skb(struct ican3_dev *mod, struct sk_buff *skb) { - struct sock *srcsk = skb->sk; - - if (atomic_read(&skb->users) != 1) { - struct sk_buff *old_skb = skb; - - skb = skb_clone(old_skb, GFP_ATOMIC); - kfree_skb(old_skb); - if (!skb) - return; - } else { - skb_orphan(skb); - } - - skb->sk = srcsk; + skb = can_create_echo_skb(skb); + if (!skb) + return; /* save this skb for tx interrupt echo handling */ skb_queue_tail(&mod->echoq, skb); diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 0a2a5ee79a17..4e94057ef5cf 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -46,6 +46,7 @@ #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/dev.h> +#include <linux/can/skb.h> #include <linux/slab.h> #include <net/rtnetlink.h> @@ -109,25 +110,23 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) stats->rx_packets++; stats->rx_bytes += cfd->len; } - kfree_skb(skb); + consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { - struct sock *srcsk = skb->sk; - skb = skb_share_check(skb, GFP_ATOMIC); + skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ - skb->sk = srcsk; vcan_rx(skb, dev); } else { /* no looped packets => no counting */ - kfree_skb(skb); + consume_skb(skb); } return NETDEV_TX_OK; } diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 2f0543f7510c..f9bbbb472663 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -11,7 +11,9 @@ #define CAN_SKB_H #include <linux/types.h> +#include <linux/skbuff.h> #include <linux/can.h> +#include <net/sock.h> /* * The struct can_skb_priv is used to transport additional information along @@ -42,4 +44,40 @@ static inline void can_skb_reserve(struct sk_buff *skb) skb_reserve(skb, sizeof(struct can_skb_priv)); } +static inline void can_skb_destructor(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) +{ + if (sk) { + sock_hold(sk); + skb->destructor = can_skb_destructor; + skb->sk = sk; + } +} + +/* + * returns an unshared skb owned by the original sock to be echo'ed back + */ +static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) +{ + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (likely(nskb)) { + can_skb_set_owner(nskb, skb->sk); + consume_skb(skb); + return nskb; + } else { + kfree_skb(skb); + return NULL; + } + } + + /* we can assume to have an unshared skb with proper owner */ + return skb; +} + #endif /* CAN_SKB_H */ diff --git a/net/can/af_can.c b/net/can/af_can.c index d249874a366d..a27f8aad9e99 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -57,6 +57,7 @@ #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> +#include <linux/can/skb.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -290,7 +291,7 @@ int can_send(struct sk_buff *skb, int loop) return -ENOMEM; } - newskb->sk = skb->sk; + can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 3fc737b214c7..dcb75c0e66c1 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -268,7 +268,7 @@ static void bcm_can_tx(struct bcm_op *op) /* send with loopback */ skb->dev = dev; - skb->sk = op->sk; + can_skb_set_owner(skb, op->sk); can_send(skb, 1); /* update statistics */ @@ -1223,7 +1223,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) can_skb_prv(skb)->ifindex = dev->ifindex; skb->dev = dev; - skb->sk = sk; + can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); -- cgit From b045d37bd68c20ca88123c2b363cac5e3dae815f Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Mon, 3 Feb 2014 12:52:14 -0800 Subject: ip_tunnel: fix panic in ip_tunnel_xmit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting rt variable to NULL at the beginning of ip_tunnel_xmit() missed possible use of this variable as a scratch value. Also fixes a possible dst leak in tunnel_dst_check() : If we had to call tunnel_dst_reset(), we forgot to release the reference on dst. Merges tunnel_dst_get()/tunnel_dst_check() into a single tunnel_rtable_get() function for clarity. Many thanks to Tommi for his report and tests. Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels") Reported-by: Tommi Rantala <tt.rantala@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Tested-by: Tommi Rantala <tt.rantala@gmail.com> Cc: Tom Herbert <therbert@google.com> Cc: Maciej Żenczykowski <maze@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/ip_tunnel.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index bd28f386bd02..50228be5c17b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -101,28 +101,22 @@ static void tunnel_dst_reset_all(struct ip_tunnel *t) __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); } -static struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); - if (dst) + if (dst) { + if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + rcu_read_unlock(); + tunnel_dst_reset(t); + return NULL; + } dst_hold(dst); - rcu_read_unlock(); - return dst; -} - -static struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie) -{ - struct dst_entry *dst = tunnel_dst_get(t); - - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - tunnel_dst_reset(t); - return NULL; } - - return dst; + rcu_read_unlock(); + return (struct rtable *)dst; } /* Often modified stats are per cpu, other are shared (netdev->stats) */ @@ -584,7 +578,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4; u8 tos, ttl; __be16 df; - struct rtable *rt = NULL; /* Route to the other host */ + struct rtable *rt; /* Route to the other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; @@ -657,8 +651,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); - if (connected) - rt = (struct rtable *)tunnel_dst_check(tunnel, 0); + rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); -- cgit From 2a971354e74f3837d14b9c8d7f7983b0c9c330e4 Mon Sep 17 00:00:00 2001 From: Michal Kubecek <mkubecek@suse.cz> Date: Thu, 30 Jan 2014 08:50:20 +0100 Subject: ipvs: fix AF assignment in ip_vs_conn_new() If a fwmark is passed to ip_vs_conn_new(), it is passed in vaddr, not daddr. Therefore we should set AF to AF_UNSPEC in vaddr assignment (like we do in ip_vs_ct_in_get()), otherwise we may copy only first 4 bytes of an IPv6 address into cp->daddr. Signed-off-by: Bogdano Arendartchuk <barendartchuk@suse.com> Signed-off-by: Michal Kubecek <mkubecek@suse.cz> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au> --- net/netfilter/ipvs/ip_vs_conn.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 59a1a85bcb3e..a8eb0a89326a 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -871,11 +871,11 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; - ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr); - cp->vport = p->vport; - /* proto should only be IPPROTO_IP if d_addr is a fwmark */ + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, - &cp->daddr, daddr); + &cp->vaddr, p->vaddr); + cp->vport = p->vport; + ip_vs_addr_set(p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; -- cgit From a664a4f7aa4f01ca8728e3ec43618327416fc8ff Mon Sep 17 00:00:00 2001 From: Shlomo Pongratz <shlomop@mellanox.com> Date: Sun, 2 Feb 2014 15:42:10 +0200 Subject: net/ipv4: Use proper RCU APIs for writer-side in udp_offload.c RCU writer side should use rcu_dereference_protected() and not rcu_dereference(), fix that. This also removes the "suspicious RCU usage" warning seen when running with CONFIG_PROVE_RCU. Also, don't use rcu_assign_pointer/rcu_dereference for pointers which are invisible beyond the udp offload code. Fixes: b582ef0 ('net: Add GRO support for UDP encapsulating protocols') Reported-by: Eric Dumazet <edumazet@google.com> Cc: Eric Dumazet <edumazet@google.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: Shlomo Pongratz <shlomop@mellanox.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/udp_offload.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2ffea6f31efc..88b4023ecfcf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -17,6 +17,8 @@ static DEFINE_SPINLOCK(udp_offload_lock); static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + struct udp_offload_priv { struct udp_offload *offload; struct rcu_head rcu; @@ -100,7 +102,6 @@ out: int udp_add_offload(struct udp_offload *uo) { - struct udp_offload_priv __rcu **head = &udp_offload_base; struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); if (!new_offload) @@ -109,8 +110,8 @@ int udp_add_offload(struct udp_offload *uo) new_offload->offload = uo; spin_lock(&udp_offload_lock); - rcu_assign_pointer(new_offload->next, rcu_dereference(*head)); - rcu_assign_pointer(*head, new_offload); + new_offload->next = udp_offload_base; + rcu_assign_pointer(udp_offload_base, new_offload); spin_unlock(&udp_offload_lock); return 0; @@ -130,12 +131,12 @@ void udp_del_offload(struct udp_offload *uo) spin_lock(&udp_offload_lock); - uo_priv = rcu_dereference(*head); + uo_priv = udp_deref_protected(*head); for (; uo_priv != NULL; - uo_priv = rcu_dereference(*head)) { - + uo_priv = udp_deref_protected(*head)) { if (uo_priv->offload == uo) { - rcu_assign_pointer(*head, rcu_dereference(uo_priv->next)); + rcu_assign_pointer(*head, + udp_deref_protected(uo_priv->next)); goto unlock; } head = &uo_priv->next; -- cgit From 6049f2530cf2cb48a6fe8735309cc0b97aa7f700 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao <fernando_b1@lab.ntt.co.jp> Date: Tue, 4 Feb 2014 19:35:02 +0900 Subject: rtnetlink: fix oops in rtnl_link_get_slave_info_data_size We should check whether rtnetlink link operations are defined before calling get_slave_size(). Without this, the following oops can occur when adding a tap device to OVS. [ 87.839553] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 [ 87.839595] IP: [<ffffffff813d47c0>] if_nlmsg_size+0xf0/0x220 [...] [ 87.840651] Call Trace: [ 87.840664] [<ffffffff813d694b>] ? rtmsg_ifinfo+0x2b/0x100 [ 87.840688] [<ffffffff813c8340>] ? __netdev_adjacent_dev_insert+0x150/0x1a0 [ 87.840718] [<ffffffff813d6a50>] ? rtnetlink_event+0x30/0x40 [ 87.840742] [<ffffffff814b4144>] ? notifier_call_chain+0x44/0x70 [ 87.840768] [<ffffffff813c8946>] ? __netdev_upper_dev_link+0x3c6/0x3f0 [ 87.840798] [<ffffffffa0678d6c>] ? netdev_create+0xcc/0x160 [openvswitch] [ 87.840828] [<ffffffffa06781ea>] ? ovs_vport_add+0x4a/0xd0 [openvswitch] [ 87.840857] [<ffffffffa0670139>] ? new_vport+0x9/0x50 [openvswitch] [ 87.840884] [<ffffffffa067279e>] ? ovs_vport_cmd_new+0x11e/0x210 [openvswitch] [ 87.840915] [<ffffffff813f3efa>] ? genl_family_rcv_msg+0x19a/0x360 [ 87.840941] [<ffffffff813f40c0>] ? genl_family_rcv_msg+0x360/0x360 [ 87.840967] [<ffffffff813f4139>] ? genl_rcv_msg+0x79/0xc0 [ 87.840991] [<ffffffff813b6cf9>] ? __kmalloc_reserve.isra.25+0x29/0x80 [ 87.841018] [<ffffffff813f2389>] ? netlink_rcv_skb+0xa9/0xc0 [ 87.841042] [<ffffffff813f27cf>] ? genl_rcv+0x1f/0x30 [ 87.841064] [<ffffffff813f1988>] ? netlink_unicast+0xe8/0x1e0 [ 87.841088] [<ffffffff813f1d9a>] ? netlink_sendmsg+0x31a/0x750 [ 87.841113] [<ffffffff813aee96>] ? sock_sendmsg+0x86/0xc0 [ 87.841136] [<ffffffff813c960d>] ? __netdev_update_features+0x4d/0x200 [ 87.841163] [<ffffffff813ca94e>] ? ethtool_get_value+0x2e/0x50 [ 87.841188] [<ffffffff813af269>] ? ___sys_sendmsg+0x359/0x370 [ 87.841212] [<ffffffff813da686>] ? dev_ioctl+0x1a6/0x5c0 [ 87.841236] [<ffffffff8109c210>] ? autoremove_wake_function+0x30/0x30 [ 87.841264] [<ffffffff813ac59d>] ? sock_do_ioctl+0x3d/0x50 [ 87.841288] [<ffffffff813aca68>] ? sock_ioctl+0x1e8/0x2c0 [ 87.841312] [<ffffffff811934bf>] ? do_vfs_ioctl+0x2cf/0x4b0 [ 87.841335] [<ffffffff813afeb9>] ? __sys_sendmsg+0x39/0x70 [ 87.841362] [<ffffffff814b86f9>] ? system_call_fastpath+0x16/0x1b [ 87.841386] Code: c0 74 10 48 89 ef ff d0 83 c0 07 83 e0 fc 48 98 49 01 c7 48 89 ef e8 d0 d6 fe ff 48 85 c0 0f 84 df 00 00 00 48 8b 90 08 07 00 00 <48> 8b 8a a8 00 00 00 31 d2 48 85 c9 74 0c 48 89 ee 48 89 c7 ff [ 87.841529] RIP [<ffffffff813d47c0>] if_nlmsg_size+0xf0/0x220 [ 87.841555] RSP <ffff880221aa5950> [ 87.841569] CR2: 00000000000000a8 [ 87.851442] ---[ end trace e42ab217691b4fc2 ]--- Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp> Acked-by: Jiri Pirko <jiri@resnulli.us> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 393b1bc9a618..048dc8d183aa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -374,7 +374,7 @@ static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; - if (!ops->get_slave_size) + if (!ops || !ops->get_slave_size) return 0; /* IFLA_INFO_SLAVE_DATA + nested data */ return nla_total_size(sizeof(struct nlattr)) + -- cgit From aea0bb4f8ee513537ad84b9f3f609f96e272d98e Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Tue, 14 Jan 2014 16:27:49 +0000 Subject: openvswitch: Pad OVS_PACKET_ATTR_PACKET if linear copy was performed While the zerocopy method is correctly omitted if user space does not support unaligned Netlink messages. The attribute is still not padded correctly as skb_zerocopy() will not ensure padding and the attribute size is no longer pre calculated though nla_reserve() which ensured padding previously. This patch applies appropriate padding if a linear data copy was performed in skb_zerocopy(). Signed-off-by: Thomas Graf <tgraf@suug.ch> Acked-by: Zoltan Kiss <zoltan.kiss@citrix.com> Signed-off-by: Jesse Gross <jesse@nicira.com> --- net/openvswitch/datapath.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index df4692826ead..d1a73a6102f8 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -466,6 +466,14 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, skb_zerocopy(user_skb, skb, skb->len, hlen); + /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; + + if (plen > 0) + memset(skb_put(user_skb, plen), 0, plen); + } + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); -- cgit From e80857cce82da31e41a6599fc888dfc92e0167cc Mon Sep 17 00:00:00 2001 From: Andy Zhou <azhou@nicira.com> Date: Tue, 21 Jan 2014 09:31:04 -0800 Subject: openvswitch: Fix kernel panic on ovs_flow_free Both mega flow mask's reference counter and per flow table mask list should only be accessed when holding ovs_mutex() lock. However this is not true with ovs_flow_table_flush(). The patch fixes this bug. Reported-by: Joe Stringer <joestringer@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> --- net/openvswitch/datapath.c | 9 +++-- net/openvswitch/flow_table.c | 84 +++++++++++++++++++++----------------------- net/openvswitch/flow_table.h | 2 +- 3 files changed, 47 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d1a73a6102f8..e1b337e0bf4d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -55,6 +55,7 @@ #include "datapath.h" #include "flow.h" +#include "flow_table.h" #include "flow_netlink.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -160,7 +161,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -1287,7 +1287,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(&dp->table); + ovs_flow_tbl_destroy(&dp->table, false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -1314,10 +1314,13 @@ static void __dp_destroy(struct datapath *dp) list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that - * all port in datapath are destroyed first before freeing datapath. + * all ports in datapath are destroyed first before freeing datapath. */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + /* RCU destroy the flow table */ + ovs_flow_tbl_destroy(&dp->table, true); + call_rcu(&dp->rcu, destroy_dp_rcu); } diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index c58a0fe3c889..bd14052ed342 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -153,29 +153,27 @@ static void rcu_free_flow_callback(struct rcu_head *rcu) flow_free(flow); } -static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) +void ovs_flow_free(struct sw_flow *flow, bool deferred) { - if (!mask) + if (!flow) return; - BUG_ON(!mask->ref_count); - mask->ref_count--; + ASSERT_OVSL(); - if (!mask->ref_count) { - list_del_rcu(&mask->list); - if (deferred) - kfree_rcu(mask, rcu); - else - kfree(mask); - } -} + if (flow->mask) { + struct sw_flow_mask *mask = flow->mask; -void ovs_flow_free(struct sw_flow *flow, bool deferred) -{ - if (!flow) - return; + BUG_ON(!mask->ref_count); + mask->ref_count--; - flow_mask_del_ref(flow->mask, deferred); + if (!mask->ref_count) { + list_del_rcu(&mask->list); + if (deferred) + kfree_rcu(mask, rcu); + else + kfree(mask); + } + } if (deferred) call_rcu(&flow->rcu, rcu_free_flow_callback); @@ -188,26 +186,9 @@ static void free_buckets(struct flex_array *buckets) flex_array_free(buckets); } + static void __table_instance_destroy(struct table_instance *ti) { - int i; - - if (ti->keep_flows) - goto skip_flows; - - for (i = 0; i < ti->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = flex_array_get(ti->buckets, i); - struct hlist_node *n; - int ver = ti->node_ver; - - hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del(&flow->hash_node[ver]); - ovs_flow_free(flow, false); - } - } - -skip_flows: free_buckets(ti->buckets); kfree(ti); } @@ -258,20 +239,38 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) static void table_instance_destroy(struct table_instance *ti, bool deferred) { + int i; + if (!ti) return; + if (ti->keep_flows) + goto skip_flows; + + for (i = 0; i < ti->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(ti->buckets, i); + struct hlist_node *n; + int ver = ti->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_flow_free(flow, deferred); + } + } + +skip_flows: if (deferred) call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); else __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table) +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) { struct table_instance *ti = ovsl_dereference(table->ti); - table_instance_destroy(ti, false); + table_instance_destroy(ti, deferred); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -504,16 +503,11 @@ static struct sw_flow_mask *mask_alloc(void) mask = kmalloc(sizeof(*mask), GFP_KERNEL); if (mask) - mask->ref_count = 0; + mask->ref_count = 1; return mask; } -static void mask_add_ref(struct sw_flow_mask *mask) -{ - mask->ref_count++; -} - static bool mask_equal(const struct sw_flow_mask *a, const struct sw_flow_mask *b) { @@ -554,9 +548,11 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, mask->key = new->key; mask->range = new->range; list_add_rcu(&mask->list, &tbl->mask_list); + } else { + BUG_ON(!mask->ref_count); + mask->ref_count++; } - mask_add_ref(mask); flow->mask = mask; return 0; } diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 1996e34c0fd8..baaeb101924d 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -60,7 +60,7 @@ void ovs_flow_free(struct sw_flow *, bool deferred); int ovs_flow_tbl_init(struct flow_table *); int ovs_flow_tbl_count(struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); int ovs_flow_tbl_flush(struct flow_table *flow_table); int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, -- cgit From 45fb9c35b27c9982e9a55d04ed0a5230a2d0b306 Mon Sep 17 00:00:00 2001 From: Daniele Di Proietto <daniele.di.proietto@gmail.com> Date: Thu, 23 Jan 2014 10:47:35 -0800 Subject: openvswitch: Fix ovs_dp_cmd_msg_size() commit 43d4be9cb55f3bac5253e9289996fd9d735531db (openvswitch: Allow user space to announce ability to accept unaligned Netlink messages) introduced OVS_DP_ATTR_USER_FEATURES netlink attribute in datapath responses, but the attribute size was not taken into account in ovs_dp_cmd_msg_size(). Signed-off-by: Daniele Di Proietto <daniele.di.proietto@gmail.com> Signed-off-by: Jesse Gross <jesse@nicira.com> --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e1b337e0bf4d..58689dda8377 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1087,6 +1087,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ return msgsize; } -- cgit From e4c6d7595403e943a3afc334eb8c0efcd840043a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar <pshelar@nicira.com> Date: Fri, 31 Jan 2014 09:43:23 -0800 Subject: openvswitch: Fix ovs_flow_free() ovs-lock assert. ovs_flow_free() is not called under ovs-lock during packet execute path (ovs_packet_cmd_execute()). Since packet execute does not touch flow->mask, there is no need to take that lock either. So move assert in case where flow->mask is checked. Found by code inspection. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> --- net/openvswitch/flow_table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index bd14052ed342..3c268b3d71c3 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -158,11 +158,13 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred) if (!flow) return; - ASSERT_OVSL(); - if (flow->mask) { struct sw_flow_mask *mask = flow->mask; + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); BUG_ON(!mask->ref_count); mask->ref_count--; -- cgit From c14e0953ca51dbcb8d1ac92acbdcff23d0caa158 Mon Sep 17 00:00:00 2001 From: Andy Zhou <azhou@nicira.com> Date: Sun, 2 Feb 2014 17:08:06 -0800 Subject: openvswitch: Suppress error messages on megaflow updates With subfacets, we'd expect megaflow updates message to carry the original micro flow. If not, EINVAL is returned and kernel logs an error message. Now that the user space subfacet layer is removed, it is expected that flow updates can arrive with a micro flow other than the original. Change the return code to EEXIST and remove the kernel error log message. Reported-by: Ben Pfaff <blp@nicira.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> --- net/openvswitch/datapath.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 58689dda8377..e9a48baf8551 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -860,11 +860,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; /* The unmasked key has to be the same for flow updates. */ - error = -EINVAL; - if (!ovs_flow_cmp_unmasked_key(flow, &match)) { - OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); + if (!ovs_flow_cmp_unmasked_key(flow, &match)) goto err_unlock_ovs; - } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); -- cgit From 2a53bfb3e0fb6aa6b1ac93e5979a040a4b57ea8b Mon Sep 17 00:00:00 2001 From: Arturo Borrero <arturo.borrero.glez@gmail.com> Date: Fri, 17 Jan 2014 02:28:45 +0100 Subject: netfilter: nft_ct: fix unconditional dump of 'dir' attr We want to make sure that the information that we get from the kernel can be reinjected without troubles. The kernel shouldn't return an attribute that is not required, or even prohibited. Dumping unconditionally NFTA_CT_DIRECTION could lead an application in userspace to interpret that the attribute was originally set, while it was not. Signed-off-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nft_ct.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 917052e20602..feaf0f354a93 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -311,8 +311,19 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; - if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) - goto nla_put_failure; + + switch (priv->key) { + case NFT_CT_PROTOCOL: + case NFT_CT_SRC: + case NFT_CT_DST: + case NFT_CT_PROTO_SRC: + case NFT_CT_PROTO_DST: + if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + default: + break; + } + return 0; nla_put_failure: -- cgit From 3dd7279fb6db05ec5a088cd0cae6ba22580a82bd Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Sat, 25 Jan 2014 08:04:07 +0000 Subject: netfilter: nf_tables: fix oops when deleting a chain with references The following commands trigger an oops: # nft -i nft> add table filter nft> add chain filter input { type filter hook input priority 0; } nft> add chain filter test nft> add rule filter input jump test nft> delete chain filter test We need to check the chain use counter before allowing destruction since we might have references from sets or jump rules. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=69341 Reported-by: Matthew Ife <deleriux1@gmail.com> Tested-by: Matthew Ife <deleriux1@gmail.com> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 117bbaaddde6..9ce30534f853 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1045,7 +1045,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(chain)) return PTR_ERR(chain); - if (!list_empty(&chain->rules)) + if (!list_empty(&chain->rules) || chain->use > 0) return -EBUSY; list_del(&chain->list); -- cgit From c6825c0976fa7893692e0e43b09740b419b23c09 Mon Sep 17 00:00:00 2001 From: Andrey Vagin <avagin@openvz.org> Date: Wed, 29 Jan 2014 19:34:14 +0100 Subject: netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get Lets look at destroy_conntrack: hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); ... nf_conntrack_free(ct) kmem_cache_free(net->ct.nf_conntrack_cachep, ct); net->ct.nf_conntrack_cachep is created with SLAB_DESTROY_BY_RCU. The hash is protected by rcu, so readers look up conntracks without locks. A conntrack is removed from the hash, but in this moment a few readers still can use the conntrack. Then this conntrack is released and another thread creates conntrack with the same address and the equal tuple. After this a reader starts to validate the conntrack: * It's not dying, because a new conntrack was created * nf_ct_tuple_equal() returns true. But this conntrack is not initialized yet, so it can not be used by two threads concurrently. In this case BUG_ON may be triggered from nf_nat_setup_info(). Florian Westphal suggested to check the confirm bit too. I think it's right. task 1 task 2 task 3 nf_conntrack_find_get ____nf_conntrack_find destroy_conntrack hlist_nulls_del_rcu nf_conntrack_free kmem_cache_free __nf_conntrack_alloc kmem_cache_alloc memset(&ct->tuplehash[IP_CT_DIR_MAX], if (nf_ct_is_dying(ct)) if (!nf_ct_tuple_equal() I'm not sure, that I have ever seen this race condition in a real life. Currently we are investigating a bug, which is reproduced on a few nodes. In our case one conntrack is initialized from a few tasks concurrently, we don't have any other explanation for this. <2>[46267.083061] kernel BUG at net/ipv4/netfilter/nf_nat_core.c:322! ... <4>[46267.083951] RIP: 0010:[<ffffffffa01e00a4>] [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 [nf_nat] ... <4>[46267.085549] Call Trace: <4>[46267.085622] [<ffffffffa023421b>] alloc_null_binding+0x5b/0xa0 [iptable_nat] <4>[46267.085697] [<ffffffffa02342bc>] nf_nat_rule_find+0x5c/0x80 [iptable_nat] <4>[46267.085770] [<ffffffffa0234521>] nf_nat_fn+0x111/0x260 [iptable_nat] <4>[46267.085843] [<ffffffffa0234798>] nf_nat_out+0x48/0xd0 [iptable_nat] <4>[46267.085919] [<ffffffff814841b9>] nf_iterate+0x69/0xb0 <4>[46267.085991] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0 <4>[46267.086063] [<ffffffff81484374>] nf_hook_slow+0x74/0x110 <4>[46267.086133] [<ffffffff81494e70>] ? ip_finish_output+0x0/0x2f0 <4>[46267.086207] [<ffffffff814b5890>] ? dst_output+0x0/0x20 <4>[46267.086277] [<ffffffff81495204>] ip_output+0xa4/0xc0 <4>[46267.086346] [<ffffffff814b65a4>] raw_sendmsg+0x8b4/0x910 <4>[46267.086419] [<ffffffff814c10fa>] inet_sendmsg+0x4a/0xb0 <4>[46267.086491] [<ffffffff814459aa>] ? sock_update_classid+0x3a/0x50 <4>[46267.086562] [<ffffffff81444d67>] sock_sendmsg+0x117/0x140 <4>[46267.086638] [<ffffffff8151997b>] ? _spin_unlock_bh+0x1b/0x20 <4>[46267.086712] [<ffffffff8109d370>] ? autoremove_wake_function+0x0/0x40 <4>[46267.086785] [<ffffffff81495e80>] ? do_ip_setsockopt+0x90/0xd80 <4>[46267.086858] [<ffffffff8100be0e>] ? call_function_interrupt+0xe/0x20 <4>[46267.086936] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90 <4>[46267.087006] [<ffffffff8118cb10>] ? ub_slab_ptr+0x20/0x90 <4>[46267.087081] [<ffffffff8118f2e8>] ? kmem_cache_alloc+0xd8/0x1e0 <4>[46267.087151] [<ffffffff81445599>] sys_sendto+0x139/0x190 <4>[46267.087229] [<ffffffff81448c0d>] ? sock_setsockopt+0x16d/0x6f0 <4>[46267.087303] [<ffffffff810efa47>] ? audit_syscall_entry+0x1d7/0x200 <4>[46267.087378] [<ffffffff810ef795>] ? __audit_syscall_exit+0x265/0x290 <4>[46267.087454] [<ffffffff81474885>] ? compat_sys_setsockopt+0x75/0x210 <4>[46267.087531] [<ffffffff81474b5f>] compat_sys_socketcall+0x13f/0x210 <4>[46267.087607] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5 <4>[46267.087676] Code: 91 20 e2 01 75 29 48 89 de 4c 89 f7 e8 56 fa ff ff 85 c0 0f 84 68 fc ff ff 0f b6 4d c6 41 8b 45 00 e9 4d fb ff ff e8 7c 19 e9 e0 <0f> 0b eb fe f6 05 17 91 20 e2 80 74 ce 80 3d 5f 2e 00 00 00 74 <1>[46267.088023] RIP [<ffffffffa01e00a4>] nf_nat_setup_info+0x564/0x590 Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Florian Westphal <fw@strlen.de> Cc: Pablo Neira Ayuso <pablo@netfilter.org> Cc: Patrick McHardy <kaber@trash.net> Cc: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> Cc: "David S. Miller" <davem@davemloft.net> Cc: Cyrill Gorcunov <gorcunov@openvz.org> Signed-off-by: Andrey Vagin <avagin@openvz.org> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_conntrack_core.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8824ed0ccc9c..4d1fb5d094c3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -312,6 +312,21 @@ static void death_by_timeout(unsigned long ul_conntrack) nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); } +static inline bool +nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, + const struct nf_conntrack_tuple *tuple, + u16 zone) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + /* A conntrack can be recreated with the equal tuple, + * so we need to check that the conntrack is confirmed + */ + return nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone && + nf_ct_is_confirmed(ct); +} + /* * Warning : * - Caller must take a reference on returned object @@ -333,8 +348,7 @@ ____nf_conntrack_find(struct net *net, u16 zone, local_bh_disable(); begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { + if (nf_ct_key_equal(h, tuple, zone)) { NF_CT_STAT_INC(net, found); local_bh_enable(); return h; @@ -372,8 +386,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || - nf_ct_zone(ct) != zone)) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { nf_ct_put(ct); goto begin; } -- cgit From 829d9315c46a2be57a8fb40c89aeb7db61513d96 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Mon, 3 Feb 2014 13:07:24 +0100 Subject: netfilter: nf_nat_h323: fix crash in nf_ct_unlink_expect_report() Similar bug fixed in SIP module in 3f509c6 ("netfilter: nf_nat_sip: fix incorrect handling of EBUSY for RTCP expectation"). BUG: unable to handle kernel paging request at 00100104 IP: [<f8214f07>] nf_ct_unlink_expect_report+0x57/0xf0 [nf_conntrack] ... Call Trace: [<c0244bd8>] ? del_timer+0x48/0x70 [<f8215687>] nf_ct_remove_expectations+0x47/0x60 [nf_conntrack] [<f8211c99>] nf_ct_delete_from_lists+0x59/0x90 [nf_conntrack] [<f8212e5e>] death_by_timeout+0x14e/0x1c0 [nf_conntrack] [<f8212d10>] ? nf_conntrack_set_hashsize+0x190/0x190 [nf_conntrack] [<c024442d>] call_timer_fn+0x1d/0x80 [<c024461e>] run_timer_softirq+0x18e/0x1a0 [<f8212d10>] ? nf_conntrack_set_hashsize+0x190/0x190 [nf_conntrack] [<c023e6f3>] __do_softirq+0xa3/0x170 [<c023e650>] ? __local_bh_enable+0x70/0x70 <IRQ> [<c023e587>] ? irq_exit+0x67/0xa0 [<c0202af6>] ? do_IRQ+0x46/0xb0 [<c027ad05>] ? clockevents_notify+0x35/0x110 [<c066ac6c>] ? common_interrupt+0x2c/0x40 [<c056e3c1>] ? cpuidle_enter_state+0x41/0xf0 [<c056e6fb>] ? cpuidle_idle_call+0x8b/0x100 [<c02085f8>] ? arch_cpu_idle+0x8/0x30 [<c027314b>] ? cpu_idle_loop+0x4b/0x140 [<c0273258>] ? cpu_startup_entry+0x18/0x20 [<c066056d>] ? rest_init+0x5d/0x70 [<c0813ac8>] ? start_kernel+0x2ec/0x2f2 [<c081364f>] ? repair_env_string+0x5b/0x5b [<c0813269>] ? i386_start_kernel+0x33/0x35 Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/ipv4/netfilter/nf_nat_h323.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9eea059dd621..574f7ebba0b6 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -229,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); nated_port = 0; break; -- cgit From e53376bef2cd97d3e3f61fdc677fb8da7d03d0da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Mon, 3 Feb 2014 20:01:53 +0100 Subject: netfilter: nf_conntrack: don't release a conntrack with non-zero refcnt With this patch, the conntrack refcount is initially set to zero and it is bumped once it is added to any of the list, so we fulfill Eric's golden rule which is that all released objects always have a refcount that equals zero. Andrey Vagin reports that nf_conntrack_free can't be called for a conntrack with non-zero ref-counter, because it can race with nf_conntrack_find_get(). A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero ref-counter says that this conntrack is used. So when we release a conntrack with non-zero counter, we break this assumption. CPU1 CPU2 ____nf_conntrack_find() nf_ct_put() destroy_conntrack() ... init_conntrack __nf_conntrack_alloc (set use = 1) atomic_inc_not_zero(&ct->use) (use = 2) if (!l4proto->new(ct, skb, dataoff, timeouts)) nf_conntrack_free(ct); (use = 2 !!!) ... __nf_conntrack_alloc (set use = 1) if (!nf_ct_key_equal(h, tuple, zone)) nf_ct_put(ct); (use = 0) destroy_conntrack() /* continue to work with CT */ After applying the path "[PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get" another bug was triggered in destroy_conntrack(): <4>[67096.759334] ------------[ cut here ]------------ <2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211! ... <4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G C --------------- 2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB <4>[67096.759932] RIP: 0010:[<ffffffffa03d99ac>] [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack] <4>[67096.760255] Call Trace: <4>[67096.760255] [<ffffffff814844a7>] nf_conntrack_destroy+0x17/0x30 <4>[67096.760255] [<ffffffffa03d9bb5>] nf_conntrack_find_get+0x85/0x130 [nf_conntrack] <4>[67096.760255] [<ffffffffa03d9fb2>] nf_conntrack_in+0x352/0xb60 [nf_conntrack] <4>[67096.760255] [<ffffffffa048c771>] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4] <4>[67096.760255] [<ffffffff81484419>] nf_iterate+0x69/0xb0 <4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20 <4>[67096.760255] [<ffffffff814845d4>] nf_hook_slow+0x74/0x110 <4>[67096.760255] [<ffffffff814b5b00>] ? dst_output+0x0/0x20 <4>[67096.760255] [<ffffffff814b66d5>] raw_sendmsg+0x775/0x910 <4>[67096.760255] [<ffffffff8104c5a8>] ? flush_tlb_others_ipi+0x128/0x130 <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [<ffffffff814c136a>] inet_sendmsg+0x4a/0xb0 <4>[67096.760255] [<ffffffff81444e93>] ? sock_sendmsg+0x13/0x140 <4>[67096.760255] [<ffffffff81444f97>] sock_sendmsg+0x117/0x140 <4>[67096.760255] [<ffffffff8102e299>] ? native_smp_send_reschedule+0x49/0x60 <4>[67096.760255] [<ffffffff81519beb>] ? _spin_unlock_bh+0x1b/0x20 <4>[67096.760255] [<ffffffff8109d930>] ? autoremove_wake_function+0x0/0x40 <4>[67096.760255] [<ffffffff814960f0>] ? do_ip_setsockopt+0x90/0xd80 <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20 <4>[67096.760255] [<ffffffff814457c9>] sys_sendto+0x139/0x190 <4>[67096.760255] [<ffffffff810efa77>] ? audit_syscall_entry+0x1d7/0x200 <4>[67096.760255] [<ffffffff810ef7c5>] ? __audit_syscall_exit+0x265/0x290 <4>[67096.760255] [<ffffffff81474daf>] compat_sys_socketcall+0x13f/0x210 <4>[67096.760255] [<ffffffff8104dea3>] ia32_sysret+0x0/0x5 I have reused the original title for the RFC patch that Andrey posted and most of the original patch description. Cc: Eric Dumazet <edumazet@google.com> Cc: Andrew Vagin <avagin@parallels.com> Cc: Florian Westphal <fw@strlen.de> Reported-by: Andrew Vagin <avagin@parallels.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Reviewed-by: Eric Dumazet <edumazet@google.com> Acked-by: Andrew Vagin <avagin@parallels.com> --- include/net/netfilter/nf_conntrack.h | 2 ++ net/netfilter/nf_conntrack_core.c | 34 +++++++++++++++++++++++++++++----- net/netfilter/nf_synproxy_core.c | 5 ++--- net/netfilter/xt_CT.c | 7 +------ 4 files changed, 34 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 01ea6eed1bb1..b2ac6246b7e0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -284,6 +284,8 @@ extern unsigned int nf_conntrack_max; extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl); + #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4d1fb5d094c3..356bef519fe5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -448,7 +448,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto out; add_timer(&ct->timeout); - nf_conntrack_get(&ct->ct_general); + smp_wmb(); + /* The caller holds a reference to this object */ + atomic_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, repl_hash); NF_CT_STAT_INC(net, insert); spin_unlock_bh(&nf_conntrack_lock); @@ -462,6 +464,21 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +/* deletion from this larval template list happens via nf_ct_put() */ +void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) +{ + __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + nf_conntrack_get(&tmpl->ct_general); + + spin_lock_bh(&nf_conntrack_lock); + /* Overload tuple linked list to put us in template list. */ + hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.tmpl); + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -733,11 +750,10 @@ __nf_conntrack_alloc(struct net *net, u16 zone, nf_ct_zone->id = zone; } #endif - /* - * changes to lookup keys must be done before setting refcnt to 1 + /* Because we use RCU lookups, we set ct_general.use to zero before + * this is inserted in any list. */ - smp_wmb(); - atomic_set(&ct->ct_general.use, 1); + atomic_set(&ct->ct_general.use, 0); return ct; #ifdef CONFIG_NF_CONNTRACK_ZONES @@ -761,6 +777,11 @@ void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + /* A freed object has refcnt == 0, that's + * the golden rule for SLAB_DESTROY_BY_RCU + */ + NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); @@ -856,6 +877,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, NF_CT_STAT_INC(net, new); } + /* Now it is inserted into the unconfirmed list, bump refcount */ + nf_conntrack_get(&ct->ct_general); + /* Overload tuple linked list to put us in unconfirmed list. */ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &net->ct.unconfirmed); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9858e3e51a3a..52e20c9a46a5 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -363,9 +363,8 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; if (!nfct_synproxy_ext_add(ct)) goto err2; - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_tmpl_insert(net, ct); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); @@ -390,7 +389,7 @@ static void __net_exit synproxy_net_exit(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - nf_conntrack_free(snet->tmpl); + nf_ct_put(snet->tmpl); synproxy_proc_exit(net); free_percpu(snet->stats); } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 5929be622c5c..75747aecdebe 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -228,12 +228,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err3; } - __set_bit(IPS_TEMPLATE_BIT, &ct->status); - __set_bit(IPS_CONFIRMED_BIT, &ct->status); - - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &par->net->ct.tmpl); + nf_conntrack_tmpl_insert(par->net, ct); out: info->ct = ct; return 0; -- cgit From 53b70287ddf487a38b7cbf0a10db28f40714b799 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Wed, 5 Feb 2014 12:26:22 +0100 Subject: netfilter: nf_tables: fix overrun in nf_tables_set_alloc_name() The map that is used to allocate anonymous sets is indeed BITS_PER_BYTE * PAGE_SIZE long. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9ce30534f853..2a22a186eb3d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1989,13 +1989,13 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, if (!sscanf(i->name, name, &tmp)) continue; - if (tmp < 0 || tmp > BITS_PER_LONG * PAGE_SIZE) + if (tmp < 0 || tmp >= BITS_PER_BYTE * PAGE_SIZE) continue; set_bit(tmp, inuse); } - n = find_first_zero_bit(inuse, BITS_PER_LONG * PAGE_SIZE); + n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); free_page((unsigned long)inuse); } -- cgit From ec2c9935688fbd5eaa7c975e3e21562c3da77363 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Wed, 5 Feb 2014 15:03:35 +0000 Subject: netfilter: nf_tables: fix potential oops when dumping sets Commit c9c8e48597 (netfilter: nf_tables: dump sets in all existing families) changed nft_ctx_init_from_setattr() to only look up the address family if it is not NFPROTO_UNSPEC. However if it is NFPROTO_UNSPEC and a table attribute is given, nftables_afinfo_lookup() will dereference the NULL afi pointer. Fix by checking for non-NULL afi and also move a check added by that commit to the proper position. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_api.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2a22a186eb3d..3c5a219f4242 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1943,6 +1943,9 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, } if (nla[NFTA_SET_TABLE] != NULL) { + if (afi == NULL) + return -EAFNOSUPPORT; + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); @@ -2428,6 +2431,8 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int err; + if (nfmsg->nfgen_family == NFPROTO_UNSPEC) + return -EAFNOSUPPORT; if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; @@ -2435,9 +2440,6 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (err < 0) return err; - if (nfmsg->nfgen_family == NFPROTO_UNSPEC) - return -EAFNOSUPPORT; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); -- cgit From 51292c0735eb2d9e29115cbf6264845e19a6c77d Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Wed, 5 Feb 2014 15:03:36 +0000 Subject: netfilter: nft_ct: fix missing NFT_CT_L3PROTOCOL key in validity checks The key was missing in the list of valid keys, add it. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nft_ct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index feaf0f354a93..46e275403838 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -226,6 +226,7 @@ static int nft_ct_init_validate_get(const struct nft_expr *expr, if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; break; + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: -- cgit From 64d46806b6218c97f68742c5663a8ae3a5fbe838 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Wed, 5 Feb 2014 15:03:37 +0000 Subject: netfilter: nf_tables: add AF specific expression support For the reject module, we need to add AF-specific implementations to get rid of incorrect module dependencies. Try to load an AF-specific module first and fall back to generic modules. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 22 ++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 57c8ff7955df..0f68e47d3e5e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -252,6 +252,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number + * @family: address family for AF-specific types */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, @@ -262,6 +263,7 @@ struct nft_expr_type { struct module *owner; const struct nla_policy *policy; unsigned int maxattr; + u8 family; }; /** @@ -529,6 +531,9 @@ void nft_unregister_expr(struct nft_expr_type *); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) +#define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ + MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) + #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3c5a219f4242..113c469c7579 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1114,35 +1114,45 @@ void nft_unregister_expr(struct nft_expr_type *type) } EXPORT_SYMBOL_GPL(nft_unregister_expr); -static const struct nft_expr_type *__nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *__nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name)) + if (!nla_strcmp(nla, type->name) && + (!type->family || type->family == family)) return type; } return NULL; } -static const struct nft_expr_type *nft_expr_type_get(struct nlattr *nla) +static const struct nft_expr_type *nft_expr_type_get(u8 family, + struct nlattr *nla) { const struct nft_expr_type *type; if (nla == NULL) return ERR_PTR(-EINVAL); - type = __nft_expr_type_get(nla); + type = __nft_expr_type_get(family, nla); if (type != NULL && try_module_get(type->owner)) return type; #ifdef CONFIG_MODULES if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_expr_type_get(family, nla)) + return ERR_PTR(-EAGAIN); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); request_module("nft-expr-%.*s", nla_len(nla), (char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (__nft_expr_type_get(nla)) + if (__nft_expr_type_get(family, nla)) return ERR_PTR(-EAGAIN); } #endif @@ -1193,7 +1203,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (err < 0) return err; - type = nft_expr_type_get(tb[NFTA_EXPR_NAME]); + type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]); if (IS_ERR(type)) return PTR_ERR(type); -- cgit From cc4723ca316742891954efa346298e7c747c0d17 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Wed, 5 Feb 2014 15:03:38 +0000 Subject: netfilter: nft_reject: split up reject module into IPv4 and IPv6 specifc parts Currently the nft_reject module depends on symbols from ipv6. This is wrong since no generic module should force IPv6 support to be loaded. Split up the module into AF-specific and a generic part. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nft_reject.h | 17 +++++++ net/ipv4/netfilter/Kconfig | 5 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_reject_ipv4.c | 74 ++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 5 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 75 ++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 1 - net/netfilter/nft_reject.c | 89 ++++-------------------------------- 9 files changed, 187 insertions(+), 81 deletions(-) create mode 100644 include/net/netfilter/nft_reject.h create mode 100644 net/ipv4/netfilter/nft_reject_ipv4.c create mode 100644 net/ipv6/netfilter/nft_reject_ipv6.c (limited to 'net') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h new file mode 100644 index 000000000000..ecda75945e77 --- /dev/null +++ b/include/net/netfilter/nft_reject.h @@ -0,0 +1,17 @@ +#ifndef _NFT_REJECT_H_ +#define _NFT_REJECT_H_ + +struct nft_reject { + enum nft_reject_types type:8; + u8 icmp_code; +}; + +extern const struct nla_policy nft_reject_policy[]; + +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); + +#endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 81c6910cfa92..a26ce035e3fa 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,6 +61,11 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + default NFT_REJECT + tristate + config NF_TABLES_ARP depends on NF_TABLES tristate "ARP nf_tables support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c16be9d58420..90b82405331e 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 000000000000..e935d8de1182 --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { + .type = &nft_reject_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv4_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "reject", + .ops = &nft_reject_ipv4_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv4_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv4_type); +} + +static void __exit nft_reject_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv4_type); +} + +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 35750df744dc..4bff1f297e39 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -50,6 +50,11 @@ config NFT_CHAIN_NAT_IPV6 packet transformations such as the source, destination address and source and destination ports. +config NFT_REJECT_IPV6 + depends on NF_TABLES_IPV6 + default NFT_REJECT + tristate + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index d1b4928f34f7..70d3dd66f2cd 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o +obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c new file mode 100644 index 000000000000..f73285924144 --- /dev/null +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +static void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static struct nft_expr_type nft_reject_ipv6_type; +static const struct nft_expr_ops nft_reject_ipv6_ops = { + .type = &nft_reject_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv6_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "reject", + .ops = &nft_reject_ipv6_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv6_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv6_type); +} + +static void __exit nft_reject_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv6_type); +} + +module_init(nft_reject_ipv6_module_init); +module_exit(nft_reject_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c37467562fd0..ed8b50e62276 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -513,7 +513,6 @@ config NFT_QUEUE config NFT_REJECT depends on NF_TABLES - depends on NF_TABLES_IPV6 || !NF_TABLES_IPV6 default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" help diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 5e204711d704..f3448c296446 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -16,65 +16,23 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> -#include <net/icmp.h> -#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) -#include <net/netfilter/ipv6/nf_reject.h> -#endif - -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; - u8 family; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) -{ - struct nft_reject *priv = nft_expr_priv(expr); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); -#endif - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (priv->family == NFPROTO_IPV4) - nf_send_unreach(pkt->skb, priv->icmp_code); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - else if (priv->family == NFPROTO_IPV6) - nf_send_unreach6(net, pkt->skb, priv->icmp_code, - pkt->ops->hooknum); -#endif - break; - case NFT_REJECT_TCP_RST: - if (priv->family == NFPROTO_IPV4) - nf_send_reset(pkt->skb, pkt->ops->hooknum); -#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) - else if (priv->family == NFPROTO_IPV6) - nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); -#endif - break; - } - - data[NFT_REG_VERDICT].verdict = NF_DROP; -} - -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { +const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; +EXPORT_SYMBOL_GPL(nft_reject_policy); -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; - priv->family = ctx->afi->family; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: @@ -89,8 +47,9 @@ static int nft_reject_init(const struct nft_ctx *ctx, return 0; } +EXPORT_SYMBOL_GPL(nft_reject_init); -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_reject *priv = nft_expr_priv(expr); @@ -109,37 +68,7 @@ static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) nla_put_failure: return -1; } - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, -}; - -static struct nft_expr_type nft_reject_type __read_mostly = { - .name = "reject", - .ops = &nft_reject_ops, - .policy = nft_reject_policy, - .maxattr = NFTA_REJECT_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_reject_module_init(void) -{ - return nft_register_expr(&nft_reject_type); -} - -static void __exit nft_reject_module_exit(void) -{ - nft_unregister_expr(&nft_reject_type); -} - -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); +EXPORT_SYMBOL_GPL(nft_reject_dump); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("reject"); -- cgit From 05513e9e33dbded8124567466a444d32173eecc6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Wed, 5 Feb 2014 15:03:39 +0000 Subject: netfilter: nf_tables: add reject module for NFPROTO_INET Add a reject module for NFPROTO_INET. It does nothing but dispatch to the AF-specific modules based on the hook family. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nft_reject.h | 8 +++++ net/ipv4/netfilter/nft_reject_ipv4.c | 7 ++-- net/ipv6/netfilter/nft_reject_ipv6.c | 7 ++-- net/netfilter/Kconfig | 5 +++ net/netfilter/Makefile | 1 + net/netfilter/nft_reject_inet.c | 63 ++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 net/netfilter/nft_reject_inet.c (limited to 'net') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index ecda75945e77..36b0da2d55bb 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -14,4 +14,12 @@ int nft_reject_init(const struct nft_ctx *ctx, int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt); + +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt); + #endif diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e935d8de1182..e79718a382f2 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -20,9 +20,9 @@ #include <net/netfilter/ipv4/nf_reject.h> #include <net/netfilter/nft_reject.h> -static void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -37,6 +37,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); static struct nft_expr_type nft_reject_ipv4_type; static const struct nft_expr_ops nft_reject_ipv4_ops = { diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index f73285924144..0bc19fa87821 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -19,9 +19,9 @@ #include <net/netfilter/nft_reject.h> #include <net/netfilter/ipv6/nf_reject.h> -static void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); @@ -38,6 +38,7 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } +EXPORT_SYMBOL_GPL(nft_reject_ipv6_eval); static struct nft_expr_type nft_reject_ipv6_type; static const struct nft_expr_ops nft_reject_ipv6_ops = { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ed8b50e62276..e9410d17619d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -520,6 +520,11 @@ config NFT_REJECT explicitly deny and notify via TCP reset/ICMP informational errors unallowed traffic. +config NFT_REJECT_INET + depends on NF_TABLES_INET + default NFT_REJECT + tristate + config NFT_COMPAT depends on NF_TABLES depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ee9c4de5f8ed..bffdad774da7 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -79,6 +79,7 @@ obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o +obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c new file mode 100644 index 000000000000..8a310f239c93 --- /dev/null +++ b/net/netfilter/nft_reject_inet.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> + +static void nft_reject_inet_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + nft_reject_ipv4_eval(expr, data, pkt); + case NFPROTO_IPV6: + nft_reject_ipv6_eval(expr, data, pkt); + } +} + +static struct nft_expr_type nft_reject_inet_type; +static const struct nft_expr_ops nft_reject_inet_ops = { + .type = &nft_reject_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_inet_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "reject", + .ops = &nft_reject_inet_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_inet_module_init(void) +{ + return nft_register_expr(&nft_reject_inet_type); +} + +static void __exit nft_reject_inet_module_exit(void) +{ + nft_unregister_expr(&nft_reject_inet_type); +} + +module_init(nft_reject_inet_module_init); +module_exit(nft_reject_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(1, "reject"); -- cgit From 2f617435c3a6fe3f39efb9ae2baa77de2d6c97b8 Mon Sep 17 00:00:00 2001 From: Eliad Peller <eliad@wizery.com> Date: Sun, 12 Jan 2014 11:06:37 +0200 Subject: mac80211: move roc cookie assignment earlier ieee80211_start_roc_work() might add a new roc to existing roc, and tell cfg80211 it has already started. However, this might happen before the roc cookie was set, resulting in REMAIN_ON_CHANNEL (started) event with null cookie. Consequently, it can make wpa_supplicant go out of sync. Fix it by setting the roc cookie earlier. Cc: stable@vger.kernel.org Signed-off-by: Eliad Peller <eliad@wizery.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/cfg.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f9ae9b85d4c1..94b4acb5aabb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2638,6 +2638,24 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); + /* + * cookie is either the roc cookie (for normal roc) + * or the SKB (for mgmt TX) + */ + if (!txskb) { + /* local->mtx protects this */ + local->roc_cookie_counter++; + roc->cookie = local->roc_cookie_counter; + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(roc->cookie == 0)) { + roc->cookie = 1; + local->roc_cookie_counter++; + } + *cookie = roc->cookie; + } else { + *cookie = (unsigned long)txskb; + } + /* if there's one pending or we're scanning, queue this one */ if (!list_empty(&local->roc_list) || local->scanning || local->radar_detect_enabled) @@ -2772,24 +2790,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!queued) list_add_tail(&roc->list, &local->roc_list); - /* - * cookie is either the roc cookie (for normal roc) - * or the SKB (for mgmt TX) - */ - if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } - *cookie = roc->cookie; - } else { - *cookie = (unsigned long)txskb; - } - return 0; } -- cgit From f12cb2893069495726c21a4b0178705dacfecfe0 Mon Sep 17 00:00:00 2001 From: Pontus Fuchs <pontus.fuchs@gmail.com> Date: Thu, 16 Jan 2014 15:00:40 +0100 Subject: nl80211: Reset split_start when netlink skb is exhausted When the netlink skb is exhausted split_start is left set. In the subsequent retry, with a larger buffer, the dump is continued from the failing point instead of from the beginning. This was causing my rt28xx based USB dongle to now show up when running "iw list" with an old iw version without split dump support. Cc: stable@vger.kernel.org Fixes: 3713b4e364ef ("nl80211: allow splitting wiphy information in dumps") Signed-off-by: Pontus Fuchs <pontus.fuchs@gmail.com> [avoid the entire workaround when state->split is set] Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7a742594916e..6ea960b1a8eb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1719,9 +1719,10 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) * We can then retry with the larger buffer. */ if ((ret == -ENOBUFS || ret == -EMSGSIZE) && - !skb->len && + !skb->len && !state->split && cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 4096; + state->split_start = 0; rtnl_unlock(); return 1; } -- cgit From 5a6aa705ffdd97552ff756bbfa7d5a3b62a6c690 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Thu, 23 Jan 2014 16:32:29 +0100 Subject: cfg80211: re-enable 5/10 MHz support Unfortunately I forgot this during the merge window, but the patch seems small enough to go in as a fix. The userspace API bug that was the reason for disabling it has long been fixed. Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index d89dee2259b5..77fe4c791269 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -440,9 +440,6 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; - /* support for 5/10 MHz is broken due to nl80211 API mess - disable */ - wiphy->flags &= ~WIPHY_FLAG_SUPPORTS_5_10_MHZ; - /* * There are major locking problems in nl80211/mac80211 for CSA, * disable for all drivers until this has been reworked. -- cgit From 8ffcc704c963b4157391bd87a4544cdfd18b574d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach <emmanuel.grumbach@intel.com> Date: Thu, 23 Jan 2014 14:28:16 +0200 Subject: mac80211: avoid deadlock revealed by lockdep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sdata->u.ap.request_smps_work can’t be flushed synchronously under wdev_lock(wdev) since ieee80211_request_smps_ap_work itself locks the same lock. While at it, reset the driver_smps_mode when the ap is stopped to its default: OFF. This solves: ====================================================== [ INFO: possible circular locking dependency detected ] 3.12.0-ipeer+ #2 Tainted: G O ------------------------------------------------------- rmmod/2867 is trying to acquire lock: ((&sdata->u.ap.request_smps_work)){+.+...}, at: [<c105b8d0>] flush_work+0x0/0x90 but task is already holding lock: (&wdev->mtx){+.+.+.}, at: [<f9b32626>] cfg80211_stop_ap+0x26/0x230 [cfg80211] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&wdev->mtx){+.+.+.}: [<c10aefa9>] lock_acquire+0x79/0xe0 [<c1607a1a>] mutex_lock_nested+0x4a/0x360 [<fb06288b>] ieee80211_request_smps_ap_work+0x2b/0x50 [mac80211] [<c105cdd8>] process_one_work+0x198/0x450 [<c105d469>] worker_thread+0xf9/0x320 [<c10669ff>] kthread+0x9f/0xb0 [<c1613397>] ret_from_kernel_thread+0x1b/0x28 -> #0 ((&sdata->u.ap.request_smps_work)){+.+...}: [<c10ae9df>] __lock_acquire+0x183f/0x1910 [<c10aefa9>] lock_acquire+0x79/0xe0 [<c105b917>] flush_work+0x47/0x90 [<c105d867>] __cancel_work_timer+0x67/0xe0 [<c105d90f>] cancel_work_sync+0xf/0x20 [<fb0765cc>] ieee80211_stop_ap+0x8c/0x340 [mac80211] [<f9b3268c>] cfg80211_stop_ap+0x8c/0x230 [cfg80211] [<f9b0d8f9>] cfg80211_leave+0x79/0x100 [cfg80211] [<f9b0da72>] cfg80211_netdev_notifier_call+0xf2/0x4f0 [cfg80211] [<c160f2c9>] notifier_call_chain+0x59/0x130 [<c106c6de>] __raw_notifier_call_chain+0x1e/0x30 [<c106c70f>] raw_notifier_call_chain+0x1f/0x30 [<c14f8213>] call_netdevice_notifiers_info+0x33/0x70 [<c14f8263>] call_netdevice_notifiers+0x13/0x20 [<c14f82a4>] __dev_close_many+0x34/0xb0 [<c14f83fe>] dev_close_many+0x6e/0xc0 [<c14f9c77>] rollback_registered_many+0xa7/0x1f0 [<c14f9dd4>] unregister_netdevice_many+0x14/0x60 [<fb06f4d9>] ieee80211_remove_interfaces+0xe9/0x170 [mac80211] [<fb055116>] ieee80211_unregister_hw+0x56/0x110 [mac80211] [<fa3e9396>] iwl_op_mode_mvm_stop+0x26/0xe0 [iwlmvm] [<f9b9d8ca>] _iwl_op_mode_stop+0x3a/0x70 [iwlwifi] [<f9b9d96f>] iwl_opmode_deregister+0x6f/0x90 [iwlwifi] [<fa405179>] __exit_compat+0xd/0x19 [iwlmvm] [<c10b8bf9>] SyS_delete_module+0x179/0x2b0 [<c1613421>] sysenter_do_call+0x12/0x32 Fixes: 687da132234f ("mac80211: implement SMPS for AP") Cc: <stable@vger.kernel.org> [3.13] Reported-by: Ilan Peer <ilan.peer@intel.com> Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/cfg.c | 3 +-- net/mac80211/ht.c | 4 +++- net/mac80211/iface.c | 15 +++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 94b4acb5aabb..33acdca4a1df 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1090,8 +1090,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; - cancel_work_sync(&sdata->u.ap.request_smps_work); - /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); @@ -1103,6 +1101,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); + sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF; __sta_info_flush(sdata, true); ieee80211_free_keys(sdata, true); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index fab7b91923e0..70dd013de836 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -466,7 +466,9 @@ void ieee80211_request_smps_ap_work(struct work_struct *work) u.ap.request_smps_work); sdata_lock(sdata); - __ieee80211_request_smps_ap(sdata, sdata->u.ap.driver_smps_mode); + if (sdata_dereference(sdata->u.ap.beacon, sdata)) + __ieee80211_request_smps_ap(sdata, + sdata->u.ap.driver_smps_mode); sdata_unlock(sdata); } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 3dfd20a453ab..ae2eb148a028 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -770,12 +770,19 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_roc_purge(local, sdata); - if (sdata->vif.type == NL80211_IFTYPE_STATION) + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: ieee80211_mgd_stop(sdata); - - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + break; + case NL80211_IFTYPE_ADHOC: ieee80211_ibss_stop(sdata); - + break; + case NL80211_IFTYPE_AP: + cancel_work_sync(&sdata->u.ap.request_smps_work); + break; + default: + break; + } /* * Remove all stations associated with this interface. -- cgit From a617302c531eaf497ccd02a61d380efc119ba999 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 22 Jan 2014 11:14:18 +0200 Subject: cfg80211: fix scan done race When an interface/wdev is removed, any ongoing scan should be cancelled by the driver. This will make it call cfg80211, which only queues a work struct. If interface/wdev removal is quick enough, this can leave the scan request pending and processed only after the interface is gone, causing a use-after-free. Fix this by making sure the scan request is not pending after the interface is destroyed. We can't flush or cancel the work item due to locking concerns, but when it'll run it shouldn't find anything to do. This leaves a potential issue, if a new scan gets requested before the work runs, it prematurely stops the running scan, potentially causing another crash. I'll fix that in the next patch. This was particularly observed with P2P_DEVICE wdevs, likely because freeing them is quicker than freeing netdevs. Reported-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com> Fixes: 4a58e7c38443 ("cfg80211: don't "leak" uncompleted scans") Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/core.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 77fe4c791269..02ed00dbf2df 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -203,8 +203,11 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; - WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev && - !rdev->scan_req->notified); + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (WARN_ON(!rdev->scan_req->notified)) + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev); + } } static int cfg80211_rfkill_set_block(void *data, bool blocked) @@ -856,8 +859,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; case NETDEV_DOWN: cfg80211_update_iface_num(rdev, wdev->iftype, -1); - WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev && - !rdev->scan_req->notified); + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (WARN_ON(!rdev->scan_req->notified)) + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev); + } if (WARN_ON(rdev->sched_scan_req && rdev->sched_scan_req->dev == wdev->netdev)) { -- cgit From f9d15d162b3acf28f85b3ac05c4883e5ed588d28 Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 22 Jan 2014 11:14:19 +0200 Subject: cfg80211: send scan results from work queue Due to the previous commit, when a scan finishes, it is in theory possible to hit the following sequence: 1. interface starts being removed 2. scan is cancelled by driver and cfg80211 is notified 3. scan done work is scheduled 4. interface is removed completely, rdev->scan_req is freed, event sent to userspace but scan done work remains pending 5. new scan is requested on another virtual interface 6. scan done work runs, freeing the still-running scan To fix this situation, hang on to the scan done message and block new scans while that is the case, and only send the message from the work function, regardless of whether the scan_req is already freed from interface removal. This makes step 5 above impossible and changes step 6 to be 5. scan done work runs, sending the scan done message As this can't work for wext, so we send the message immediately, but this shouldn't be an issue since we still return -EBUSY. Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/wireless/core.c | 4 ++-- net/wireless/core.h | 4 +++- net/wireless/nl80211.c | 29 ++++++++++------------------- net/wireless/nl80211.h | 8 ++++---- net/wireless/scan.c | 40 +++++++++++++++++++++++++--------------- net/wireless/sme.c | 2 +- 6 files changed, 45 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 02ed00dbf2df..010892b81a06 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -206,7 +206,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, false); } } @@ -862,7 +862,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, false); } if (WARN_ON(rdev->sched_scan_req && diff --git a/net/wireless/core.h b/net/wireless/core.h index 37ec16d7bb1a..f1d193b557b6 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -62,6 +62,7 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct sk_buff *scan_msg; struct cfg80211_sched_scan_request *sched_scan_req; unsigned long suspend_at; struct work_struct scan_done_wk; @@ -361,7 +362,8 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); void __cfg80211_scan_done(struct work_struct *wk); -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev); +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message); void __cfg80211_sched_scan_results(struct work_struct *wk); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, bool driver_initiated); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6ea960b1a8eb..4fe2e6e2bc76 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5245,7 +5245,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto unlock; } @@ -10012,40 +10012,31 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, NL80211_MCGRP_SCAN, GFP_KERNEL); } -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted) { struct sk_buff *msg; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return NULL; if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + aborted ? NL80211_CMD_SCAN_ABORTED : + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { nlmsg_free(msg); - return; + return NULL; } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, - NL80211_MCGRP_SCAN, GFP_KERNEL); + return msg; } -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { - struct sk_buff *msg; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_SCAN_ABORTED) < 0) { - nlmsg_free(msg); - return; - } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, NL80211_MCGRP_SCAN, GFP_KERNEL); } diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b1b231324e10..75799746d845 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -8,10 +8,10 @@ void nl80211_exit(void); void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted); +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd); void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index b528e31da2cf..d1ed4aebbbb7 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -161,18 +161,25 @@ static void __cfg80211_bss_expire(struct cfg80211_registered_device *dev, dev->bss_generation++; } -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message) { struct cfg80211_scan_request *request; struct wireless_dev *wdev; + struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_RTNL(); - request = rdev->scan_req; + if (rdev->scan_msg) { + nl80211_send_scan_result(rdev, rdev->scan_msg); + rdev->scan_msg = NULL; + return; + } + request = rdev->scan_req; if (!request) return; @@ -186,18 +193,16 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (request->aborted) { - nl80211_send_scan_aborted(rdev, wdev); - } else { - if (request->flags & NL80211_SCAN_FLAG_FLUSH) { - /* flush entries from previous scans */ - spin_lock_bh(&rdev->bss_lock); - __cfg80211_bss_expire(rdev, request->scan_start); - spin_unlock_bh(&rdev->bss_lock); - } - nl80211_send_scan_done(rdev, wdev); + if (!request->aborted && + request->flags & NL80211_SCAN_FLAG_FLUSH) { + /* flush entries from previous scans */ + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, request->scan_start); + spin_unlock_bh(&rdev->bss_lock); } + msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->aborted) { memset(&wrqu, 0, sizeof(wrqu)); @@ -211,6 +216,11 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) rdev->scan_req = NULL; kfree(request); + + if (!send_message) + rdev->scan_msg = msg; + else + nl80211_send_scan_result(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) @@ -221,7 +231,7 @@ void __cfg80211_scan_done(struct work_struct *wk) scan_done_wk); rtnl_lock(); - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, true); rtnl_unlock(); } @@ -1079,7 +1089,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto out; } @@ -1481,7 +1491,7 @@ int cfg80211_wext_giwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index a63509118508..f04d4c32e96e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -67,7 +67,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) ASSERT_RDEV_LOCK(rdev); ASSERT_WDEV_LOCK(wdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel) -- cgit From 0297ea17bf7879fb5846fafd1be4c0471e72848d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach <emmanuel.grumbach@intel.com> Date: Mon, 27 Jan 2014 11:07:42 +0200 Subject: mac80211: release the channel in error path in start_ap When the driver cannot start the AP or when the assignement of the beacon goes wrong, we need to unassign the vif. Cc: stable@vger.kernel.org Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/cfg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 33acdca4a1df..453e974287d1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1021,8 +1021,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, IEEE80211_P2P_OPPPS_ENABLE_BIT; err = ieee80211_assign_beacon(sdata, ¶ms->beacon); - if (err < 0) + if (err < 0) { + ieee80211_vif_release_channel(sdata); return err; + } changed |= err; err = drv_start_ap(sdata->local, sdata); @@ -1032,6 +1034,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(sdata->u.ap.beacon, NULL); + ieee80211_vif_release_channel(sdata); return err; } -- cgit From d4c80d9df6d1e4473b1409e4d220ca3d1612125c Mon Sep 17 00:00:00 2001 From: Sujith Manoharan <c_manoha@qca.qualcomm.com> Date: Thu, 30 Jan 2014 14:17:28 +0530 Subject: mac80211: Fix IBSS disconnect Currently, when a station leaves an IBSS network, the corresponding BSS is not dropped from cfg80211 if there are other active stations in the network. But, the small window that is present when trying to determine a station's status based on IEEE80211_IBSS_MERGE_INTERVAL introduces a race. Instead of trying to keep the BSS, always remove it when leaving an IBSS network. There is not much benefit to retain the BSS entry since it will be added with a subsequent join operation. This fixes an issue where a dangling BSS entry causes ath9k to wait for a beacon indefinitely. Cc: <stable@vger.kernel.org> Reported-by: Simon Wunderlich <sw@simonwunderlich.de> Signed-off-by: Sujith Manoharan <c_manoha@qca.qualcomm.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/ibss.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 771080ec7212..2796a198728f 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -695,12 +695,9 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; - int active_ibss; u16 capability; - active_ibss = ieee80211_sta_active_ibss(sdata); - - if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) { + if (!is_zero_ether_addr(ifibss->bssid)) { capability = WLAN_CAPABILITY_IBSS; if (ifibss->privacy) -- cgit From 338f977f4eb441e69bb9a46eaa0ac715c931a67f Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Sat, 1 Feb 2014 00:16:23 +0100 Subject: mac80211: fix fragmentation code, particularly for encryption The "new" fragmentation code (since my rewrite almost 5 years ago) erroneously sets skb->len rather than using skb_trim() to adjust the length of the first fragment after copying out all the others. This leaves the skb tail pointer pointing to after where the data originally ended, and thus causes the encryption MIC to be written at that point, rather than where it belongs: immediately after the data. The impact of this is that if software encryption is done, then a) encryption doesn't work for the first fragment, the connection becomes unusable as the first fragment will never be properly verified at the receiver, the MIC is practically guaranteed to be wrong b) we leak up to 8 bytes of plaintext (!) of the packet out into the air This is only mitigated by the fact that many devices are capable of doing encryption in hardware, in which case this can't happen as the tail pointer is irrelevant in that case. Additionally, fragmentation is not used very frequently and would normally have to be configured manually. Fix this by using skb_trim() properly. Cc: stable@vger.kernel.org Fixes: 2de8e0d999b8 ("mac80211: rewrite fragmentation") Reported-by: Jouni Malinen <j@w1.fi> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 27c990bf2320..97a02d3f7d87 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -878,7 +878,7 @@ static int ieee80211_fragment(struct ieee80211_tx_data *tx, } /* adjust first fragment's length */ - skb->len = hdrlen + per_fragm; + skb_trim(skb, hdrlen + per_fragm); return 0; } -- cgit From fab57a6cc227468ca9e6a4c7ff8d3b10727785ee Mon Sep 17 00:00:00 2001 From: Johannes Berg <johannes.berg@intel.com> Date: Wed, 29 Jan 2014 13:28:02 +0100 Subject: mac80211: fix virtual monitor interface iteration During channel context assignment, the interface should be found by interface iteration, so we need to assign the pointer before the channel context. Reported-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com> Tested-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com> Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/iface.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index ae2eb148a028..d6d1f1df9119 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -418,20 +418,24 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, sdata); + mutex_unlock(&local->iflist_mtx); + mutex_lock(&local->mtx); ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, IEEE80211_CHANCTX_EXCLUSIVE); mutex_unlock(&local->mtx); if (ret) { + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + synchronize_net(); drv_remove_interface(local, sdata); kfree(sdata); return ret; } - mutex_lock(&local->iflist_mtx); - rcu_assign_pointer(local->monitor_sdata, sdata); - mutex_unlock(&local->iflist_mtx); - return 0; } -- cgit From b8ecbee67c732ef9fc47fcf50aed6b7bb6231d98 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Thu, 6 Feb 2014 09:17:41 +0000 Subject: netfilter: nf_tables: fix log/queue expressions for NFPROTO_INET The log and queue expressions both store the family during ->init() and use it to deliver packets. This is wrong when used in NFPROTO_INET since they should both deliver to the actual AF of the packet, not the dummy NFPROTO_INET. Use the family from the hook ops to fix this. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nft_log.c | 5 +---- net/netfilter/nft_queue.c | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 5af790123ad8..26c5154e05f3 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -23,7 +23,6 @@ static const char *nft_log_null_prefix = ""; struct nft_log { struct nf_loginfo loginfo; char *prefix; - int family; }; static void nft_log_eval(const struct nft_expr *expr, @@ -33,7 +32,7 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_log *priv = nft_expr_priv(expr); struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, priv->family, pkt->ops->hooknum, pkt->skb, pkt->in, + nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in, pkt->out, &priv->loginfo, "%s", priv->prefix); } @@ -52,8 +51,6 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; - priv->family = ctx->afi->family; - nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index cbea473d69e9..e8ae2f6bf232 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -25,7 +25,6 @@ struct nft_queue { u16 queuenum; u16 queues_total; u16 flags; - u8 family; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -43,7 +42,7 @@ static void nft_queue_eval(const struct nft_expr *expr, queue = priv->queuenum + cpu % priv->queues_total; } else { queue = nfqueue_hash(pkt->skb, queue, - priv->queues_total, priv->family, + priv->queues_total, pkt->ops->pf, jhash_initval); } } @@ -71,7 +70,6 @@ static int nft_queue_init(const struct nft_ctx *ctx, return -EINVAL; init_hashrandom(&jhash_initval); - priv->family = ctx->afi->family; priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); if (tb[NFTA_QUEUE_TOTAL] != NULL) -- cgit From 0165d9325d6a3cf856e2cbbe64a0f4635ac75893 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Sat, 25 Jan 2014 14:03:51 +0100 Subject: netfilter: nf_tables: fix racy rule deletion We may lost race if we flush the rule-set (which happens asynchronously via call_rcu) and we try to remove the table (that userspace assumes to be empty). Fix this by recovering synchronous rule and chain deletion. This was introduced time ago before we had no batch support, and synchronous rule deletion performance was not good. Now that we have the batch support, we can just postpone the purge of old rule in a second step in the commit phase. All object deletions are synchronous after this patch. As a side effect, we save memory as we don't need rcu_head per rule anymore. Cc: Patrick McHardy <kaber@trash.net> Reported-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/net/netfilter/nf_tables.h | 4 ---- net/netfilter/nf_tables_api.c | 40 ++++++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f68e47d3e5e..e7e14ffe0f6a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -322,7 +322,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) * struct nft_rule - nf_tables rule * * @list: used internally - * @rcu_head: used internally for rcu * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data @@ -330,7 +329,6 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) */ struct nft_rule { struct list_head list; - struct rcu_head rcu_head; u64 handle:46, genmask:2, dlen:16; @@ -391,7 +389,6 @@ enum nft_chain_flags { * * @rules: list of rules in the chain * @list: used internally - * @rcu_head: used internally * @net: net namespace that this chain belongs to * @table: table that this chain belongs to * @handle: chain handle @@ -403,7 +400,6 @@ enum nft_chain_flags { struct nft_chain { struct list_head rules; struct list_head list; - struct rcu_head rcu_head; struct net *net; struct nft_table *table; u64 handle; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 113c469c7579..3a2e4800b415 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1008,10 +1008,8 @@ notify: return 0; } -static void nf_tables_rcu_chain_destroy(struct rcu_head *head) +static void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); - BUG_ON(chain->use > 0); if (chain->flags & NFT_BASE_CHAIN) { @@ -1059,7 +1057,9 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, family); /* Make sure all rule references are gone before this is released */ - call_rcu(&chain->rcu_head, nf_tables_rcu_chain_destroy); + synchronize_rcu(); + + nf_tables_chain_destroy(chain); return 0; } @@ -1531,9 +1531,8 @@ err: return err; } -static void nf_tables_rcu_rule_destroy(struct rcu_head *head) +static void nf_tables_rule_destroy(struct nft_rule *rule) { - struct nft_rule *rule = container_of(head, struct nft_rule, rcu_head); struct nft_expr *expr; /* @@ -1548,11 +1547,6 @@ static void nf_tables_rcu_rule_destroy(struct rcu_head *head) kfree(rule); } -static void nf_tables_rule_destroy(struct nft_rule *rule) -{ - call_rcu(&rule->rcu_head, nf_tables_rcu_rule_destroy); -} - #define NFT_RULE_MAXEXPRS 128 static struct nft_expr_info *info; @@ -1819,9 +1813,6 @@ static int nf_tables_commit(struct sk_buff *skb) synchronize_rcu(); list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete this rule from the dirty list */ - list_del(&rupd->list); - /* This rule was inactive in the past and just became active. * Clear the next bit of the genmask since its meaning has * changed, now it is the future. @@ -1832,6 +1823,7 @@ static int nf_tables_commit(struct sk_buff *skb) rupd->chain, rupd->rule, NFT_MSG_NEWRULE, 0, rupd->family); + list_del(&rupd->list); kfree(rupd); continue; } @@ -1841,7 +1833,15 @@ static int nf_tables_commit(struct sk_buff *skb) nf_tables_rule_notify(skb, rupd->nlh, rupd->table, rupd->chain, rupd->rule, NFT_MSG_DELRULE, 0, rupd->family); + } + + /* Make sure we don't see any packet traversing old rules */ + synchronize_rcu(); + + /* Now we can safely release unused old rules */ + list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { nf_tables_rule_destroy(rupd->rule); + list_del(&rupd->list); kfree(rupd); } @@ -1854,20 +1854,26 @@ static int nf_tables_abort(struct sk_buff *skb) struct nft_rule_trans *rupd, *tmp; list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* Delete all rules from the dirty list */ - list_del(&rupd->list); - if (!nft_rule_is_active_next(net, rupd->rule)) { nft_rule_clear(net, rupd->rule); + list_del(&rupd->list); kfree(rupd); continue; } /* This rule is inactive, get rid of it */ list_del_rcu(&rupd->rule->list); + } + + /* Make sure we don't see any packet accessing aborted rules */ + synchronize_rcu(); + + list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { nf_tables_rule_destroy(rupd->rule); + list_del(&rupd->list); kfree(rupd); } + return 0; } -- cgit From 63b5f152eb4a5bb79b9caf7ec37b4201d12f6e66 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven <geert@linux-m68k.org> Date: Wed, 5 Feb 2014 08:38:25 +0100 Subject: ipv4: Fix runtime WARNING in rtmsg_ifa() On m68k/ARAnyM: WARNING: CPU: 0 PID: 407 at net/ipv4/devinet.c:1599 0x316a99() Modules linked in: CPU: 0 PID: 407 Comm: ifconfig Not tainted 3.13.0-atari-09263-g0c71d68014d1 #1378 Stack from 10c4fdf0: 10c4fdf0 002ffabb 000243e8 00000000 008ced6c 00024416 00316a99 0000063f 00316a99 00000009 00000000 002501b4 00316a99 0000063f c0a86117 00000080 c0a86117 00ad0c90 00250a5a 00000014 00ad0c90 00000000 00000000 00000001 00b02dd0 00356594 00000000 00356594 c0a86117 eff6c9e4 008ced6c 00000002 008ced60 0024f9b4 00250b52 00ad0c90 00000000 00000000 00252390 00ad0c90 eff6c9e4 0000004f 00000000 00000000 eff6c9e4 8000e25c eff6c9e4 80001020 Call Trace: [<000243e8>] warn_slowpath_common+0x52/0x6c [<00024416>] warn_slowpath_null+0x14/0x1a [<002501b4>] rtmsg_ifa+0xdc/0xf0 [<00250a5a>] __inet_insert_ifa+0xd6/0x1c2 [<0024f9b4>] inet_abc_len+0x0/0x42 [<00250b52>] inet_insert_ifa+0xc/0x12 [<00252390>] devinet_ioctl+0x2ae/0x5d6 Adding some debugging code reveals that net_fill_ifaddr() fails in put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, preferred, valid)) nla_put complains: lib/nlattr.c:454: skb_tailroom(skb) = 12, nla_total_size(attrlen) = 20 Apparently commit 5c766d642bcaffd0c2a5b354db2068515b3846cf ("ipv4: introduce address lifetime") forgot to take into account the addition of struct ifa_cacheinfo in inet_nlmsg_size(). Hence add it, like is already done for ipv6. Suggested-by: Cong Wang <cwang@twopensource.com> Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> Signed-off-by: Cong Wang <cwang@twopensource.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/devinet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ac2dff3c2c1c..bdbf68bb2e2d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1443,7 +1443,8 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ - + nla_total_size(4); /* IFA_FLAGS */ + + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) -- cgit From 661dbf34129cecd20879cb7b1cbe936911da0f02 Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Date: Thu, 6 Feb 2014 08:30:10 +0100 Subject: net: sctp: fix initialization of local source address on accepted ipv6 sockets commit efe4208f47f907b86f528788da711e8ab9dea44d: 'ipv6: make lookups simpler and faster' broke initialization of local source address on accepted ipv6 sockets. Before the mentioned commit receive address was copied along with the contents of ipv6_pinfo in sctp_v6_create_accept_sk. Now when it is moved, it has to be copied separately. This also fixes lksctp's ipv6 regression in a sense that test_getname_v6, TC5 - 'getsockname on a connected server socket' now passes. Signed-off-by: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sctp/ipv6.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0f6259a6a932..2b1738ef9394 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -662,6 +662,8 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, */ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); + newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { -- cgit From 00fe11b3c67dc670fe6391d22f1fe64e7c99a8ec Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca <sd@queasysnail.net> Date: Thu, 6 Feb 2014 18:34:12 +0100 Subject: netpoll: fix netconsole IPv6 setup Currently, to make netconsole start over IPv6, the source address needs to be specified. Without a source address, netpoll_parse_options assumes we're setting up over IPv4 and the destination IPv6 address is rejected. Check if the IP version has been forced by a source address before checking for a version mismatch when parsing the destination address. Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> Acked-by: Cong Wang <cwang@twopensource.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/netpoll.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c03f3dec4763..a664f7829a6d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -948,6 +948,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; int ipv6; + bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) @@ -960,6 +961,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur++; if (*cur != '/') { + ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; @@ -1002,7 +1004,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; - else if (np->ipv6 != (bool)ipv6) + else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; -- cgit From ed98df3361f059db42786c830ea96e2d18b8d4db Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 6 Feb 2014 10:42:42 -0800 Subject: net: use __GFP_NORETRY for high order allocations sock_alloc_send_pskb() & sk_page_frag_refill() have a loop trying high order allocations to prepare skb with low number of fragments as this increases performance. Problem is that under memory pressure/fragmentation, this can trigger OOM while the intent was only to try the high order allocations, then fallback to order-0 allocations. We had various reports from unexpected regressions. According to David, setting __GFP_NORETRY should be fine, as the asynchronous compaction is still enabled, and this will prevent OOM from kicking as in : CFSClientEventm invoked oom-killer: gfp_mask=0x42d0, order=3, oom_adj=0, oom_score_adj=0, oom_score_badness=2 (enabled),memcg_scoring=disabled CFSClientEventm Call Trace: [<ffffffff8043766c>] dump_header+0xe1/0x23e [<ffffffff80437a02>] oom_kill_process+0x6a/0x323 [<ffffffff80438443>] out_of_memory+0x4b3/0x50d [<ffffffff8043a4a6>] __alloc_pages_may_oom+0xa2/0xc7 [<ffffffff80236f42>] __alloc_pages_nodemask+0x1002/0x17f0 [<ffffffff8024bd23>] alloc_pages_current+0x103/0x2b0 [<ffffffff8028567f>] sk_page_frag_refill+0x8f/0x160 [<ffffffff80295fa0>] tcp_sendmsg+0x560/0xee0 [<ffffffff802a5037>] inet_sendmsg+0x67/0x100 [<ffffffff80283c9c>] __sock_sendmsg_nosec+0x6c/0x90 [<ffffffff80283e85>] sock_sendmsg+0xc5/0xf0 [<ffffffff802847b6>] __sys_sendmsg+0x136/0x430 [<ffffffff80284ec8>] sys_sendmsg+0x88/0x110 [<ffffffff80711472>] system_call_fastpath+0x16/0x1b Out of Memory: Kill process 2856 (bash) score 9999 or sacrifice child Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/sock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 0c127dcdf6a8..5b6a9431b017 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1775,7 +1775,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, while (order) { if (npages >= 1 << order) { page = alloc_pages(sk->sk_allocation | - __GFP_COMP | __GFP_NOWARN, + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, order); if (page) goto fill_page; @@ -1845,7 +1847,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) gfp_t gfp = prio; if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; pfrag->page = alloc_pages(gfp, order); if (likely(pfrag->page)) { pfrag->offset = 0; -- cgit From dbe173079ab58a444e12dbebe96f5aec1e0bed1a Mon Sep 17 00:00:00 2001 From: Cong Wang <cwang@twopensource.com> Date: Thu, 6 Feb 2014 15:00:52 -0800 Subject: bridge: fix netconsole setup over bridge Commit 93d8bf9fb8f3 ("bridge: cleanup netpoll code") introduced a check in br_netpoll_enable(), but this check is incorrect for br_netpoll_setup(). This patch moves the code after the check into __br_netpoll_enable() and calls it in br_netpoll_setup(). For br_add_if(), the check is still needed. Fixes: 93d8bf9fb8f3 ("bridge: cleanup netpoll code") Cc: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: David S. Miller <davem@davemloft.net> Signed-off-by: Cong Wang <cwang@twopensource.com> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Acked-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Tested-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_device.c | 51 +++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e4401a531afb..d9a9b0fc1795 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -226,6 +226,33 @@ static void br_netpoll_cleanup(struct net_device *dev) br_netpoll_disable(p); } +static int __br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) +{ + struct netpoll *np; + int err; + + np = kzalloc(sizeof(*p->np), gfp); + if (!np) + return -ENOMEM; + + err = __netpoll_setup(np, p->dev, gfp); + if (err) { + kfree(np); + return err; + } + + p->np = np; + return err; +} + +int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) +{ + if (!p->br->dev->npinfo) + return 0; + + return __br_netpoll_enable(p, gfp); +} + static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, gfp_t gfp) { @@ -236,7 +263,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, list_for_each_entry(p, &br->port_list, list) { if (!p->dev) continue; - err = br_netpoll_enable(p, gfp); + err = __br_netpoll_enable(p, gfp); if (err) goto fail; } @@ -249,28 +276,6 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) -{ - struct netpoll *np; - int err; - - if (!p->br->dev->npinfo) - return 0; - - np = kzalloc(sizeof(*p->np), gfp); - if (!np) - return -ENOMEM; - - err = __netpoll_setup(np, p->dev, gfp); - if (err) { - kfree(np); - return err; - } - - p->np = np; - return err; -} - void br_netpoll_disable(struct net_bridge_port *p) { struct netpoll *np = p->np; -- cgit From 4a5ab4e224288403b0b4b6b8c4d339323150c312 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Thu, 6 Feb 2014 15:57:10 -0800 Subject: tcp: remove 1ms offset in srtt computation TCP pacing depends on an accurate srtt estimation. Current srtt estimation is using jiffie resolution, and has an artificial offset of at least 1 ms, which can produce slowdowns when FQ/pacing is used, especially in DC world, where typical rtt is below 1 ms. We are planning a switch to usec resolution for linux-3.15, but in the meantime, this patch removes the 1 ms offset. All we need is to have tp->srtt minimal value of 1 to differentiate the case of srtt being initialized or not, not 8. The problematic behavior was observed on a 40Gbit testbed, where 32 concurrent netperf were reaching 12Gbps of aggregate speed, instead of line speed. This patch also has the effect of reporting more accurate srtt and send rates to iproute2 ss command as in : $ ss -i dst cca2 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 0 10.244.129.1:56984 10.244.129.2:12865 cubic wscale:6,6 rto:200 rtt:0.25/0.25 ato:40 mss:1448 cwnd:10 send 463.4Mbps rcv_rtt:1 rcv_space:29200 tcp ESTAB 0 390960 10.244.129.1:60247 10.244.129.2:50204 cubic wscale:6,6 rto:200 rtt:0.875/0.75 mss:1448 cwnd:73 ssthresh:51 send 966.4Mbps unacked:73 retrans:0/121 rcv_space:29200 Reported-by: Vytautas Valancius <valas@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Neal Cardwell <ncardwell@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_input.c | 18 ++++++++++-------- net/ipv4/tcp_output.c | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 65cf90e063d5..227cba79fa6b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -671,6 +671,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt; /* RTT */ + u32 srtt = tp->srtt; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -688,11 +689,9 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev >> 2); /* similar update on mdev */ @@ -723,11 +722,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ + srtt = m << 3; /* take the measured time to be rtt */ tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } + tp->srtt = max(1U, srtt); } /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -746,8 +746,10 @@ static void tcp_update_pacing_rate(struct sock *sk) rate *= max(tp->snd_cwnd, tp->packets_out); - /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), - * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) + /* Correction for small srtt and scheduling constraints. + * For small rtt, consider noise is too high, and use + * the minimal value (srtt = 1 -> 125 us for HZ=1000) + * * We probably need usec resolution in the future. * Note: This also takes care of possible srtt=0 case, * when tcp_rtt_estimator() was not yet called. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 03d26b85eab8..10435b3b9d0f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1977,7 +1977,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; -- cgit From bd7fc645dabab423ef362186db2917f3919321d3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Fri, 7 Feb 2014 12:53:07 +0100 Subject: netfilter: nf_tables: do not allow NFT_SET_ELEM_INTERVAL_END flag and data This combination is not allowed since end interval elements cannot contain data. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Acked-by: Patrick McHardy <kaber@trash.net> --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3a2e4800b415..d0c790e3e495 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2741,6 +2741,9 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_DATA] == NULL && !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; + if (nla[NFTA_SET_ELEM_DATA] != NULL && + elem.flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; -- cgit From 2fb91ddbf8e1dcb207e1e2085473aeaeff975102 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Thu, 6 Feb 2014 16:15:39 +0100 Subject: netfilter: nft_rbtree: fix data handling of end interval elements This patch fixes several things which related to the handling of end interval elements: * Chain use underflow with intervals and map: If you add a rule using intervals+map that introduces a loop, the error path of the rbtree set decrements the chain refcount for each side of the interval, leading to a chain use counter underflow. * Don't copy the data part of the end interval element since, this area is uninitialized and this confuses the loop detection code. * Don't allocate room for the data part of end interval elements since this is unused. So, after this patch the idea is that end interval elements don't have a data part. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Acked-by: Patrick McHardy <kaber@trash.net> --- net/netfilter/nft_rbtree.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index ca0c1b231bfe..e21d69d13506 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -69,8 +69,10 @@ static void nft_rbtree_elem_destroy(const struct nft_set *set, struct nft_rbtree_elem *rbe) { nft_data_uninit(&rbe->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_uninit(rbe->data, set->dtype); + kfree(rbe); } @@ -108,7 +110,8 @@ static int nft_rbtree_insert(const struct nft_set *set, int err; size = sizeof(*rbe); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) size += sizeof(rbe->data[0]); rbe = kzalloc(size, GFP_KERNEL); @@ -117,7 +120,8 @@ static int nft_rbtree_insert(const struct nft_set *set, rbe->flags = elem->flags; nft_data_copy(&rbe->key, &elem->key); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(rbe->data, &elem->data); err = __nft_rbtree_insert(set, rbe); @@ -153,7 +157,8 @@ static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) parent = parent->rb_right; else { elem->cookie = rbe; - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(&elem->data, rbe->data); elem->flags = rbe->flags; return 0; @@ -177,7 +182,8 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_data_copy(&elem.key, &rbe->key); - if (set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP && + !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) nft_data_copy(&elem.data, rbe->data); elem.flags = rbe->flags; -- cgit From 62f9c8b40d2db915f89a6aa47395412fb29f1cfc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Fri, 7 Feb 2014 14:45:01 +0100 Subject: netfilter: nf_tables: fix loop checking with end interval elements Fix access to uninitialized data for end interval elements. The element data part is uninitialized in interval end elements. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d0c790e3e495..adce01e8bb57 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2998,6 +2998,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, const struct nft_set_elem *elem) { + if (elem->flags & NFT_SET_ELEM_INTERVAL_END) + return 0; + switch (elem->data.verdict) { case NFT_JUMP: case NFT_GOTO: -- cgit From 6d8c00d58e9e484fdc41aaaf62e5d8364efe375a Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Thu, 9 Jan 2014 18:42:42 +0000 Subject: netfilter: nf_tables: unininline nft_trace_packet() It makes no sense to inline a rarely used function meant for debugging only that is called a total of five times in the main evaluation loop. Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_tables_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 0d879fcb8763..90998a6ff8b9 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -103,9 +103,9 @@ static struct nf_loginfo trace_loginfo = { }, }; -static inline void nft_trace_packet(const struct nft_pktinfo *pkt, - const struct nft_chain *chain, - int rulenum, enum nft_trace type) +static void nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) { struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); -- cgit From fd944bded5ce32247941ce54eb8f65646549886b Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 19:57:33 +0530 Subject: net: Mark function as static in 9p/client.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark function as static in net/9p/client.c because it is not used outside this file. This eliminates the following warning in net/9p/client.c: net/9p/client.c:207:18: warning: no previous prototype for ‘p9_fcall_alloc’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/9p/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index a5e4d2dcb03e..9186550d77a6 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -204,7 +204,7 @@ free_and_return: return ret; } -struct p9_fcall *p9_fcall_alloc(int alloc_msize) +static struct p9_fcall *p9_fcall_alloc(int alloc_msize) { struct p9_fcall *fc; fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); -- cgit From 8203274e1541392e8a85d4bcbcda55d62fe62469 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 19:59:04 +0530 Subject: net: Include appropriate header file in caif/caif_dev.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file net/caif/caif_dev.h in caif/caif_dev.c because it has prototype declarations of function defined in caif/caif_dev.c. This eliminates the following file in caif/caif_dev.c: net/caif/caif_dev.c:303:6: warning: no previous prototype for ‘caif_enroll_dev’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/caif/caif_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 4dca159435cf..edbca468fa73 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -22,6 +22,7 @@ #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> +#include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> -- cgit From 02fe72c9edc58d6b5d85c08327d7ef05cd3e97dc Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 20:02:16 +0530 Subject: net: Include appropriate header file in caif/cfsrvl.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file net/caif/caif_dev.h in caif/cfsrvl.c because it has prototype declaration of functions defined in caif/cfsrvl.c. This eliminates the following warning in caif/cfsrvl.c: net/caif/cfsrvl.c:198:6: warning: no previous prototype for ‘caif_free_client’ [-Wmissing-prototypes] net/caif/cfsrvl.c:208:6: warning: no previous prototype for ‘caif_client_register_refcnt’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/caif/cfsrvl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index 353f793d1b3b..a6e115463052 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -15,6 +15,7 @@ #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> +#include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 -- cgit From 0a59f3a9fd7e2801a445682465ea0522ea497183 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 20:26:25 +0530 Subject: net: Mark functions as static in core/dev.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark functions as static in core/dev.c because they are not used outside this file. This eliminates the following warning in core/dev.c: net/core/dev.c:2806:5: warning: no previous prototype for ‘__dev_queue_xmit’ [-Wmissing-prototypes] net/core/dev.c:4640:5: warning: no previous prototype for ‘netdev_adjacent_sysfs_add’ [-Wmissing-prototypes] net/core/dev.c:4650:6: warning: no previous prototype for ‘netdev_adjacent_sysfs_del’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Reviewed-by: Veaceslav Falico <vfalico@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 3721db716350..4ad1b78c9c77 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2803,7 +2803,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) +static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -4637,7 +4637,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); -int netdev_adjacent_sysfs_add(struct net_device *dev, +static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { @@ -4647,7 +4647,7 @@ int netdev_adjacent_sysfs_add(struct net_device *dev, return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } -void netdev_adjacent_sysfs_del(struct net_device *dev, +static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { -- cgit From f56b8bf6e445b22bb6207dbcf30b2c7f5ddfdf96 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:20:09 +0530 Subject: net: Move prototype declaration to appropriate header file from decnet/af_decnet.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of functions to header file include/net/dn_route.h from net/decnet/af_decnet.c because it is used by more than one file. This eliminates the following warning in net/decnet/dn_route.c: net/decnet/dn_route.c:629:5: warning: no previous prototype for ‘dn_route_rcv’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/dn_route.h | 2 ++ net/decnet/af_decnet.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/dn_route.h b/include/net/dn_route.h index b409ad6b8d7a..55df9939bca2 100644 --- a/include/net/dn_route.h +++ b/include/net/dn_route.h @@ -20,6 +20,8 @@ int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *, struct sock *sk, int flags); int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb); void dn_rt_cache_flush(int delay); +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); /* Masks for flags field */ #define DN_RT_F_PID 0x07 /* Mask for packet type */ diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 2954dcbca832..24d9193240e3 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2104,8 +2104,6 @@ static struct notifier_block dn_dev_notifier = { .notifier_call = dn_device_event, }; -extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); - static struct packet_type dn_dix_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DNA_RT), .func = dn_route_rcv, -- cgit From ab3301bd96c024ec9c2e1c35c90327dc5c8012a4 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:22:53 +0530 Subject: net: Move prototype declaration to header file include/net/dn.h from net/decnet/af_decnet.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of functions to header file include/net/dn.h from net/decnet/af_decnet.c because they are used by more than one file. This eliminates the following warning in net/decnet/af_decnet.c: net/decnet/sysctl_net_decnet.c:354:6: warning: no previous prototype for ‘dn_register_sysctl’ [-Wmissing-prototypes] net/decnet/sysctl_net_decnet.c:359:6: warning: no previous prototype for ‘dn_unregister_sysctl’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/dn.h | 2 ++ net/decnet/af_decnet.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/dn.h b/include/net/dn.h index ccc15588d108..913b73d239f5 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -200,6 +200,8 @@ static inline void dn_sk_ports_copy(struct flowidn *fld, struct dn_scp *scp) } unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu); +void dn_register_sysctl(void); +void dn_unregister_sysctl(void); #define DN_MENUVER_ACC 0x01 #define DN_MENUVER_USR 0x02 diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 24d9193240e3..4c04848953bd 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2351,9 +2351,6 @@ static const struct proto_ops dn_proto_ops = { .sendpage = sock_no_sendpage, }; -void dn_register_sysctl(void); -void dn_unregister_sysctl(void); - MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); MODULE_AUTHOR("Linux DECnet Project Team"); MODULE_LICENSE("GPL"); -- cgit From 493cc5e5ba450356560cc12a8f71ff6a8574c91c Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:24:33 +0530 Subject: net: Move prototype declaration to include/net/ipx.h from net/ipx/ipx_route.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype definition of function to header file include/net/ipx.h from net/ipx/ipx_route.c because they are used by more than one file. This eliminates the following warning from net/ipx/af_ipx.c: net/ipx/af_ipx.c:193:23: warning: no previous prototype for ‘ipxitf_find_using_net’ [-Wmissing-prototypes] net/ipx/af_ipx.c:577:5: warning: no previous prototype for ‘ipxitf_send’ [-Wmissing-prototypes] net/ipx/af_ipx.c:1219:8: warning: no previous prototype for ‘ipx_cksum’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/ipx.h | 3 +++ net/ipx/ipx_route.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/ipx.h b/include/net/ipx.h index 9e9e35465baf..75466acdce21 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -140,6 +140,9 @@ static __inline__ void ipxitf_hold(struct ipx_interface *intrfc) } void ipxitf_down(struct ipx_interface *intrfc); +struct ipx_interface *ipxitf_find_using_net(__be32 net); +int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node); +__be16 ipx_cksum(struct ipxhdr *packet, int length); static __inline__ void ipxitf_put(struct ipx_interface *intrfc) { diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index 30f4519b092f..c1f03185c5e1 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -20,15 +20,11 @@ DEFINE_RWLOCK(ipx_routes_lock); extern struct ipx_interface *ipx_internal_net; -extern __be16 ipx_cksum(struct ipxhdr *packet, int length); extern struct ipx_interface *ipxitf_find_using_net(__be32 net); extern int ipxitf_demux_socket(struct ipx_interface *intrfc, struct sk_buff *skb, int copy); extern int ipxitf_demux_socket(struct ipx_interface *intrfc, struct sk_buff *skb, int copy); -extern int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, - char *node); -extern struct ipx_interface *ipxitf_find_using_net(__be32 net); struct ipx_route *ipxrtr_lookup(__be32 net) { -- cgit From 578efbc19f468686ad7eb1e505bab56d04e92784 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:26:32 +0530 Subject: net: Move prototype declaration to header file include/net/ipx.h from net/ipx/af_ipx.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of functions to header file include/net/ipx.h from net/ipx/af_ipx.c because they are used by more than one file. This eliminates the following warning in net/ipx/ipx_route.c:33:19: warning: no previous prototype for ‘ipxrtr_lookup’ [-Wmissing-prototypes] net/ipx/ipx_route.c:52:5: warning: no previous prototype for ‘ipxrtr_add_route’ [-Wmissing-prototypes] net/ipx/ipx_route.c:94:6: warning: no previous prototype for ‘ipxrtr_del_routes’ [-Wmissing-prototypes] net/ipx/ipx_route.c:149:5: warning: no previous prototype for ‘ipxrtr_route_skb’ [-Wmissing-prototypes] net/ipx/ipx_route.c:171:5: warning: no previous prototype for ‘ipxrtr_route_packet’ [-Wmissing-prototypes] net/ipx/ipx_route.c:261:5: warning: no previous prototype for ‘ipxrtr_ioctl’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/ipx.h | 8 ++++++++ net/ipx/af_ipx.c | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ipx.h b/include/net/ipx.h index 75466acdce21..0143180fecc9 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -143,6 +143,14 @@ void ipxitf_down(struct ipx_interface *intrfc); struct ipx_interface *ipxitf_find_using_net(__be32 net); int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node); __be16 ipx_cksum(struct ipxhdr *packet, int length); +int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, + unsigned char *node); +void ipxrtr_del_routes(struct ipx_interface *intrfc); +int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, + struct iovec *iov, size_t len, int noblock); +int ipxrtr_route_skb(struct sk_buff *skb); +struct ipx_route *ipxrtr_lookup(__be32 net); +int ipxrtr_ioctl(unsigned int cmd, void __user *arg); static __inline__ void ipxitf_put(struct ipx_interface *intrfc) { diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 994e28bfb32e..e5a00a9b52f3 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -84,15 +84,6 @@ DEFINE_SPINLOCK(ipx_interfaces_lock); struct ipx_interface *ipx_primary_net; struct ipx_interface *ipx_internal_net; -extern int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, - unsigned char *node); -extern void ipxrtr_del_routes(struct ipx_interface *intrfc); -extern int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, - struct iovec *iov, size_t len, int noblock); -extern int ipxrtr_route_skb(struct sk_buff *skb); -extern struct ipx_route *ipxrtr_lookup(__be32 net); -extern int ipxrtr_ioctl(unsigned int cmd, void __user *arg); - struct ipx_interface *ipx_interfaces_head(void) { struct ipx_interface *rc = NULL; -- cgit From 7780d8ae4ae1960ef7c0570de0a1ecd7b60c8152 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:27:41 +0530 Subject: net: Move prototype declaration to header file include/net/datalink.h from net/ipx/af_ipx.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declarations of function to header file include/net/datalink.h from net/ipx/af_ipx.c because they are used by more than one file. This eliminates the following warning in net/ipx/pe2.c: net/ipx/pe2.c:20:24: warning: no previous prototype for ‘make_EII_client’ [-Wmissing-prototypes] net/ipx/pe2.c:32:6: warning: no previous prototype for ‘destroy_EII_client’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/datalink.h | 2 ++ net/ipx/af_ipx.c | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/datalink.h b/include/net/datalink.h index deb7ca75db48..93cb18f729b5 100644 --- a/include/net/datalink.h +++ b/include/net/datalink.h @@ -15,4 +15,6 @@ struct datalink_proto { struct list_head node; }; +struct datalink_proto *make_EII_client(void); +void destroy_EII_client(struct datalink_proto *dl); #endif diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index e5a00a9b52f3..224d05856b85 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -52,6 +52,7 @@ #include <net/p8022.h> #include <net/psnap.h> #include <net/sock.h> +#include <net/datalink.h> #include <net/tcp_states.h> #include <asm/uaccess.h> @@ -1977,9 +1978,6 @@ static struct notifier_block ipx_dev_notifier = { .notifier_call = ipxitf_device_event, }; -extern struct datalink_proto *make_EII_client(void); -extern void destroy_EII_client(struct datalink_proto *); - static const unsigned char ipx_8022_type = 0xE0; static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; static const char ipx_EII_err_msg[] __initconst = -- cgit From 535d3ae9c808e6a5248f0524dbc7a9e997cf3288 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:29:14 +0530 Subject: net: Move prototype declaration to header file include/net/net_namespace.h from net/ipx/af_ipx.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of function to header file include/net/net_namespace.h from net/ipx/af_ipx.c because they are used by more than one file. This eliminates the following warning in net/ipx/sysctl_net_ipx.c: net/ipx/sysctl_net_ipx.c:33:6: warning: no previous prototype for ‘ipx_register_sysctl’ [-Wmissing-prototypes] net/ipx/sysctl_net_ipx.c:38:6: warning: no previous prototype for ‘ipx_unregister_sysctl’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/net_namespace.h | 8 ++++++++ net/ipx/af_ipx.c | 9 +-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index da68c9a90ac5..991dcd94cbbf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -162,6 +162,14 @@ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); struct net *get_net_ns_by_fd(int pid); +#ifdef CONFIG_SYSCTL +void ipx_register_sysctl(void); +void ipx_unregister_sysctl(void); +#else +#define ipx_register_sysctl() +#define ipx_unregister_sysctl() +#endif + #ifdef CONFIG_NET_NS void __put_net(struct net *net); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 224d05856b85..00b2a6d1c009 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -54,17 +54,10 @@ #include <net/sock.h> #include <net/datalink.h> #include <net/tcp_states.h> +#include <net/net_namespace.h> #include <asm/uaccess.h> -#ifdef CONFIG_SYSCTL -extern void ipx_register_sysctl(void); -extern void ipx_unregister_sysctl(void); -#else -#define ipx_register_sysctl() -#define ipx_unregister_sysctl() -#endif - /* Configuration Variables */ static unsigned char ipxcfg_max_hops = 16; static char ipxcfg_auto_select_primary; -- cgit From bd76ed36bac50b5402fd09c3fc2f368ef324ffe2 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:31:42 +0530 Subject: net: Include appropriate header file in netfilter/nft_lookup.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file net/netfilter/nf_tables_core.h in net/netfilter/nft_lookup.c because it has prototype declaration of functions defined in net/netfilter/nft_lookup.c. This eliminates the following warning in net/netfilter/nft_lookup.c: net/netfilter/nft_lookup.c:133:12: warning: no previous prototype for ‘nft_lookup_module_init’ [-Wmissing-prototypes] net/netfilter/nft_lookup.c:138:6: warning: no previous prototype for ‘nft_lookup_module_exit’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/netfilter/nft_lookup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 8a6116b75b5a..bb4ef4cccb6e 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -16,6 +16,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> struct nft_lookup { struct nft_set *set; -- cgit From e1d83ee673eff5407255ba9e884312219f6832d7 Mon Sep 17 00:00:00 2001 From: Rashika Kheria <rashika.kheria@gmail.com> Date: Sun, 9 Feb 2014 22:33:03 +0530 Subject: net: Mark functions as static in net/sunrpc/svc_xprt.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark functions as static in net/sunrpc/svc_xprt.c because they are not used outside this file. This eliminates the following warning in net/sunrpc/svc_xprt.c: net/sunrpc/svc_xprt.c:574:5: warning: no previous prototype for ‘svc_alloc_arg’ [-Wmissing-prototypes] net/sunrpc/svc_xprt.c:615:18: warning: no previous prototype for ‘svc_get_next_xprt’ [-Wmissing-prototypes] net/sunrpc/svc_xprt.c:694:6: warning: no previous prototype for ‘svc_add_new_temp_xprt’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sunrpc/svc_xprt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 80a6640f329b..06c6ff0cb911 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -571,7 +571,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -int svc_alloc_arg(struct svc_rqst *rqstp) +static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg; @@ -612,7 +612,7 @@ int svc_alloc_arg(struct svc_rqst *rqstp) return 0; } -struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; @@ -691,7 +691,7 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) return xprt; } -void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) +static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) { spin_lock_bh(&serv->sv_lock); set_bit(XPT_TEMP, &newxpt->xpt_flags); -- cgit From d94c1f92bbf2c42c5febd68bbea51fffeac90834 Mon Sep 17 00:00:00 2001 From: FX Le Bail <fx.lebail@yahoo.com> Date: Fri, 7 Feb 2014 11:22:37 +0100 Subject: ipv6: icmp6_send: fix Oops when pinging a not set up IPv6 peer on a sit tunnel The patch 446fab59333dea91e54688f033dd8d788d0486fb ("ipv6: enable anycast addresses as source addresses in ICMPv6 error messages") causes an Oops when pinging a not set up IPv6 peer on a sit tunnel. The problem is that ipv6_anycast_destination() uses unconditionally skb_dst(skb), which is NULL in this case. The solution is to use instead the ipv6_chk_acast_addr_src() function. Here are the steps to reproduce it: modprobe sit ip link add sit1 type sit remote 10.16.0.121 local 10.16.0.249 ip l s sit1 up ip -6 a a dev sit1 2001:1234::123 remote 2001:1234::121 ping6 2001:1234::121 Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Tested-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: Francois-Xavier Le Bail <fx.lebail@yahoo.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f81f59686f21..f2610e157660 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -414,7 +414,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) addr_type = ipv6_addr_type(&hdr->daddr); if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || - ipv6_anycast_destination(skb)) + ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* -- cgit From 310f6fd0ca9de8c1fc6399af494855e15157aa35 Mon Sep 17 00:00:00 2001 From: Christian Engelmayer <cengelma@gmx.at> Date: Fri, 7 Feb 2014 22:58:38 +0100 Subject: 6lowpan: Remove unused pointer in lowpan_header_create() Commit 8df8c56a (6lowpan: Moving generic compression code into 6lowpan_iphc.c) left pointer 'hdr' unused - remove it. Detected by Coverity: CID 1164868. Signed-off-by: Christian Engelmayer <cengelma@gmx.at> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ieee802154/6lowpan.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 48b25c0af4d0..8bfb40153fe7 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -106,7 +106,6 @@ static int lowpan_header_create(struct sk_buff *skb, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { - struct ipv6hdr *hdr; const u8 *saddr = _saddr; const u8 *daddr = _daddr; struct ieee802154_addr sa, da; @@ -117,8 +116,6 @@ static int lowpan_header_create(struct sk_buff *skb, if (type != ETH_P_IPV6) return 0; - hdr = ipv6_hdr(skb); - if (!saddr) saddr = dev->dev_addr; -- cgit From 946c032e5a53992ea45e062ecb08670ba39b99e3 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski <maze@google.com> Date: Fri, 7 Feb 2014 16:23:48 -0800 Subject: net: fix 'ip rule' iif/oif device rename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ip rules with iif/oif references do not update: (detach/attach) across interface renames. Signed-off-by: Maciej Żenczykowski <maze@google.com> CC: Willem de Bruijn <willemb@google.com> CC: Eric Dumazet <edumazet@google.com> CC: Chris Davis <chrismd@google.com> CC: Carlo Contavalli <ccontavalli@google.com> Google-Bug-Id: 12936021 Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/fib_rules.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f409e0bd35c0..185c341fafbd 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -745,6 +745,13 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, attach_rules(&ops->rules_list, dev); break; + case NETDEV_CHANGENAME: + list_for_each_entry(ops, &net->rules_ops, list) { + detach_rules(&ops->rules_list, dev); + attach_rules(&ops->rules_list, dev); + } + break; + case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); -- cgit From b10bd54c05a6c3c96549f4642d7de23c99b9853b Mon Sep 17 00:00:00 2001 From: Jesper Juhl <jj@chaosbits.net> Date: Sun, 9 Feb 2014 23:30:32 +0100 Subject: tcp: correct code comment stating 3 min timeout for FIN_WAIT2, we only do 1 min As far as I can tell we have used a default of 60 seconds for FIN_WAIT2 timeout for ages (since 2.x times??). In any case, the timeout these days is 60 seconds, so the 3 min comment is wrong (and cost me a few minutes of my life when I was debugging a FIN_WAIT2 related problem in a userspace application and checked the kernel source for details). Signed-off-by: Jesper Juhl <jj@chaosbits.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4475b3bb494d..9f3a2db9109e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2229,7 +2229,7 @@ adjudge_to_death: /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. -- cgit From a699d65ec4ff82245d2c4fdfb8ed1d64776d756e Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@primarydata.com> Date: Mon, 10 Feb 2014 16:28:52 -0500 Subject: SUNRPC: Don't create a gss auth cache unless rpc.gssd is running An infinite loop is caused when nfs4_establish_lease() fails with -EACCES. This causes nfs4_handle_reclaim_lease_error() to sleep a bit and resets the NFS4CLNT_LEASE_EXPIRED bit. This in turn causes nfs4_state_manager() to try and reestablished the lease, again, again, again... The problem is a valid RPCSEC_GSS client is being created when rpc.gssd is not running. Link: http://lkml.kernel.org/r/1392066375-16502-1-git-send-email-steved@redhat.com Fixes: 0ea9de0ea6a4 (sunrpc: turn warn_gssd() log message into a dprintk()) Reported-by: Steve Dickson <steved@redhat.com> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/auth_gss/auth_gss.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 6c0513a7f992..44a61e8fda6f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -991,6 +991,8 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); if (gss_auth->service == 0) goto err_put_mech; + if (!gssd_running(gss_auth->net)) + goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; -- cgit From a5642ab4744bc8c5a8c7ce7c6e30c01bd6bbc691 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:18 +0900 Subject: bridge: Fix the way to find old local fdb entries in br_fdb_changeaddr br_fdb_changeaddr() assumes that there is at most one local entry per port per vlan. It used to be true, but since commit 36fd2b63e3b4 ("bridge: allow creating/deleting fdb entries via netlink"), it has not been so. Therefore, the function might fail to search a correct previous address to be deleted and delete an arbitrary local entry if user has added local entries manually. Example of problematic case: ip link set eth0 address ee:ff:12:34:56:78 brctl addif br0 eth0 bridge fdb add 12:34:56:78:90:ab dev eth0 master ip link set eth0 address aa:bb:cc:dd:ee:ff Then, the address 12:34:56:78:90:ab might be deleted instead of ee:ff:12:34:56:78, the original mac address of eth0. Address this issue by introducing a new flag, added_by_user, to struct net_bridge_fdb_entry. Note that br_fdb_delete_by_port() has to set added_by_user to 0 in cases like: ip link set eth0 address 12:34:56:78:90:ab ip link set eth1 address aa:bb:cc:dd:ee:ff brctl addif br0 eth0 bridge fdb add aa:bb:cc:dd:ee:ff dev eth0 master brctl addif br0 eth1 brctl delif br0 eth0 In this case, kernel should delete the user-added entry aa:bb:cc:dd:ee:ff, but it also should have been added by "brctl addif br0 eth1" originally, so we don't delete it and treat it a new kernel-created entry. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 16 ++++++++++++---- net/bridge/br_input.c | 4 ++-- net/bridge/br_private.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c5f5a4a933f4..ce5411995a63 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -104,7 +104,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) struct net_bridge_fdb_entry *f; f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); - if (f->dst == p && f->is_local) { + if (f->dst == p && f->is_local && !f->added_by_user) { /* maybe another port has same hw addr? */ struct net_bridge_port *op; u16 vid = f->vlan_id; @@ -247,6 +247,7 @@ void br_fdb_delete_by_port(struct net_bridge *br, ether_addr_equal(op->dev->dev_addr, f->addr.addr)) { f->dst = op; + f->added_by_user = 0; goto skip_delete; } } @@ -397,6 +398,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, fdb->vlan_id = vid; fdb->is_local = 0; fdb->is_static = 0; + fdb->added_by_user = 0; fdb->updated = fdb->used = jiffies; hlist_add_head_rcu(&fdb->hlist, head); } @@ -447,7 +449,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, bool added_by_user) { struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; struct net_bridge_fdb_entry *fdb; @@ -473,13 +475,18 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, /* fastpath: update of existing entry */ fdb->dst = source; fdb->updated = jiffies; + if (unlikely(added_by_user)) + fdb->added_by_user = 1; } } else { spin_lock(&br->hash_lock); if (likely(!fdb_find(head, addr, vid))) { fdb = fdb_create(head, source, addr, vid); - if (fdb) + if (fdb) { + if (unlikely(added_by_user)) + fdb->added_by_user = 1; fdb_notify(br, fdb, RTM_NEWNEIGH); + } } /* else we lose race and someone else inserts * it first, don't bother updating @@ -647,6 +654,7 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, modified = true; } + fdb->added_by_user = 1; fdb->used = jiffies; if (modified) { @@ -664,7 +672,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p, if (ndm->ndm_flags & NTF_USE) { rcu_read_lock(); - br_fdb_update(p->br, p, addr, vid); + br_fdb_update(p->br, p, addr, vid, true); rcu_read_unlock(); } else { spin_lock_bh(&p->br->hash_lock); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index bf8dc7d308d6..28d544627422 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -77,7 +77,7 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && br_multicast_rcv(br, p, skb, vid)) @@ -148,7 +148,7 @@ static int br_handle_local_finish(struct sk_buff *skb) br_vlan_get_tag(skb, &vid); if (p->flags & BR_LEARNING) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); return 0; /* process further */ } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index fcd12333c59b..939a59e15036 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -104,6 +104,7 @@ struct net_bridge_fdb_entry mac_addr addr; unsigned char is_local; unsigned char is_static; + unsigned char added_by_user; __u16 vlan_id; }; @@ -383,7 +384,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, bool added_by_user); int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vid); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], -- cgit From 2836882fe07718fe3263745b1aa07284ec71871c Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:19 +0900 Subject: bridge: Fix the way to insert new local fdb entries in br_fdb_changeaddr Since commit bc9a25d21ef8 ("bridge: Add vlan support for local fdb entries"), br_fdb_changeaddr() has inserted a new local fdb entry only if it can find old one. But if we have two ports where they have the same address or user has deleted a local entry, there will be no entry for one of the ports. Example of problematic case: ip link set eth0 address aa:bb:cc:dd:ee:ff ip link set eth1 address aa:bb:cc:dd:ee:ff brctl addif br0 eth0 brctl addif br0 eth1 # eth1 will not have a local entry due to dup. ip link set eth1 address 12:34:56:78:90:ab Then, the new entry for the address 12:34:56:78:90:ab will not be created, and the bridge device will not be able to communicate. Insert new entries regardless of whether we can find old entries or not. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index ce5411995a63..96ab1d1748d0 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -92,8 +92,10 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge *br = p->br; - bool no_vlan = (nbp_get_vlan_info(p) == NULL) ? true : false; + struct net_port_vlans *pv = nbp_get_vlan_info(p); + bool no_vlan = !pv; int i; + u16 vid; spin_lock_bh(&br->hash_lock); @@ -114,28 +116,37 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) f->addr.addr) && nbp_vlan_find(op, vid)) { f->dst = op; - goto insert; + goto skip_delete; } } /* delete old one */ fdb_delete(br, f); -insert: - /* insert new address, may fail if invalid - * address or dup. - */ - fdb_insert(br, p, newaddr, vid); - +skip_delete: /* if this port has no vlan information * configured, we can safely be done at * this point. */ if (no_vlan) - goto done; + goto insert; } } } +insert: + /* insert new address, may fail if invalid address or dup. */ + fdb_insert(br, p, newaddr, 0); + + if (no_vlan) + goto done; + + /* Now add entries for every VLAN configured on the port. + * This function runs under RTNL so the bitmap will not change + * from under us. + */ + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) + fdb_insert(br, p, newaddr, vid); + done: spin_unlock_bh(&br->hash_lock); } -- cgit From a3ebb7efe7033d903d86c9d664f07a9cc21747b3 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:20 +0900 Subject: bridge: Fix the way to find old local fdb entries in br_fdb_change_mac_address We have been always failed to delete the old entry at br_fdb_change_mac_address() because br_set_mac_address() updates dev->dev_addr before calling br_fdb_change_mac_address() and br_fdb_change_mac_address() uses dev->dev_addr to find the old entry. That update of dev_addr is completely unnecessary because the same work is done in br_stp_change_bridge_id() which is called right away after calling br_fdb_change_mac_address(). Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index d9a9b0fc1795..6f5cbd1a2f38 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -187,8 +187,8 @@ static int br_set_mac_address(struct net_device *dev, void *p) spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); br_fdb_change_mac_address(br, addr->sa_data); + /* Mac address will be changed in br_stp_change_bridge_id(). */ br_stp_change_bridge_id(br, addr->sa_data); } spin_unlock_bh(&br->lock); -- cgit From a4b816d8ba1c1917842dc3de97cbf8ef116e043e Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:21 +0900 Subject: bridge: Change local fdb entries whenever mac address of bridge device changes Vlan code may need fdb change when changing mac address of bridge device even if it is caused by the mac address changing of a bridge port. Example configuration: ip link set eth0 address 12:34:56:78:90:ab ip link set eth1 address aa:bb:cc:dd:ee:ff brctl addif br0 eth0 brctl addif br0 eth1 # br0 will have mac address 12:34:56:78:90:ab bridge vlan add dev br0 vid 10 self bridge vlan add dev eth0 vid 10 We will have fdb entry such that f->dst == NULL, f->vlan_id == 10 and f->addr == 12:34:56:78:90:ab at this time. Next, change the mac address of eth0 to greater value. ip link set eth0 address ee:ff:12:34:56:78 Then, mac address of br0 will be recalculated and set to aa:bb:cc:dd:ee:ff. However, an entry aa:bb:cc:dd:ee:ff will not be created and we will be not able to communicate using br0 on vlan 10. Address this issue by deleting and adding local entries whenever changing the mac address of the bridge device. If there already exists an entry that has the same address, for example, in case that br_fdb_changeaddr() has already inserted it, br_fdb_change_mac_address() will simply fail to insert it and no duplicated entry will be made, as it was. This approach also needs br_add_if() to call br_fdb_insert() before br_stp_recalculate_bridge_id() so that we don't create an entry whose dst == NULL in this function to preserve previous behavior. Note that this is a slight change in behavior where the bridge device can receive the traffic to the new address before calling br_stp_recalculate_bridge_id() in br_add_if(). However, it is not a problem because we have already the address on the new port and such a way to insert new one before recalculating bridge id is taken in br_device_event() as well. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_device.c | 1 - net/bridge/br_if.c | 6 +++--- net/bridge/br_stp_if.c | 2 ++ 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 6f5cbd1a2f38..63f0455c0bc3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -187,7 +187,6 @@ static int br_set_mac_address(struct net_device *dev, void *p) spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { - br_fdb_change_mac_address(br, addr->sa_data); /* Mac address will be changed in br_stp_change_bridge_id(). */ br_stp_change_bridge_id(br, addr->sa_data); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index cffe1d666ba1..54d207d3a31c 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -389,6 +389,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (br->dev->needed_headroom < dev->needed_headroom) br->dev->needed_headroom = dev->needed_headroom; + if (br_fdb_insert(br, p, dev->dev_addr, 0)) + netdev_err(dev, "failed insert local address bridge forwarding table\n"); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); @@ -404,9 +407,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev_set_mtu(br->dev, br_min_mtu(br)); - if (br_fdb_insert(br, p, dev->dev_addr, 0)) - netdev_err(dev, "failed insert local address bridge forwarding table\n"); - kobject_uevent(&p->kobj, KOBJ_ADD); return 0; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 656a6f3e40de..189ba1e7d851 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -194,6 +194,8 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) wasroot = br_is_root_bridge(br); + br_fdb_change_mac_address(br, addr); + memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); memcpy(br->dev->dev_addr, addr, ETH_ALEN); -- cgit From 2b292fb4a57dc233e298a84196d33be0bc3828e4 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:22 +0900 Subject: bridge: Fix the way to check if a local fdb entry can be deleted We should take into account the followings when deleting a local fdb entry. - nbp_vlan_find() can be used only when vid != 0 to check if an entry is deletable, because a fdb entry with vid 0 can exist at any time while nbp_vlan_find() always return false with vid 0. Example of problematic case: ip link set eth0 address 12:34:56:78:90:ab ip link set eth1 address 12:34:56:78:90:ab brctl addif br0 eth0 brctl addif br0 eth1 ip link set eth0 address aa:bb:cc:dd:ee:ff Then, the fdb entry 12:34:56:78:90:ab will be deleted even though the bridge port eth1 still has that address. - The port to which the bridge device is attached might needs a local entry if its mac address is set manually. Example of problematic case: ip link set eth0 address 12:34:56:78:90:ab brctl addif br0 eth0 ip link set br0 address 12:34:56:78:90:ab ip link set eth0 address aa:bb:cc:dd:ee:ff Then, the fdb still must have the entry 12:34:56:78:90:ab, but it will be deleted. We can use br->dev->addr_assign_type to check if the address is manually set or not, but I propose another approach. Since we delete and insert local entries whenever changing mac address of the bridge device, we can change dst of the entry to NULL regardless of addr_assign_type when deleting an entry associated with a certain port, and if it is found to be unnecessary later, then delete it. That is, if changing mac address of a port, the entry might be changed to its dst being NULL first, but is eventually deleted when recalculating and changing bridge id. This approach is especially useful when we want to share the code with deleting vlan in which the bridge device might want such an entry regardless of addr_assign_type, and makes things easy because we don't have to consider if mac address of the bridge device will be changed or not at the time we delete a local entry of a port, which means fdb code will not be bothered even if the bridge id calculating logic is changed in the future. Also, this change reduces inconsistent state, where frames whose dst is the mac address of the bridge, can't reach the bridge because of premature fdb entry deletion. This change reduces the possibility that the bridge device replies unreachable mac address to arp requests, which could occur during the short window between calling del_nbp() and br_stp_recalculate_bridge_id() in br_del_if(). This will effective after br_fdb_delete_by_port() starts to use the same code by following patch. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 10 +++++++++- net/bridge/br_private.h | 6 ++++++ net/bridge/br_vlan.c | 19 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 96ab1d1748d0..b4005f5b28f4 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -114,12 +114,20 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) if (op != p && ether_addr_equal(op->dev->dev_addr, f->addr.addr) && - nbp_vlan_find(op, vid)) { + (!vid || nbp_vlan_find(op, vid))) { f->dst = op; goto skip_delete; } } + /* maybe bridge device has same hw addr? */ + if (ether_addr_equal(br->dev->dev_addr, + f->addr.addr) && + (!vid || br_vlan_find(br, vid))) { + f->dst = NULL; + goto skip_delete; + } + /* delete old one */ fdb_delete(br, f); skip_delete: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 939a59e15036..f91e1d9c8e92 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -585,6 +585,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); +bool br_vlan_find(struct net_bridge *br, u16 vid); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); @@ -666,6 +667,11 @@ static inline void br_vlan_flush(struct net_bridge *br) { } +static inline bool br_vlan_find(struct net_bridge *br, u16 vid) +{ + return false; +} + static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) { return -EOPNOTSUPP; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 4ca4d0a0151c..233ec1c6e9db 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -295,6 +295,25 @@ void br_vlan_flush(struct net_bridge *br) __vlan_flush(pv); } +bool br_vlan_find(struct net_bridge *br, u16 vid) +{ + struct net_port_vlans *pv; + bool found = false; + + rcu_read_lock(); + pv = rcu_dereference(br->vlan_info); + + if (!pv) + goto out; + + if (test_bit(vid, pv->vlan_bitmap)) + found = true; + +out: + rcu_read_unlock(); + return found; +} + int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { if (!rtnl_trylock()) -- cgit From 960b589f86c74ce582922fcb996103271081f4de Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:23 +0900 Subject: bridge: Properly check if local fdb entry can be deleted in br_fdb_change_mac_address br_fdb_change_mac_address() doesn't check if the local entry has the same address as any of bridge ports. Although I'm not sure when it is beneficial, current implementation allow the bridge device to receive any mac address of its ports. To preserve this behavior, we have to check if the mac address of the entry being deleted is identical to that of any port. As this check is almost the same as that in br_fdb_changeaddr(), create a common function fdb_delete_local() and call it from br_fdb_changeadddr() and br_fdb_change_mac_address(). Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 57 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b4005f5b28f4..15bf13d0b543 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -89,6 +89,34 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) call_rcu(&f->rcu, fdb_rcu_free); } +/* Delete a local entry if no other port had the same address. */ +static void fdb_delete_local(struct net_bridge *br, + const struct net_bridge_port *p, + struct net_bridge_fdb_entry *f) +{ + const unsigned char *addr = f->addr.addr; + u16 vid = f->vlan_id; + struct net_bridge_port *op; + + /* Maybe another port has same hw addr? */ + list_for_each_entry(op, &br->port_list, list) { + if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && + (!vid || nbp_vlan_find(op, vid))) { + f->dst = op; + return; + } + } + + /* Maybe bridge device has same hw addr? */ + if (p && ether_addr_equal(br->dev->dev_addr, addr) && + (!vid || br_vlan_find(br, vid))) { + f->dst = NULL; + return; + } + + fdb_delete(br, f); +} + void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge *br = p->br; @@ -107,30 +135,9 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); if (f->dst == p && f->is_local && !f->added_by_user) { - /* maybe another port has same hw addr? */ - struct net_bridge_port *op; - u16 vid = f->vlan_id; - list_for_each_entry(op, &br->port_list, list) { - if (op != p && - ether_addr_equal(op->dev->dev_addr, - f->addr.addr) && - (!vid || nbp_vlan_find(op, vid))) { - f->dst = op; - goto skip_delete; - } - } - - /* maybe bridge device has same hw addr? */ - if (ether_addr_equal(br->dev->dev_addr, - f->addr.addr) && - (!vid || br_vlan_find(br, vid))) { - f->dst = NULL; - goto skip_delete; - } - /* delete old one */ - fdb_delete(br, f); -skip_delete: + fdb_delete_local(br, p, f); + /* if this port has no vlan information * configured, we can safely be done at * this point. @@ -168,7 +175,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) /* If old entry was unassociated with any port, then delete it. */ f = __br_fdb_get(br, br->dev->dev_addr, 0); if (f && f->is_local && !f->dst) - fdb_delete(br, f); + fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, 0); @@ -183,7 +190,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { f = __br_fdb_get(br, br->dev->dev_addr, vid); if (f && f->is_local && !f->dst) - fdb_delete(br, f); + fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, vid); } } -- cgit From a778e6d1a51faaafa6a3a3cef9bee11c3bd47f9f Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:24 +0900 Subject: bridge: Properly check if local fdb entry can be deleted in br_fdb_delete_by_port br_fdb_delete_by_port() doesn't care about vlan and mac address of the bridge device. As the check is almost the same as mac address changing, slightly modify fdb_delete_local() and use it. Note that we can always set added_by_user to 0 in fdb_delete_local() because - br_fdb_delete_by_port() calls fdb_delete_local() for local entries regardless of its added_by_user. In this case, we have to check if another port has the same address and vlan, and if found, we have to create the entry (by changing dst). This is kernel-added entry, not user-added. - br_fdb_changeaddr() doesn't call fdb_delete_local() for user-added entry. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 15bf13d0b543..43e54d1ae956 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -103,6 +103,7 @@ static void fdb_delete_local(struct net_bridge *br, if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || nbp_vlan_find(op, vid))) { f->dst = op; + f->added_by_user = 0; return; } } @@ -111,6 +112,7 @@ static void fdb_delete_local(struct net_bridge *br, if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || br_vlan_find(br, vid))) { f->dst = NULL; + f->added_by_user = 0; return; } @@ -261,26 +263,11 @@ void br_fdb_delete_by_port(struct net_bridge *br, if (f->is_static && !do_all) continue; - /* - * if multiple ports all have the same device address - * then when one port is deleted, assign - * the local entry to other port - */ - if (f->is_local) { - struct net_bridge_port *op; - list_for_each_entry(op, &br->port_list, list) { - if (op != p && - ether_addr_equal(op->dev->dev_addr, - f->addr.addr)) { - f->dst = op; - f->added_by_user = 0; - goto skip_delete; - } - } - } - fdb_delete(br, f); - skip_delete: ; + if (f->is_local) + fdb_delete_local(br, p, f); + else + fdb_delete(br, f); } } spin_unlock_bh(&br->hash_lock); -- cgit From 424bb9c97cc1992018950485ca7410779b8ea21d Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:25 +0900 Subject: bridge: Properly check if local fdb entry can be deleted when deleting vlan Vlan codes unconditionally delete local fdb entries. We should consider the possibility that other ports have the same address and vlan. Example of problematic case: ip link set eth0 address 12:34:56:78:90:ab ip link set eth1 address aa:bb:cc:dd:ee:ff brctl addif br0 eth0 brctl addif br0 eth1 # br0 will have mac address 12:34:56:78:90:ab bridge vlan add dev eth0 vid 10 bridge vlan add dev eth1 vid 10 bridge vlan add dev br0 vid 10 self We will have fdb entry such that f->dst == eth0, f->vlan_id == 10 and f->addr == 12:34:56:78:90:ab at this time. Next, delete eth0 vlan 10. bridge vlan del dev eth0 vid 10 In this case, we still need the entry for br0, but it will be deleted. Note that br0 needs the entry even though its mac address is not set manually. To delete the entry with proper condition checking, fdb_delete_local() is suitable to use. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 20 ++++++++++++++++++-- net/bridge/br_private.h | 4 +++- net/bridge/br_vlan.c | 8 ++------ 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 43e54d1ae956..0426dff4b3fd 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -27,6 +27,9 @@ #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; +static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, + const unsigned char *addr, + __u16 vid); static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, @@ -119,6 +122,20 @@ static void fdb_delete_local(struct net_bridge *br, fdb_delete(br, f); } +void br_fdb_find_delete_local(struct net_bridge *br, + const struct net_bridge_port *p, + const unsigned char *addr, u16 vid) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)]; + struct net_bridge_fdb_entry *f; + + spin_lock_bh(&br->hash_lock); + f = fdb_find(head, addr, vid); + if (f && f->is_local && !f->added_by_user && f->dst == p) + fdb_delete_local(br, p, f); + spin_unlock_bh(&br->hash_lock); +} + void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge *br = p->br; @@ -770,8 +787,7 @@ out: return err; } -int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, - u16 vlan) +static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan) { struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f91e1d9c8e92..3ba11bc99b65 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -371,6 +371,9 @@ static inline void br_netpoll_disable(struct net_bridge_port *p) int br_fdb_init(void); void br_fdb_fini(void); void br_fdb_flush(struct net_bridge *br); +void br_fdb_find_delete_local(struct net_bridge *br, + const struct net_bridge_port *p, + const unsigned char *addr, u16 vid); void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(unsigned long arg); @@ -385,7 +388,6 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, bool added_by_user); -int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vid); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 233ec1c6e9db..8249ca764c79 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -275,9 +275,7 @@ int br_vlan_delete(struct net_bridge *br, u16 vid) if (!pv) return -EINVAL; - spin_lock_bh(&br->hash_lock); - fdb_delete_by_addr(br, br->dev->dev_addr, vid); - spin_unlock_bh(&br->hash_lock); + br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); __vlan_del(pv, vid); return 0; @@ -378,9 +376,7 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) if (!pv) return -EINVAL; - spin_lock_bh(&port->br->hash_lock); - fdb_delete_by_addr(port->br, port->dev->dev_addr, vid); - spin_unlock_bh(&port->br->hash_lock); + br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); return __vlan_del(pv, vid); } -- cgit From ac4c8868837a7c70ebb3eaf2298324a3b61b214e Mon Sep 17 00:00:00 2001 From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Date: Fri, 7 Feb 2014 16:48:26 +0900 Subject: bridge: Prevent possible race condition in br_fdb_change_mac_address br_fdb_change_mac_address() calls fdb_insert()/fdb_delete() without br->hash_lock. These hash list updates are racy with br_fdb_update()/br_fdb_cleanup(). Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/bridge/br_fdb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 0426dff4b3fd..9203d5a1943f 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -191,6 +191,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) struct net_port_vlans *pv; u16 vid = 0; + spin_lock_bh(&br->hash_lock); + /* If old entry was unassociated with any port, then delete it. */ f = __br_fdb_get(br, br->dev->dev_addr, 0); if (f && f->is_local && !f->dst) @@ -204,7 +206,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) */ pv = br_get_vlan_info(br); if (!pv) - return; + goto out; for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) { f = __br_fdb_get(br, br->dev->dev_addr, vid); @@ -212,6 +214,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) fdb_delete_local(br, NULL, f); fdb_insert(br, NULL, newaddr, vid); } +out: + spin_unlock_bh(&br->hash_lock); } void br_fdb_cleanup(unsigned long _data) -- cgit From bf06200e732de613a1277984bf34d1a21c2de03d Mon Sep 17 00:00:00 2001 From: John Ogness <john.ogness@linutronix.de> Date: Sun, 9 Feb 2014 18:40:11 -0800 Subject: tcp: tsq: fix nonagle handling Commit 46d3ceabd8d9 ("tcp: TCP Small Queues") introduced a possible regression for applications using TCP_NODELAY. If TCP session is throttled because of tsq, we should consult tp->nonagle when TX completion is done and allow us to send additional segment, especially if this segment is not a full MSS. Otherwise this segment is sent after an RTO. [edumazet] : Cooked the changelog, added another fix about testing sk_wmem_alloc twice because TX completion can happen right before setting TSQ_THROTTLED bit. This problem is particularly visible with recent auto corking, but might also be triggered with low tcp_limit_output_bytes values or NIC drivers delaying TX completion by hundred of usec, and very low rtt. Thomas Glanzmann for example reported an iscsi regression, caused by tcp auto corking making this bug quite visible. Fixes: 46d3ceabd8d9 ("tcp: TCP Small Queues") Signed-off-by: John Ogness <john.ogness@linutronix.de> Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Thomas Glanzmann <thomas@glanzmann.de> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_output.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 10435b3b9d0f..3be16727f058 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -698,7 +698,8 @@ static void tcp_tsq_handler(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); } /* * One tasklet per cpu tries to send more skbs. @@ -1904,7 +1905,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); - break; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + * We abuse smp_mb__after_clear_bit() because + * there is no smp_mb__after_set_bit() yet + */ + smp_mb__after_clear_bit(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + break; } limit = mss_now; -- cgit From b6f52ae2f0d32387bde2b89883e3b64d88b9bfe8 Mon Sep 17 00:00:00 2001 From: Richard Yao <ryao@gentoo.org> Date: Sat, 8 Feb 2014 19:32:01 -0500 Subject: 9p/trans_virtio.c: Fix broken zero-copy on vmalloc() buffers The 9p-virtio transport does zero copy on things larger than 1024 bytes in size. It accomplishes this by returning the physical addresses of pages to the virtio-pci device. At present, the translation is usually a bit shift. That approach produces an invalid page address when we read/write to vmalloc buffers, such as those used for Linux kernel modules. Any attempt to load a Linux kernel module from 9p-virtio produces the following stack. [<ffffffff814878ce>] p9_virtio_zc_request+0x45e/0x510 [<ffffffff814814ed>] p9_client_zc_rpc.constprop.16+0xfd/0x4f0 [<ffffffff814839dd>] p9_client_read+0x15d/0x240 [<ffffffff811c8440>] v9fs_fid_readn+0x50/0xa0 [<ffffffff811c84a0>] v9fs_file_readn+0x10/0x20 [<ffffffff811c84e7>] v9fs_file_read+0x37/0x70 [<ffffffff8114e3fb>] vfs_read+0x9b/0x160 [<ffffffff81153571>] kernel_read+0x41/0x60 [<ffffffff810c83ab>] copy_module_from_fd.isra.34+0xfb/0x180 Subsequently, QEMU will die printing: qemu-system-x86_64: virtio: trying to map MMIO memory This patch enables 9p-virtio to correctly handle this case. This not only enables us to load Linux kernel modules off virtfs, but also enables ZFS file-based vdevs on virtfs to be used without killing QEMU. Special thanks to both Avi Kivity and Alexander Graf for their interpretation of QEMU backtraces. Without their guidence, tracking down this bug would have taken much longer. Also, special thanks to Linus Torvalds for his insightful explanation of why this should use is_vmalloc_addr() instead of is_vmalloc_or_module_addr(): https://lkml.org/lkml/2014/2/8/272 Signed-off-by: Richard Yao <ryao@gentoo.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/9p/trans_virtio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index cd1e1ede73a4..ac2666c1d011 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -340,7 +340,10 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, int count = nr_pages; while (nr_pages) { s = rest_of_page(data); - pages[index++] = kmap_to_page(data); + if (is_vmalloc_addr(data)) + pages[index++] = vmalloc_to_page(data); + else + pages[index++] = kmap_to_page(data); data += s; nr_pages--; } -- cgit From 20e7c4e80dcd01dad5e6c8b32455228b8fe9c619 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Mon, 10 Feb 2014 11:42:35 -0800 Subject: 6lowpan: fix lockdep splats When a device ndo_start_xmit() calls again dev_queue_xmit(), lockdep can complain because dev_queue_xmit() is re-entered and the spinlocks protecting tx queues share a common lockdep class. Same issue was fixed for bonding/l2tp/ppp in commits 0daa2303028a6 ("[PATCH] bonding: lockdep annotation") 49ee49202b4ac ("bonding: set qdisc_tx_busylock to avoid LOCKDEP splat") 23d3b8bfb8eb2 ("net: qdisc busylock needs lockdep annotations ") 303c07db487be ("ppp: set qdisc_tx_busylock to avoid LOCKDEP splat ") Reported-by: Alexander Aring <alex.aring@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Tested-by: Alexander Aring <alex.aring@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ieee802154/6lowpan.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 8bfb40153fe7..8edfea5da572 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -530,7 +530,27 @@ static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static struct lock_class_key lowpan_tx_busylock; +static struct lock_class_key lowpan_netdev_xmit_lock_key; + +static void lowpan_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &lowpan_netdev_xmit_lock_key); +} + + +static int lowpan_dev_init(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, lowpan_set_lockdep_class_one, NULL); + dev->qdisc_tx_busylock = &lowpan_tx_busylock; + return 0; +} + static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_set_mac_address = lowpan_set_address, }; -- cgit From 06ea0bfe6e6043cb56a78935a19f6f8ebc636226 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@primarydata.com> Date: Tue, 11 Feb 2014 09:15:54 -0500 Subject: SUNRPC: Fix races in xs_nospace() When a send failure occurs due to the socket being out of buffer space, we call xs_nospace() in order to have the RPC task wait until the socket has drained enough to make it worth while trying again. The current patch fixes a race in which the socket is drained before we get round to setting up the machinery in xs_nospace(), and which is reported to cause hangs. Link: http://lkml.kernel.org/r/20140210170315.33dfc621@notabene.brown Fixes: a9a6b52ee1ba (SUNRPC: Don't start the retransmission timer...) Reported-by: Neil Brown <neilb@suse.com> Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/xprtsock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 817a1e523969..0addefca8e77 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -510,6 +510,7 @@ static int xs_nospace(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", @@ -527,7 +528,7 @@ static int xs_nospace(struct rpc_task *task) * window size */ set_bit(SOCK_NOSPACE, &transport->sock->flags); - transport->inet->sk_write_pending++; + sk->sk_write_pending++; /* ...and wait for more buffer space */ xprt_wait_for_buffer_space(task, xs_nospace_callback); } @@ -537,6 +538,9 @@ static int xs_nospace(struct rpc_task *task) } spin_unlock_bh(&xprt->transport_lock); + + /* Race breaker in case memory is freed before above code is called */ + sk->sk_write_space(sk); return ret; } -- cgit From 628356791b04ea988fee070f66a748a823d001bb Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@primarydata.com> Date: Tue, 11 Feb 2014 13:56:54 -0500 Subject: SUNRPC: Fix potential memory scribble in xprt_free_bc_request() The call to xprt_free_allocation() will call list_del() on req->rq_bc_pa_list, which is not attached to a list. This patch moves the list_del() out of xprt_free_allocation() and into those callers that need it. Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/backchannel_rqst.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 890a29912d5a..e860d4f7ed2a 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -64,7 +64,6 @@ static void xprt_free_allocation(struct rpc_rqst *req) free_page((unsigned long)xbufp->head[0].iov_base); xbufp = &req->rq_snd_buf; free_page((unsigned long)xbufp->head[0].iov_base); - list_del(&req->rq_bc_pa_list); kfree(req); } @@ -168,8 +167,10 @@ out_free: /* * Memory allocation failed, free the temporary list */ - list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) + list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) { + list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); + } dprintk("RPC: setup backchannel transport failed\n"); return -ENOMEM; @@ -198,6 +199,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) xprt_dec_alloc_count(xprt, max_reqs); list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { dprintk("RPC: req=%p\n", req); + list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); if (--max_reqs == 0) break; -- cgit From 64380a04deeed4720de76b086a3a4eab8dd41671 Mon Sep 17 00:00:00 2001 From: Erik Hugne <erik.hugne@ericsson.com> Date: Tue, 11 Feb 2014 11:38:26 +0100 Subject: tipc: fix message corruption bug for deferred packets If a packet received on a link is out-of-sequence, it will be placed on a deferred queue and later reinserted in the receive path once the preceding packets have been processed. The problem with this is that it will be subject to the buffer adjustment from link_recv_buf_validate twice. The second adjustment for 20 bytes header space will corrupt the packet. We solve this by tagging the deferred packets and bail out from receive buffer validation for packets that have already been subjected to this. Signed-off-by: Erik Hugne <erik.hugne@ericsson.com> Reviewed-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/core.h | 1 + net/tipc/link.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index 1ff477b0450d..5569d96b4da3 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -192,6 +192,7 @@ static inline void k_term_timer(struct timer_list *timer) struct tipc_skb_cb { void *handle; + bool deferred; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) diff --git a/net/tipc/link.c b/net/tipc/link.c index d4b5de41b682..da6018beb6eb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1391,6 +1391,12 @@ static int link_recv_buf_validate(struct sk_buff *buf) u32 hdr_size; u32 min_hdr_size; + /* If this packet comes from the defer queue, the skb has already + * been validated + */ + if (unlikely(TIPC_SKB_CB(buf)->deferred)) + return 1; + if (unlikely(buf->len < MIN_H_SIZE)) return 0; @@ -1703,6 +1709,7 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, &l_ptr->newest_deferred_in, buf)) { l_ptr->deferred_inqueue_sz++; l_ptr->stats.deferred_recv++; + TIPC_SKB_CB(buf)->deferred = true; if ((l_ptr->deferred_inqueue_sz % 16) == 1) tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } else -- cgit From 0e0eee2465df77bcec2e8ff75432b8e57897b143 Mon Sep 17 00:00:00 2001 From: Cong Wang <cwang@twopensource.com> Date: Tue, 11 Feb 2014 15:51:30 -0800 Subject: net: correct error path in rtnl_newlink() I saw the following BUG when ->newlink() fails in rtnl_newlink(): [ 40.240058] kernel BUG at net/core/dev.c:6438! this is due to free_netdev() is not supposed to be called before netdev is completely unregistered, therefore it is not correct to call free_netdev() here, at least for ops->newlink!=NULL case, many drivers call it in ->destructor so that rtnl_unlock() will take care of it, we probably don't need to do anything here. Cc: David S. Miller <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: Cong Wang <cwang@twopensource.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/rtnetlink.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 048dc8d183aa..1a0dac2ef9ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1963,16 +1963,21 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) + if (ops->newlink) { err = ops->newlink(net, dev, tb, data); - else + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure so that device could be + * finally freed in rtnl_unlock. + */ + if (err < 0) + goto out; + } else { err = register_netdevice(dev); - - if (err < 0) { - free_netdev(dev); - goto out; + if (err < 0) { + free_netdev(dev); + goto out; + } } - err = rtnl_configure_link(dev, ifm); if (err < 0) unregister_netdevice(dev); -- cgit From 22a1f5140ed69baa5906ce9b2b9143478f5a4da0 Mon Sep 17 00:00:00 2001 From: wangweidong <wangweidong1@huawei.com> Date: Wed, 12 Feb 2014 09:44:43 +0800 Subject: sctp: fix a missed .data initialization As commit 3c68198e75111a90("sctp: Make hmac algorithm selection for cookie generation dynamic"), we miss the .data initialization. If we don't use the net_namespace, the problem that parts of the sysctl configuration won't be isolation and won't occur. In sctp_sysctl_net_register(), we register the sysctl for each net, in the for(), we use the 'table[i].data' as check condition, so when the 'i' is the index of sctp_hmac_alg, the data is NULL, then break. So add the .data initialization. Acked-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: Wang Weidong <wangweidong1@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sctp/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 7135e617ab0f..d354de5a6fe0 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -151,6 +151,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", + .data = &init_net.sctp.sctp_hmac_alg, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, -- cgit From efb842c45e667332774196bd5a1d539d048159cc Mon Sep 17 00:00:00 2001 From: wangweidong <wangweidong1@huawei.com> Date: Wed, 12 Feb 2014 09:44:44 +0800 Subject: sctp: optimize the sctp_sysctl_net_register Here, when the net is init_net, we needn't to kmemdup the ctl_table again. So add a check for net. Also we can save some memory. Signed-off-by: Wang Weidong <wangweidong1@huawei.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sctp/sysctl.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index d354de5a6fe0..35c8923b5554 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -402,15 +402,18 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, int sctp_sysctl_net_register(struct net *net) { - struct ctl_table *table; - int i; + struct ctl_table *table = sctp_net_table; + + if (!net_eq(net, &init_net)) { + int i; - table = kmemdup(sctp_net_table, sizeof(sctp_net_table), GFP_KERNEL); - if (!table) - return -ENOMEM; + table = kmemdup(sctp_net_table, sizeof(sctp_net_table), GFP_KERNEL); + if (!table) + return -ENOMEM; - for (i = 0; table[i].data; i++) - table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; + for (i = 0; table[i].data; i++) + table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; + } net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table); return 0; -- cgit From d206940319c41df4299db75ed56142177bb2e5f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal <fw@strlen.de> Date: Thu, 13 Feb 2014 23:09:11 +0100 Subject: net: core: introduce netif_skb_dev_features Will be used by upcoming ipv4 forward path change that needs to determine feature mask using skb->dst->dev instead of skb->dev. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 440a02ee6f92..21d4e6be8949 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3068,7 +3068,12 @@ void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -netdev_features_t netif_skb_features(struct sk_buff *skb); +netdev_features_t netif_skb_dev_features(struct sk_buff *skb, + const struct net_device *dev); +static inline netdev_features_t netif_skb_features(struct sk_buff *skb) +{ + return netif_skb_dev_features(skb, skb->dev); +} static inline bool net_gso_ok(netdev_features_t features, int gso_type) { diff --git a/net/core/dev.c b/net/core/dev.c index 4ad1b78c9c77..b1b0c8d4d7df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2420,7 +2420,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2495,34 +2495,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - netdev_features_t features) + const struct net_device *dev, + netdev_features_t features) { if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, skb_network_protocol(skb))) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(skb->dev, skb)) { + } else if (illegal_highdma(dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_dev_features(struct sk_buff *skb, + const struct net_device *dev) { __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; + netdev_features_t features = dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + return harmonize_features(skb, dev, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) @@ -2530,9 +2532,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, features); + return harmonize_features(skb, dev, features); } -EXPORT_SYMBOL(netif_skb_features); +EXPORT_SYMBOL(netif_skb_dev_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) -- cgit From fe6cc55f3a9a053482a76f5a6b2257cee51b4663 Mon Sep 17 00:00:00 2001 From: Florian Westphal <fw@strlen.de> Date: Thu, 13 Feb 2014 23:09:12 +0100 Subject: net: ip, ipv6: handle gso skbs in forwarding path Marcelo Ricardo Leitner reported problems when the forwarding link path has a lower mtu than the incoming one if the inbound interface supports GRO. Given: Host <mtu1500> R1 <mtu1200> R2 Host sends tcp stream which is routed via R1 and R2. R1 performs GRO. In this case, the kernel will fail to send ICMP fragmentation needed messages (or pkt too big for ipv6), as GSO packets currently bypass dstmtu checks in forward path. Instead, Linux tries to send out packets exceeding the mtu. When locking route MTU on Host (i.e., no ipv4 DF bit set), R1 does not fragment the packets when forwarding, and again tries to send out packets exceeding R1-R2 link mtu. This alters the forwarding dstmtu checks to take the individual gso segment lengths into account. For ipv6, we send out pkt too big error for gso if the individual segments are too big. For ipv4, we either send icmp fragmentation needed, or, if the DF bit is not set, perform software segmentation and let the output path create fragments when the packet is leaving the machine. It is not 100% correct as the error message will contain the headers of the GRO skb instead of the original/segmented one, but it seems to work fine in my (limited) tests. Eric Dumazet suggested to simply shrink mss via ->gso_size to avoid sofware segmentation. However it turns out that skb_segment() assumes skb nr_frags is related to mss size so we would BUG there. I don't want to mess with it considering Herbert and Eric disagree on what the correct behavior should be. Hannes Frederic Sowa notes that when we would shrink gso_size skb_segment would then also need to deal with the case where SKB_MAX_FRAGS would be exceeded. This uses sofware segmentation in the forward path when we hit ipv4 non-DF packets and the outgoing link mtu is too small. Its not perfect, but given the lack of bug reports wrt. GRO fwd being broken this is a rare case anyway. Also its not like this could not be improved later once the dust settles. Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Reported-by: Marcelo Ricardo Leitner <mleitner@redhat.com> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/skbuff.h | 17 ++++++++++++ net/ipv4/ip_forward.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++-- net/ipv6/ip6_output.c | 17 ++++++++++-- 3 files changed, 101 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f589c9af8cbf..3ebbbe7b6d05 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2916,5 +2916,22 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e9f1217a8afd..f3869c186d97 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,71 @@ #include <net/route.h> #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + !skb->local_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb) +{ + unsigned int mtu; + + if (skb->local_df || !skb_is_gso(skb)) + return false; + + mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true); + + /* if seglen > mtu, do software segmentation for IP fragmentation on + * output. DF bit cannot be set since ip_forward would have sent + * icmp error. + */ + return skb_gso_network_seglen(skb) > mtu; +} + +/* called if GSO skb needs to be fragmented on forward */ +static int ip_forward_finish_gso(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + features = netif_skb_dev_features(skb, dst->dev); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = dst_output(segs); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -49,6 +114,9 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + if (ip_gso_exceeds_dst_mtu(skb)) + return ip_forward_finish_gso(skb); + return dst_output(skb); } @@ -91,8 +159,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (unlikely(skb->len > mtu && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef02b26ccf81..070a2fae2375 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -342,6 +342,20 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) + return true; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -466,8 +480,7 @@ int ip6_forward(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) || - (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { + if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -- cgit From 219e288e8900fac65211e0a23e2a1037fd521af1 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian <vijaynsu@cisco.com> Date: Wed, 12 Feb 2014 18:58:21 -0800 Subject: net: sched: Cleanup PIE comments Fix incorrect comment reported by Norbert Kiesel. Edit another comment to add more details. Also add references to algorithm (IETF draft and paper) to top of file. Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com> CC: Mythili Prabhu <mysuryan@cisco.com> CC: Norbert Kiesel <nkiesel@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/sch_pie.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index a255d0200a59..fefeeb73f15f 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -15,6 +15,11 @@ * * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" */ #include <linux/module.h> @@ -36,7 +41,7 @@ struct pie_params { psched_time_t target; /* user specified target delay in pschedtime */ u32 tupdate; /* timer frequency (in jiffies) */ u32 limit; /* number of packets that can be enqueued */ - u32 alpha; /* alpha and beta are between -4 and 4 */ + u32 alpha; /* alpha and beta are between 0 and 32 */ u32 beta; /* and are used for shift relative to 1 */ bool ecn; /* true if ecn is enabled */ bool bytemode; /* to scale drop early prob based on pkt size */ @@ -326,10 +331,16 @@ static void calculate_probability(struct Qdisc *sch) if (qdelay == 0 && qlen != 0) update_prob = false; - /* Add ranges for alpha and beta, more aggressive for high dropping - * mode and gentle steps for light dropping mode - * In light dropping mode, take gentle steps; in medium dropping mode, - * take medium steps; in high dropping mode, take big steps. + /* In the algorithm, alpha and beta are between 0 and 2 with typical + * value for alpha as 0.125. In this implementation, we use values 0-32 + * passed from user space to represent this. Also, alpha and beta have + * unit of HZ and need to be scaled before they can used to update + * probability. alpha/beta are updated locally below by 1) scaling them + * appropriately 2) scaling down by 16 to come to 0-2 range. + * Please see paper for details. + * + * We scale alpha and beta differently depending on whether we are in + * light, medium or high dropping mode. */ if (q->vars.prob < MAX_PROB / 100) { alpha = -- cgit From 357137a4222b1984b3d867923c7e388669744ceb Mon Sep 17 00:00:00 2001 From: FX Le Bail <fx.lebail@yahoo.com> Date: Mon, 10 Feb 2014 16:46:54 +0100 Subject: ipv4: ipconfig.c: add parentheses in an if statement Even if the 'time_before' macro expand with parentheses, the look is bad. Signed-off-by: Francois-Xavier Le Bail <fx.lebail@yahoo.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/ipconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index efa1138fa523..b3e86ea7b71b 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -273,7 +273,7 @@ static int __init ic_open_devs(void) msleep(1); - if time_before(jiffies, next_msg) + if (time_before(jiffies, next_msg)) continue; elapsed = jiffies_to_msecs(jiffies - start); -- cgit From 9eb2ddb48ce3a7bd745c14a933112994647fa3cd Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@primarydata.com> Date: Sun, 16 Feb 2014 12:14:13 -0500 Subject: SUNRPC: Ensure that gss_auth isn't freed before its upcall messages Fix a race in which the RPC client is shutting down while the gss daemon is processing a downcall. If the RPC client manages to shut down before the gss daemon is done, then the struct gss_auth used in gss_release_msg() may have already been freed. Link: http://lkml.kernel.org/r/1392494917.71728.YahooMailNeo@web140002.mail.bf1.yahoo.com Reported-by: John <da_audiophile@yahoo.com> Reported-by: Borislav Petkov <bp@alien8.de> Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/auth_gss/auth_gss.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 44a61e8fda6f..1ba1fd114912 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -108,6 +108,7 @@ struct gss_auth { static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); +static void gss_put_auth(struct gss_auth *gss_auth); static void gss_free_ctx(struct gss_cl_ctx *); static const struct rpc_pipe_ops gss_upcall_ops_v0; @@ -320,6 +321,7 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); + gss_put_auth(gss_msg->auth); kfree(gss_msg); } @@ -500,6 +502,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, if (err) goto err_free_msg; }; + kref_get(&gss_auth->kref); return gss_msg; err_free_msg: kfree(gss_msg); @@ -1063,6 +1066,12 @@ gss_free_callback(struct kref *kref) gss_free(gss_auth); } +static void +gss_put_auth(struct gss_auth *gss_auth) +{ + kref_put(&gss_auth->kref, gss_free_callback); +} + static void gss_destroy(struct rpc_auth *auth) { @@ -1084,7 +1093,7 @@ gss_destroy(struct rpc_auth *auth) gss_auth->gss_pipe[1] = NULL; rpcauth_destroy_credcache(auth); - kref_put(&gss_auth->kref, gss_free_callback); + gss_put_auth(gss_auth); } /* @@ -1255,7 +1264,7 @@ gss_destroy_nullcred(struct rpc_cred *cred) call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); - kref_put(&gss_auth->kref, gss_free_callback); + gss_put_auth(gss_auth); } static void -- cgit From e9776d0f4adee8877145672f6416b06b57f2dc27 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@primarydata.com> Date: Sun, 16 Feb 2014 13:28:01 -0500 Subject: SUNRPC: Fix a pipe_version reference leak In gss_alloc_msg(), if the call to gss_encode_v1_msg() fails, we want to release the reference to the pipe_version that was obtained earlier in the function. Fixes: 9d3a2260f0f4b (SUNRPC: Fix buffer overflow checking in...) Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/auth_gss/auth_gss.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 1ba1fd114912..36e431ee1c90 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -500,10 +500,12 @@ gss_alloc_msg(struct gss_auth *gss_auth, default: err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); if (err) - goto err_free_msg; + goto err_put_pipe_version; }; kref_get(&gss_auth->kref); return gss_msg; +err_put_pipe_version: + put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); err: -- cgit From 09db30805300e9ed5ad43d4d339115cf1d9c84e1 Mon Sep 17 00:00:00 2001 From: Gerrit Renker <gerrit@erg.abdn.ac.uk> Date: Thu, 13 Feb 2014 19:02:33 -0700 Subject: dccp: re-enable debug macro dccp tfrc: revert This reverts 6aee49c558de ("dccp: make local variable static") since the variable tfrc_debug is referenced by the tfrc_pr_debug(fmt, ...) macro when TFRC debugging is enabled. If it is enabled, use of the macro produces a compilation error. Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/dccp/ccids/lib/tfrc.c | 2 +- net/dccp/ccids/lib/tfrc.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c index c073b81a1f3e..62b5828acde0 100644 --- a/net/dccp/ccids/lib/tfrc.c +++ b/net/dccp/ccids/lib/tfrc.c @@ -8,7 +8,7 @@ #include "tfrc.h" #ifdef CONFIG_IP_DCCP_TFRC_DEBUG -static bool tfrc_debug; +bool tfrc_debug; module_param(tfrc_debug, bool, 0644); MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); #endif diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index a3d8f7c76ae0..40ee7d62b652 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -21,6 +21,7 @@ #include "packet_history.h" #ifdef CONFIG_IP_DCCP_TFRC_DEBUG +extern bool tfrc_debug; #define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) #else #define tfrc_pr_debug(format, a...) -- cgit From cd0f0b95fd2cd2b716caf5f15db73ab76992789b Mon Sep 17 00:00:00 2001 From: Duan Jiong <duanj.fnst@cn.fujitsu.com> Date: Fri, 14 Feb 2014 18:26:22 +0800 Subject: ipv4: distinguish EHOSTUNREACH from the ENETUNREACH since commit 251da413("ipv4: Cache ip_error() routes even when not forwarding."), the counter IPSTATS_MIB_INADDRERRORS can't work correctly, because the value of err was always set to ENETUNREACH. Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/route.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 25071b48921c..6f6dd85bdffc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1695,8 +1695,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); - if (err != 0) + if (err != 0) { + if (!IN_DEV_FORWARD(in_dev)) + err = -EHOSTUNREACH; goto no_route; + } RT_CACHE_STAT_INC(in_slow_tot); @@ -1712,8 +1715,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; -- cgit From ef2820a735f74ea60335f8ba3801b844f0cb184d Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Date: Fri, 14 Feb 2014 14:51:18 +0100 Subject: net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer Implementation of (a)rwnd calculation might lead to severe performance issues and associations completely stalling. These problems are described and solution is proposed which improves lksctp's robustness in congestion state. 1) Sudden drop of a_rwnd and incomplete window recovery afterwards Data accounted in sctp_assoc_rwnd_decrease takes only payload size (sctp data), but size of sk_buff, which is blamed against receiver buffer, is not accounted in rwnd. Theoretically, this should not be the problem as actual size of buffer is double the amount requested on the socket (SO_RECVBUF). Problem here is that this will have bad scaling for data which is less then sizeof sk_buff. E.g. in 4G (LTE) networks, link interfacing radio side will have a large portion of traffic of this size (less then 100B). An example of sudden drop and incomplete window recovery is given below. Node B exhibits problematic behavior. Node A initiates association and B is configured to advertise rwnd of 10000. A sends messages of size 43B (size of typical sctp message in 4G (LTE) network). On B data is left in buffer by not reading socket in userspace. Lets examine when we will hit pressure state and declare rwnd to be 0 for scenario with above stated parameters (rwnd == 10000, chunk size == 43, each chunk is sent in separate sctp packet) Logic is implemented in sctp_assoc_rwnd_decrease: socket_buffer (see below) is maximum size which can be held in socket buffer (sk_rcvbuf). current_alloced is amount of data currently allocated (rx_count) A simple expression is given for which it will be examined after how many packets for above stated parameters we enter pressure state: We start by condition which has to be met in order to enter pressure state: socket_buffer < currently_alloced; currently_alloced is represented as size of sctp packets received so far and not yet delivered to userspace. x is the number of chunks/packets (since there is no bundling, and each chunk is delivered in separate packet, we can observe each chunk also as sctp packet, and what is important here, having its own sk_buff): socket_buffer < x*each_sctp_packet; each_sctp_packet is sctp chunk size + sizeof(struct sk_buff). socket_buffer is twice the amount of initially requested size of socket buffer, which is in case of sctp, twice the a_rwnd requested: 2*rwnd < x*(payload+sizeof(struc sk_buff)); sizeof(struct sk_buff) is 190 (3.13.0-rc4+). Above is stated that rwnd is 10000 and each payload size is 43 20000 < x(43+190); x > 20000/233; x ~> 84; After ~84 messages, pressure state is entered and 0 rwnd is advertised while received 84*43B ~= 3612B sctp data. This is why external observer notices sudden drop from 6474 to 0, as it will be now shown in example: IP A.34340 > B.12345: sctp (1) [INIT] [init tag: 1875509148] [rwnd: 81920] [OS: 10] [MIS: 65535] [init TSN: 1096057017] IP B.12345 > A.34340: sctp (1) [INIT ACK] [init tag: 3198966556] [rwnd: 10000] [OS: 10] [MIS: 10] [init TSN: 902132839] IP A.34340 > B.12345: sctp (1) [COOKIE ECHO] IP B.12345 > A.34340: sctp (1) [COOKIE ACK] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057017] [SID: 0] [SSEQ 0] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057017] [a_rwnd 9957] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057018] [SID: 0] [SSEQ 1] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057018] [a_rwnd 9957] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057019] [SID: 0] [SSEQ 2] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057019] [a_rwnd 9914] [#gap acks 0] [#dup tsns 0] <...> IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057098] [SID: 0] [SSEQ 81] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057098] [a_rwnd 6517] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057099] [SID: 0] [SSEQ 82] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057099] [a_rwnd 6474] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057100] [SID: 0] [SSEQ 83] [PPID 0x18] --> Sudden drop IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057100] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] At this point, rwnd_press stores current rwnd value so it can be later restored in sctp_assoc_rwnd_increase. This however doesn't happen as condition to start slowly increasing rwnd until rwnd_press is returned to rwnd is never met. This condition is not met since rwnd, after it hit 0, must first reach rwnd_press by adding amount which is read from userspace. Let us observe values in above example. Initial a_rwnd is 10000, pressure was hit when rwnd was ~6500 and the amount of actual sctp data currently waiting to be delivered to userspace is ~3500. When userspace starts to read, sctp_assoc_rwnd_increase will be blamed only for sctp data, which is ~3500. Condition is never met, and when userspace reads all data, rwnd stays on 3569. IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057100] [a_rwnd 1505] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057100] [a_rwnd 3010] [#gap acks 0] [#dup tsns 0] IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057101] [SID: 0] [SSEQ 84] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057101] [a_rwnd 3569] [#gap acks 0] [#dup tsns 0] --> At this point userspace read everything, rwnd recovered only to 3569 IP A.34340 > B.12345: sctp (1) [DATA] (B)(E) [TSN: 1096057102] [SID: 0] [SSEQ 85] [PPID 0x18] IP B.12345 > A.34340: sctp (1) [SACK] [cum ack 1096057102] [a_rwnd 3569] [#gap acks 0] [#dup tsns 0] Reproduction is straight forward, it is enough for sender to send packets of size less then sizeof(struct sk_buff) and receiver keeping them in its buffers. 2) Minute size window for associations sharing the same socket buffer In case multiple associations share the same socket, and same socket buffer (sctp.rcvbuf_policy == 0), different scenarios exist in which congestion on one of the associations can permanently drop rwnd of other association(s). Situation will be typically observed as one association suddenly having rwnd dropped to size of last packet received and never recovering beyond that point. Different scenarios will lead to it, but all have in common that one of the associations (let it be association from 1)) nearly depleted socket buffer, and the other association blames socket buffer just for the amount enough to start the pressure. This association will enter pressure state, set rwnd_press and announce 0 rwnd. When data is read by userspace, similar situation as in 1) will occur, rwnd will increase just for the size read by userspace but rwnd_press will be high enough so that association doesn't have enough credit to reach rwnd_press and restore to previous state. This case is special case of 1), being worse as there is, in the worst case, only one packet in buffer for which size rwnd will be increased. Consequence is association which has very low maximum rwnd ('minute size', in our case down to 43B - size of packet which caused pressure) and as such unusable. Scenario happened in the field and labs frequently after congestion state (link breaks, different probabilities of packet drop, packet reordering) and with scenario 1) preceding. Here is given a deterministic scenario for reproduction: >From node A establish two associations on the same socket, with rcvbuf_policy being set to share one common buffer (sctp.rcvbuf_policy == 0). On association 1 repeat scenario from 1), that is, bring it down to 0 and restore up. Observe scenario 1). Use small payload size (here we use 43). Once rwnd is 'recovered', bring it down close to 0, as in just one more packet would close it. This has as a consequence that association number 2 is able to receive (at least) one more packet which will bring it in pressure state. E.g. if association 2 had rwnd of 10000, packet received was 43, and we enter at this point into pressure, rwnd_press will have 9957. Once payload is delivered to userspace, rwnd will increase for 43, but conditions to restore rwnd to original state, just as in 1), will never be satisfied. --> Association 1, between A.y and B.12345 IP A.55915 > B.12345: sctp (1) [INIT] [init tag: 836880897] [rwnd: 10000] [OS: 10] [MIS: 65535] [init TSN: 4032536569] IP B.12345 > A.55915: sctp (1) [INIT ACK] [init tag: 2873310749] [rwnd: 81920] [OS: 10] [MIS: 10] [init TSN: 3799315613] IP A.55915 > B.12345: sctp (1) [COOKIE ECHO] IP B.12345 > A.55915: sctp (1) [COOKIE ACK] --> Association 2, between A.z and B.12346 IP A.55915 > B.12346: sctp (1) [INIT] [init tag: 534798321] [rwnd: 10000] [OS: 10] [MIS: 65535] [init TSN: 2099285173] IP B.12346 > A.55915: sctp (1) [INIT ACK] [init tag: 516668823] [rwnd: 81920] [OS: 10] [MIS: 10] [init TSN: 3676403240] IP A.55915 > B.12346: sctp (1) [COOKIE ECHO] IP B.12346 > A.55915: sctp (1) [COOKIE ACK] --> Deplete socket buffer by sending messages of size 43B over association 1 IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315613] [SID: 0] [SSEQ 0] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315613] [a_rwnd 9957] [#gap acks 0] [#dup tsns 0] <...> IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315696] [a_rwnd 6388] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315697] [SID: 0] [SSEQ 84] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315697] [a_rwnd 6345] [#gap acks 0] [#dup tsns 0] --> Sudden drop on 1 IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315698] [SID: 0] [SSEQ 85] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315698] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] --> Here userspace read, rwnd 'recovered' to 3698, now deplete again using association 1 so there is place in buffer for only one more packet IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315799] [SID: 0] [SSEQ 186] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315799] [a_rwnd 86] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315800] [SID: 0] [SSEQ 187] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315800] [a_rwnd 43] [#gap acks 0] [#dup tsns 0] --> Socket buffer is almost depleted, but there is space for one more packet, send them over association 2, size 43B IP B.12346 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3676403240] [SID: 0] [SSEQ 0] [PPID 0x18] IP A.55915 > B.12346: sctp (1) [SACK] [cum ack 3676403240] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] --> Immediate drop IP A.60995 > B.12346: sctp (1) [SACK] [cum ack 387491510] [a_rwnd 0] [#gap acks 0] [#dup tsns 0] --> Read everything from the socket, both association recover up to maximum rwnd they are capable of reaching, note that association 1 recovered up to 3698, and association 2 recovered only to 43 IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315800] [a_rwnd 1548] [#gap acks 0] [#dup tsns 0] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315800] [a_rwnd 3053] [#gap acks 0] [#dup tsns 0] IP B.12345 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3799315801] [SID: 0] [SSEQ 188] [PPID 0x18] IP A.55915 > B.12345: sctp (1) [SACK] [cum ack 3799315801] [a_rwnd 3698] [#gap acks 0] [#dup tsns 0] IP B.12346 > A.55915: sctp (1) [DATA] (B)(E) [TSN: 3676403241] [SID: 0] [SSEQ 1] [PPID 0x18] IP A.55915 > B.12346: sctp (1) [SACK] [cum ack 3676403241] [a_rwnd 43] [#gap acks 0] [#dup tsns 0] A careful reader might wonder why it is necessary to reproduce 1) prior reproduction of 2). It is simply easier to observe when to send packet over association 2 which will push association into the pressure state. Proposed solution: Both problems share the same root cause, and that is improper scaling of socket buffer with rwnd. Solution in which sizeof(sk_buff) is taken into concern while calculating rwnd is not possible due to fact that there is no linear relationship between amount of data blamed in increase/decrease with IP packet in which payload arrived. Even in case such solution would be followed, complexity of the code would increase. Due to nature of current rwnd handling, slow increase (in sctp_assoc_rwnd_increase) of rwnd after pressure state is entered is rationale, but it gives false representation to the sender of current buffer space. Furthermore, it implements additional congestion control mechanism which is defined on implementation, and not on standard basis. Proposed solution simplifies whole algorithm having on mind definition from rfc: o Receiver Window (rwnd): This gives the sender an indication of the space available in the receiver's inbound buffer. Core of the proposed solution is given with these lines: sctp_assoc_rwnd_update: if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; else asoc->rwnd = 0; We advertise to sender (half of) actual space we have. Half is in the braces depending whether you would like to observe size of socket buffer as SO_RECVBUF or twice the amount, i.e. size is the one visible from userspace, that is, from kernelspace. In this way sender is given with good approximation of our buffer space, regardless of the buffer policy - we always advertise what we have. Proposed solution fixes described problems and removes necessity for rwnd restoration algorithm. Finally, as proposed solution is simplification, some lines of code, along with some bytes in struct sctp_association are saved. Version 2 of the patch addressed comments from Vlad. Name of the function is set to be more descriptive, and two parts of code are changed, in one removing the superfluous call to sctp_assoc_rwnd_update since call would not result in update of rwnd, and the other being reordering of the code in a way that call to sctp_assoc_rwnd_update updates rwnd. Version 3 corrected change introduced in v2 in a way that existing function is not reordered/copied in line, but it is correctly called. Thanks Vlad for suggesting. Signed-off-by: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Reviewed-by: Alexander Sverdlin <alexander.sverdlin@nsn.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/sctp/structs.h | 14 +------- net/sctp/associola.c | 82 ++++++++++------------------------------------ net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 6 ---- net/sctp/ulpevent.c | 8 +++-- 5 files changed, 25 insertions(+), 87 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d992ca3145fe..6ee76c804893 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1653,17 +1653,6 @@ struct sctp_association { /* This is the last advertised value of rwnd over a SACK chunk. */ __u32 a_rwnd; - /* Number of bytes by which the rwnd has slopped. The rwnd is allowed - * to slop over a maximum of the association's frag_point. - */ - __u32 rwnd_over; - - /* Keeps treack of rwnd pressure. This happens when we have - * a window, but not recevie buffer (i.e small packets). This one - * is releases slowly (1 PMTU at a time ). - */ - __u32 rwnd_press; - /* This is the sndbuf size in use for the association. * This corresponds to the sndbuf size for the association, * as specified in the sk->sndbuf. @@ -1892,8 +1881,7 @@ void sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *); -void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); -void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); +void sctp_assoc_rwnd_update(struct sctp_association *, bool); void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5ae609200674..f558433537b8 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1367,44 +1367,35 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) return false; } -/* Increase asoc's rwnd by len and send any window update SACK if needed. */ -void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) +/* Update asoc's rwnd for the approximated state in the buffer, + * and check whether SACK needs to be sent. + */ +void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) { + int rx_count; struct sctp_chunk *sack; struct timer_list *timer; - if (asoc->rwnd_over) { - if (asoc->rwnd_over >= len) { - asoc->rwnd_over -= len; - } else { - asoc->rwnd += (len - asoc->rwnd_over); - asoc->rwnd_over = 0; - } - } else { - asoc->rwnd += len; - } + if (asoc->ep->rcvbuf_policy) + rx_count = atomic_read(&asoc->rmem_alloc); + else + rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); - /* If we had window pressure, start recovering it - * once our rwnd had reached the accumulated pressure - * threshold. The idea is to recover slowly, but up - * to the initial advertised window. - */ - if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { - int change = min(asoc->pathmtu, asoc->rwnd_press); - asoc->rwnd += change; - asoc->rwnd_press -= change; - } + if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) + asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; + else + asoc->rwnd = 0; - pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", - __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, - asoc->a_rwnd); + pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n", + __func__, asoc, asoc->rwnd, rx_count, + asoc->base.sk->sk_rcvbuf); /* Send a window update SACK if the rwnd has increased by at least the * minimum of the association's PMTU and half of the receive buffer. * The algorithm used is similar to the one described in * Section 4.2.3.3 of RFC 1122. */ - if (sctp_peer_needs_update(asoc)) { + if (update_peer && sctp_peer_needs_update(asoc)) { asoc->a_rwnd = asoc->rwnd; pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " @@ -1426,45 +1417,6 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) } } -/* Decrease asoc's rwnd by len. */ -void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) -{ - int rx_count; - int over = 0; - - if (unlikely(!asoc->rwnd || asoc->rwnd_over)) - pr_debug("%s: association:%p has asoc->rwnd:%u, " - "asoc->rwnd_over:%u!\n", __func__, asoc, - asoc->rwnd, asoc->rwnd_over); - - if (asoc->ep->rcvbuf_policy) - rx_count = atomic_read(&asoc->rmem_alloc); - else - rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); - - /* If we've reached or overflowed our receive buffer, announce - * a 0 rwnd if rwnd would still be positive. Store the - * the potential pressure overflow so that the window can be restored - * back to original value. - */ - if (rx_count >= asoc->base.sk->sk_rcvbuf) - over = 1; - - if (asoc->rwnd >= len) { - asoc->rwnd -= len; - if (over) { - asoc->rwnd_press += asoc->rwnd; - asoc->rwnd = 0; - } - } else { - asoc->rwnd_over = len - asoc->rwnd; - asoc->rwnd = 0; - } - - pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", - __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, - asoc->rwnd_press); -} /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 483dcd71b3c5..591b44d3b7de 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6176,7 +6176,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * PMTU. In cases, such as loopback, this might be a rather * large spill over. */ - if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over || + if ((!chunk->data_accepted) && (!asoc->rwnd || (datalen > asoc->rwnd + asoc->frag_point))) { /* If this is the next TSN, consider reneging to make diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9e91d6e5df63..7075ac847fde 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2092,12 +2092,6 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sctp_skb_pull(skb, copied); skb_queue_head(&sk->sk_receive_queue, skb); - /* When only partial message is copied to the user, increase - * rwnd by that amount. If all the data in the skb is read, - * rwnd is updated when the event is freed. - */ - if (!sctp_ulpevent_is_notification(event)) - sctp_assoc_rwnd_increase(event->asoc, copied); goto out; } else if ((event->msg_flags & MSG_NOTIFICATION) || (event->msg_flags & MSG_EOR)) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 85c64658bd0b..8d198ae03606 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, skb = sctp_event2skb(event); /* Set the owner and charge rwnd for bytes received. */ sctp_ulpevent_set_owner(event, asoc); - sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); + sctp_assoc_rwnd_update(asoc, false); if (!skb->data_len) return; @@ -1011,6 +1011,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; unsigned int len; + struct sctp_association *asoc; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as @@ -1035,8 +1036,11 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) } done: - sctp_assoc_rwnd_increase(event->asoc, len); + asoc = event->asoc; + sctp_association_hold(asoc); sctp_ulpevent_release_owner(event); + sctp_assoc_rwnd_update(asoc, true); + sctp_association_put(asoc); } static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) -- cgit From 99932d4fc03a13bb3e94938fe25458fabc8f2fc3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <dborkman@redhat.com> Date: Sun, 16 Feb 2014 15:55:20 +0100 Subject: netdevice: add queue selection fallback handler for ndo_select_queue Add a new argument for ndo_select_queue() callback that passes a fallback handler. This gets invoked through netdev_pick_tx(); fallback handler is currently __netdev_pick_tx() as most drivers invoke this function within their customized implementation in case for skbs that don't need any special handling. This fallback handler can then be replaced on other call-sites with different queue selection methods (e.g. in packet sockets, pktgen etc). This also has the nice side-effect that __netdev_pick_tx() is then only invoked from netdev_pick_tx() and export of that function to modules can be undone. Suggested-by: David S. Miller <davem@davemloft.net> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 +++--- drivers/net/ethernet/lantiq_etop.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 2 +- drivers/net/ethernet/tile/tilegx.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/wireless/mwifiex/main.c | 2 +- drivers/staging/bcm/Bcmnet.c | 2 +- drivers/staging/netlogic/xlr_net.c | 2 +- drivers/staging/rtl8188eu/os_dep/os_intfs.c | 2 +- include/linux/netdevice.h | 9 ++++++--- net/core/flow_dissector.c | 7 +++---- net/mac80211/iface.c | 6 ++++-- 17 files changed, 31 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 867664918715..1c6104d3501d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3707,7 +3707,7 @@ static inline int bond_slave_override(struct bonding *bond, static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 9d7419e0390b..66c0df78c3ff 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1873,7 +1873,7 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw) } u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct bnx2x *bp = netdev_priv(dev); @@ -1895,7 +1895,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, } /* select a non-FCoE queue */ - return __netdev_pick_tx(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); + return fallback(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); } void bnx2x_set_num_queues(struct bnx2x *bp) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index bfc58d488bb5..a89a40f88c25 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -496,7 +496,7 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, select_queue_fallback_t fallback); static inline void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6d4ada72dfd0..18076c4178b4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6881,7 +6881,7 @@ static inline int ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size) } static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct ixgbe_fwd_adapter *fwd_adapter = accel_priv; #ifdef IXGBE_FCOE @@ -6907,7 +6907,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) break; default: - return __netdev_pick_tx(dev, skb); + return fallback(dev, skb); } f = &adapter->ring_feature[RING_F_FCOE]; @@ -6920,7 +6920,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, return txq + f->offset; #else - return __netdev_pick_tx(dev, skb); + return fallback(dev, skb); #endif } diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 8f9266c64c75..fd4b6aecf6ee 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -619,7 +619,7 @@ ltq_etop_set_multicast_list(struct net_device *dev) static u16 ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* we are currently only using the first queue */ return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8e8a7eb43a2c..13457032d15f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -629,7 +629,7 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk } u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 rings_p_up = priv->num_tx_rings_p_up; @@ -641,7 +641,7 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, if (vlan_tx_tag_present(skb)) up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT; - return __netdev_pick_tx(dev, skb) % rings_p_up + up * rings_p_up; + return fallback(dev, skb) % rings_p_up + up * rings_p_up; } static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 3af04c3f42ea..9ca223bc90fc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -723,7 +723,7 @@ int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, select_queue_fallback_t fallback); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 023237a65720..17503da9f7a5 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2071,7 +2071,7 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) /* Return subqueue id on this core (one per core). */ static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return smp_processor_id(); } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 28407426fd6f..c8624a8235ab 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1648,7 +1648,7 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44c4db8450f0..8fe9cb7d0f72 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -366,7 +366,7 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) * hope the rxq no. may help here. */ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c index 4d79761b9c87..9d3d2758ec35 100644 --- a/drivers/net/wireless/mwifiex/main.c +++ b/drivers/net/wireless/mwifiex/main.c @@ -748,7 +748,7 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev) static u16 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { skb->priority = cfg80211_classify8021d(skb, NULL); return mwifiex_1d_to_wmm_queue[skb->priority]; diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c index 8dfdd2732bdc..95a2358267ba 100644 --- a/drivers/staging/bcm/Bcmnet.c +++ b/drivers/staging/bcm/Bcmnet.c @@ -40,7 +40,7 @@ static INT bcm_close(struct net_device *dev) } static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return ClassifyPacket(netdev_priv(dev), skb); } diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index eedffed17e39..6f9ac27730af 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -307,7 +307,7 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb, } static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return (u16)smp_processor_id(); } diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c index 68f98fa114d2..7c9ee58f47bb 100644 --- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c +++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c @@ -653,7 +653,7 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 21d4e6be8949..1de9c136b066 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -752,6 +752,9 @@ struct netdev_phys_port_id { unsigned char id_len; }; +typedef u16 (*select_queue_fallback_t)(struct net_device *dev, + struct sk_buff *skb); + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -783,7 +786,7 @@ struct netdev_phys_port_id { * Required can not be NULL. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * void *accel_priv); + * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple * transmit queues. * @@ -1005,7 +1008,8 @@ struct net_device_ops { struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, + select_queue_fallback_t fallback); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); @@ -1551,7 +1555,6 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, void *accel_priv); -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb); /* * Net namespace inlines diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 87577d447554..75fe83f590ea 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -372,7 +372,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -392,7 +392,6 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) return queue_index; } -EXPORT_SYMBOL(__netdev_pick_tx); struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, @@ -403,8 +402,8 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, - accel_priv); + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); else queue_index = __netdev_pick_tx(dev, skb); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d6d1f1df9119..ce1c44370610 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1057,7 +1057,8 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, + select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } @@ -1075,7 +1076,8 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, + select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; -- cgit From b9507bdaf40e91fea2b1c0c1ee7dc627c8ee6fd6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <dborkman@redhat.com> Date: Sun, 16 Feb 2014 15:55:21 +0100 Subject: netdevice: move netdev_cap_txqueue for shared usage to header In order to allow users to invoke netdev_cap_txqueue, it needs to be moved into netdevice.h header file. While at it, also add kernel doc header to document the API. Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/core/flow_dissector.c | 13 +------------ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1de9c136b066..e8eeebd49a98 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2278,6 +2278,26 @@ static inline void netdev_reset_queue(struct net_device *dev_queue) netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); } +/** + * netdev_cap_txqueue - check if selected tx queue exceeds device queues + * @dev: network device + * @queue_index: given tx queue index + * + * Returns 0 if given tx queue index >= number of device tx queues, + * otherwise returns the originally passed tx queue index. + */ +static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + return 0; + } + + return queue_index; +} + /** * netif_running - test if up * @dev: network device diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 75fe83f590ea..e29e810663d7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -408,7 +397,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, queue_index = __netdev_pick_tx(dev, skb); if (!accel_priv) - queue_index = dev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); -- cgit From 0fd5d57ba3456c4d0b77d1ae64be4818b47d7545 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <dborkman@redhat.com> Date: Sun, 16 Feb 2014 15:55:22 +0100 Subject: packet: check for ndo_select_queue during queue selection Mathias reported that on an AMD Geode LX embedded board (ALiX) with ath9k driver PACKET_QDISC_BYPASS, introduced in commit d346a3fae3ff ("packet: introduce PACKET_QDISC_BYPASS socket option"), triggers a WARN_ON() coming from the driver itself via 066dae93bdf ("ath9k: rework tx queue selection and fix queue stopping/waking"). The reason why this happened is that ndo_select_queue() call is not invoked from direct xmit path i.e. for ieee80211 subsystem that sets queue and TID (similar to 802.1d tag) which is being put into the frame through 802.11e (WMM, QoS). If that is not set, pending frame counter for e.g. ath9k can get messed up. So the WARN_ON() in ath9k is absolutely legitimate. Generally, the hw queue selection in ieee80211 depends on the type of traffic, and priorities are set according to ieee80211_ac_numbers mapping; working in a similar way as DiffServ only on a lower layer, so that the AP can favour frames that have "real-time" requirements like voice or video data frames. Therefore, check for presence of ndo_select_queue() in netdev ops and, if available, invoke it with a fallback handler to __packet_pick_tx_queue(), so that driver such as bnx2x, ixgbe, or mlx4 can still select a hw queue for transmission in relation to the current CPU while e.g. ieee80211 subsystem can make their own choices. Reported-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/packet/af_packet.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 6a2bb37506c5..b5dc1168f98a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -308,11 +308,27 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 packet_pick_tx_queue(struct net_device *dev) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) { return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; } +static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +{ + const struct net_device_ops *ops = dev->netdev_ops; + u16 queue_index; + + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb, NULL, + __packet_pick_tx_queue); + queue_index = netdev_cap_txqueue(dev, queue_index); + } else { + queue_index = __packet_pick_tx_queue(dev, skb); + } + + skb_set_queue_mapping(skb, queue_index); +} + /* register_prot_hook must be invoked with the po->bind_lock held, * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). @@ -2285,7 +2301,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } - skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); + packet_pick_tx_queue(dev, skb); + skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); @@ -2499,7 +2516,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); + + packet_pick_tx_queue(dev, skb); if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { -- cgit From 930cd6e46eadce8b8ed2a232ee536e5fd286c152 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@meshcoding.com> Date: Tue, 21 Jan 2014 11:22:05 +0100 Subject: batman-adv: fix soft-interface MTU computation The current MTU computation always returns a value smaller than 1500bytes even if the real interfaces have an MTU large enough to compensate the batman-adv overhead. Fix the computation by properly returning the highest admitted value. Introduced by a19d3d85e1b854e4a483a55d740a42458085560d ("batman-adv: limit local translation table max size") Reported-by: Russell Senior <russell@personaltelco.net> Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/hard-interface.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3d417d3641c6..b851cc580853 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -241,7 +241,7 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); const struct batadv_hard_iface *hard_iface; - int min_mtu = ETH_DATA_LEN; + int min_mtu = INT_MAX; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -256,8 +256,6 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) } rcu_read_unlock(); - atomic_set(&bat_priv->packet_size_max, min_mtu); - if (atomic_read(&bat_priv->fragmentation) == 0) goto out; @@ -268,13 +266,21 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE); min_mtu -= sizeof(struct batadv_frag_packet); min_mtu *= BATADV_FRAG_MAX_FRAGMENTS; - atomic_set(&bat_priv->packet_size_max, min_mtu); - - /* with fragmentation enabled we can fragment external packets easily */ - min_mtu = min_t(int, min_mtu, ETH_DATA_LEN); out: - return min_mtu - batadv_max_header_len(); + /* report to the other components the maximum amount of bytes that + * batman-adv can send over the wire (without considering the payload + * overhead). For example, this value is used by TT to compute the + * maximum local table table size + */ + atomic_set(&bat_priv->packet_size_max, min_mtu); + + /* the real soft-interface MTU is computed by removing the payload + * overhead from the maximum amount of bytes that was just computed. + * + * However batman-adv does not support MTUs bigger than ETH_DATA_LEN + */ + return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } /* adjusts the MTU if a new interface with a smaller MTU appeared. */ -- cgit From e889241f45f9cecbc84a6ffed577083ab52e62ee Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@meshcoding.com> Date: Mon, 27 Jan 2014 12:23:28 +0100 Subject: batman-adv: fix TT-TVLV parsing on OGM reception When accessing a TT-TVLV container in the OGM RX path the variable pointing to the list of changes to apply is altered by mistake. This makes the TT component read data at the wrong position in the OGM packet buffer. Fix it by removing the bogus pointer alteration. Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/translation-table.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b6071f675a3e..beba13fbd10a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3218,7 +3218,6 @@ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->tt_lock); - tt_change = (struct batadv_tvlv_tt_change *)tt_buff; batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes, ttvn, tt_change); -- cgit From 91c2b1a9f680ff105369d49abc7e19ca7efb33e1 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@meshcoding.com> Date: Tue, 28 Jan 2014 02:06:47 +0100 Subject: batman-adv: release vlan object after checking the CRC There is a refcounter unbalance in the CRC checking routine invoked on OGM reception. A vlan object is retrieved (thus its refcounter is increased by one) but it is never properly released. This leads to a memleak because the vlan object will never be free'd. Fix this by releasing the vlan object after having read the CRC. Reported-by: Russell Senior <russell@personaltelco.net> Reported-by: Daniel <daniel@makrotopia.org> Reported-by: cmsv <cmsv@wirelesspt.net> Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/translation-table.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index beba13fbd10a..c21c5572c860 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2262,6 +2262,7 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; + uint32_t crc; int i; /* check if each received CRC matches the locally stored one */ @@ -2281,7 +2282,10 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, if (!vlan) return false; - if (vlan->tt.crc != ntohl(tt_vlan_tmp->crc)) + crc = vlan->tt.crc; + batadv_orig_node_vlan_free_ref(vlan); + + if (crc != ntohl(tt_vlan_tmp->crc)) return false; } -- cgit From f1791425cf0bcda43ab9a9a37df1ad3ccb1f6654 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@meshcoding.com> Date: Thu, 30 Jan 2014 00:12:24 +0100 Subject: batman-adv: properly check pskb_may_pull return value pskb_may_pull() returns 1 on success and 0 in case of failure, therefore checking for the return value being negative does not make sense at all. This way if the function fails we will probably read beyond the current skb data buffer. Fix this by doing the proper check. Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/routing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 1ed9f7c9ecea..c26f07368849 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -688,7 +688,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, int is_old_ttvn; /* check if there is enough data before accessing it */ - if (pskb_may_pull(skb, hdr_len + ETH_HLEN) < 0) + if (!pskb_may_pull(skb, hdr_len + ETH_HLEN)) return 0; /* create a copy of the skb (in case of for re-routing) to modify it. */ -- cgit From 08bf0ed29c7ded45c477d08618220dd200c3524a Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@open-mesh.com> Date: Wed, 29 Jan 2014 11:25:12 +0100 Subject: batman-adv: avoid potential race condition when adding a new neighbour When adding a new neighbour it is important to atomically perform the following: - check if the neighbour already exists - append the neighbour to the proper list If the two operations are not performed in an atomic context it is possible that two concurrent insertions add the same neighbour twice. Signed-off-by: Antonio Quartulli <antonio@open-mesh.com> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/bat_iv_ogm.c | 22 ++++++++++++++++------ net/batman-adv/originator.c | 36 ++++++++++++++++++++++++++++++++++++ net/batman-adv/originator.h | 4 ++++ 3 files changed, 56 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 512159bf607f..094ae7ca50a0 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -266,7 +266,7 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, struct batadv_orig_node *orig_neigh) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_neigh_node *neigh_node; + struct batadv_neigh_node *neigh_node, *tmp_neigh_node; neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); if (!neigh_node) @@ -281,14 +281,24 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, neigh_node->orig_node = orig_neigh; neigh_node->if_incoming = hard_iface; - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM for orig_node %pM on interface %s\n", - neigh_addr, orig_node->orig, hard_iface->net_dev->name); - spin_lock_bh(&orig_node->neigh_list_lock); - hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, + neigh_addr); + if (!tmp_neigh_node) { + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + } else { + kfree(neigh_node); + batadv_hardif_free_ref(hard_iface); + neigh_node = tmp_neigh_node; + } spin_unlock_bh(&orig_node->neigh_list_lock); + if (!tmp_neigh_node) + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Creating new neighbor %pM for orig_node %pM on interface %s\n", + neigh_addr, orig_node->orig, + hard_iface->net_dev->name); + out: return neigh_node; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 6df12a2e3605..853941629dc1 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -457,6 +457,42 @@ out: return neigh_node; } +/** + * batadv_neigh_node_get - retrieve a neighbour from the list + * @orig_node: originator which the neighbour belongs to + * @hard_iface: the interface where this neighbour is connected to + * @addr: the address of the neighbour + * + * Looks for and possibly returns a neighbour belonging to this originator list + * which is connected through the provided hard interface. + * Returns NULL if the neighbour is not found. + */ +struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const uint8_t *addr) +{ + struct batadv_neigh_node *tmp_neigh_node, *res = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { + if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) + continue; + + if (tmp_neigh_node->if_incoming != hard_iface) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + res = tmp_neigh_node; + break; + } + rcu_read_unlock(); + + return res; +} + /** * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object * @rcu: rcu pointer of the orig_ifinfo object diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 37be290f63f6..db3a9ed734cb 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -29,6 +29,10 @@ void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const uint8_t *addr); +struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, const uint8_t *neigh_addr, struct batadv_orig_node *orig_node); -- cgit From b2262df7fcf2c395eca564df83238e931d88d7bf Mon Sep 17 00:00:00 2001 From: Simon Wunderlich <sw@simonwunderlich.de> Date: Sat, 8 Feb 2014 16:45:06 +0100 Subject: batman-adv: fix potential orig_node reference leak Since batadv_orig_node_new() sets the refcount to two, assuming that the calling function will use a reference for putting the orig_node into a hash or similar, both references must be freed if initialization of the orig_node fails. Otherwise that object may be leaked in that error case. Reported-by: Antonio Quartulli <antonio@meshcoding.com> Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> --- net/batman-adv/bat_iv_ogm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 094ae7ca50a0..42cbc0a68941 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -254,6 +254,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) free_bcast_own: kfree(orig_node->bat_iv.bcast_own); free_orig_node: + /* free twice, as batadv_orig_node_new sets refcount to 2 */ + batadv_orig_node_free_ref(orig_node); batadv_orig_node_free_ref(orig_node); return NULL; -- cgit From a30e22ca8464c2dc573e0144a972221c2f06c2cd Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@open-mesh.com> Date: Tue, 11 Feb 2014 17:05:06 +0100 Subject: batman-adv: fix TT CRC computation by ensuring byte order When computing the CRC on a 2byte variable the order of the bytes obviously alters the final result. This means that computing the CRC over the same value on two archs having different endianess leads to different numbers. The global and local translation table CRC computation routine makes this mistake while processing the clients VIDs. The result is a continuous CRC mismatching between nodes having different endianess. Fix this by converting the VID to Network Order before processing it. This guarantees that every node uses the same byte order. Introduced by 7ea7b4a142758deaf46c1af0ca9ceca6dd55138b ("batman-adv: make the TT CRC logic VLAN specific") Reported-by: Russel Senior <russell@personaltelco.net> Signed-off-by: Antonio Quartulli <antonio@open-mesh.com> Tested-by: Russell Senior <russell@personaltelco.net> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/translation-table.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index c21c5572c860..959dde721c46 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1975,6 +1975,7 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, struct hlist_head *head; uint32_t i, crc_tmp, crc = 0; uint8_t flags; + __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2011,8 +2012,11 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, orig_node)) continue; - crc_tmp = crc32c(0, &tt_common->vid, - sizeof(tt_common->vid)); + /* use network order to read the VID: this ensures that + * every node reads the bytes in the same order. + */ + tmp_vid = htons(tt_common->vid); + crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes @@ -2046,6 +2050,7 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, struct hlist_head *head; uint32_t i, crc_tmp, crc = 0; uint8_t flags; + __be16 tmp_vid; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2064,8 +2069,11 @@ static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, if (tt_common->flags & BATADV_TT_CLIENT_NEW) continue; - crc_tmp = crc32c(0, &tt_common->vid, - sizeof(tt_common->vid)); + /* use network order to read the VID: this ensures that + * every node reads the bytes in the same order. + */ + tmp_vid = htons(tt_common->vid); + crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid)); /* compute the CRC on flags that have to be kept in sync * among nodes -- cgit From 05c3c8a636aa9ee35ce13f65afc5b665615cc786 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@open-mesh.com> Date: Tue, 11 Feb 2014 17:05:07 +0100 Subject: batman-adv: free skb on TVLV parsing success When the TVLV parsing routine succeed the skb is left untouched thus leading to a memory leak. Fix this by consuming the skb in case of success. Introduced by ef26157747d42254453f6b3ac2bd8bd3c53339c3 ("batman-adv: tvlv - basic infrastructure") Reported-by: Russel Senior <russell@personaltelco.net> Signed-off-by: Antonio Quartulli <antonio@open-mesh.com> Tested-by: Russell Senior <russell@personaltelco.net> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/routing.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index c26f07368849..a953d5b196a3 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -918,6 +918,8 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, if (ret != NET_RX_SUCCESS) ret = batadv_route_unicast_packet(skb, recv_if); + else + consume_skb(skb); return ret; } -- cgit From a5a5cb8cab526af2f6cbe9715f8ca843192f0d81 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@meshcoding.com> Date: Sat, 15 Feb 2014 02:17:20 +0100 Subject: batman-adv: avoid double free when orig_node initialization fails In the failure path of the orig_node initialization routine the orig_node->bat_iv.bcast_own field is free'd twice: first in batadv_iv_ogm_orig_get() and then later in batadv_orig_node_free_rcu(). Fix it by removing the kfree in batadv_iv_ogm_orig_get(). Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch> --- net/batman-adv/bat_iv_ogm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 42cbc0a68941..8323bced8e5b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -241,18 +241,16 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) size = bat_priv->num_ifaces * sizeof(uint8_t); orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); if (!orig_node->bat_iv.bcast_own_sum) - goto free_bcast_own; + goto free_orig_node; hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) - goto free_bcast_own; + goto free_orig_node; return orig_node; -free_bcast_own: - kfree(orig_node->bat_iv.bcast_own); free_orig_node: /* free twice, as batadv_orig_node_new sets refcount to 2 */ batadv_orig_node_free_ref(orig_node); -- cgit From 70b271a78beba787155d6696aacd7c4d4a251c50 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli <antonio@meshcoding.com> Date: Sat, 15 Feb 2014 21:50:37 +0100 Subject: batman-adv: fix potential kernel paging error for unicast transmissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit batadv_send_skb_prepare_unicast(_4addr) might reallocate the skb's data. If it does then our ethhdr pointer is not valid anymore in batadv_send_skb_unicast(), resulting in a kernel paging error. Fixing this by refetching the ethhdr pointer after the potential reallocation. Signed-off-by: Linus Lüssing <linus.luessing@web.de> Signed-off-by: Antonio Quartulli <antonio@meshcoding.com> --- net/batman-adv/send.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 579f5f00a385..843febd1e519 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -254,9 +254,9 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { - struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + struct ethhdr *ethhdr; struct batadv_unicast_packet *unicast_packet; - int ret = NET_XMIT_DROP; + int ret = NET_XMIT_DROP, hdr_size; if (!orig_node) goto out; @@ -265,12 +265,16 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, case BATADV_UNICAST: if (!batadv_send_skb_prepare_unicast(skb, orig_node)) goto out; + + hdr_size = sizeof(*unicast_packet); break; case BATADV_UNICAST_4ADDR: if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, skb, orig_node, packet_subtype)) goto out; + + hdr_size = sizeof(struct batadv_unicast_4addr_packet); break; default: /* this function supports UNICAST and UNICAST_4ADDR only. It @@ -279,6 +283,7 @@ static int batadv_send_skb_unicast(struct batadv_priv *bat_priv, goto out; } + ethhdr = (struct ethhdr *)(skb->data + hdr_size); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route -- cgit From 08b44656c08c8c2f73cdac2a058be2880e3361f2 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel <nicolas.dichtel@6wind.com> Date: Mon, 17 Feb 2014 14:22:21 +0100 Subject: gre: add link local route when local addr is any This bug was reported by Steinar H. Gunderson and was introduced by commit f7cb8886335d ("sit/gre6: don't try to add the same route two times"). root@morgental:~# ip tunnel add foo mode gre remote 1.2.3.4 ttl 64 root@morgental:~# ip link set foo up mtu 1468 root@morgental:~# ip -6 route show dev foo fe80::/64 proto kernel metric 256 but after the above commit, no such route shows up. There is no link local route because dev->dev_addr is 0 (because local ipv4 address is 0), hence no link local address is configured. In this scenario, the link local address is added manually: 'ip -6 addr add fe80::1 dev foo' and because prefix is /128, no link local route is added by the kernel. Even if the right things to do is to add the link local address with a /64 prefix, we need to restore the previous behavior to avoid breaking userpace. Reported-by: Steinar H. Gunderson <sesse@samfundet.no> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ad235690684c..fdbfeca36d63 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2783,6 +2783,8 @@ static void addrconf_gre_config(struct net_device *dev) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) addrconf_add_linklocal(idev, &addr); + else + addrconf_prefix_route(&addr, 64, dev, 0, 0); } #endif -- cgit From a4b1b5877b514b276f0f31efe02388a9c2836728 Mon Sep 17 00:00:00 2001 From: David Herrmann <dh.herrmann@gmail.com> Date: Thu, 19 Dec 2013 12:09:32 +0100 Subject: HID: Bluetooth: hidp: make sure input buffers are big enough HID core expects the input buffers to be at least of size 4096 (HID_MAX_BUFFER_SIZE). Other sizes will result in buffer-overflows if an input-report is smaller than advertised. We could, like i2c, compute the biggest report-size instead of using HID_MAX_BUFFER_SIZE, but this will blow up if report-descriptors are changed after ->start() has been called. So lets be safe and just use the biggest buffer we have. Note that this adds an additional copy to the HIDP input path. If there is a way to make sure the skb-buf is big enough, we should use that instead. The best way would be to make hid-core honor the @size argument, though, that sounds easier than it is. So lets just fix the buffer-overflows for now and afterwards look for a faster way for all transport drivers. Signed-off-by: David Herrmann <dh.herrmann@gmail.com> Signed-off-by: Jiri Kosina <jkosina@suse.cz> --- net/bluetooth/hidp/core.c | 16 ++++++++++++++-- net/bluetooth/hidp/hidp.h | 4 ++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 292e619db896..d9fb93451442 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -430,6 +430,16 @@ static void hidp_del_timer(struct hidp_session *session) del_timer(&session->timer); } +static void hidp_process_report(struct hidp_session *session, + int type, const u8 *data, int len, int intr) +{ + if (len > HID_MAX_BUFFER_SIZE) + len = HID_MAX_BUFFER_SIZE; + + memcpy(session->input_buf, data, len); + hid_input_report(session->hid, type, session->input_buf, len, intr); +} + static void hidp_process_handshake(struct hidp_session *session, unsigned char param) { @@ -502,7 +512,8 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, hidp_input_report(session, skb); if (session->hid) - hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 0); + hidp_process_report(session, HID_INPUT_REPORT, + skb->data, skb->len, 0); break; case HIDP_DATA_RTYPE_OTHER: @@ -584,7 +595,8 @@ static void hidp_recv_intr_frame(struct hidp_session *session, hidp_input_report(session, skb); if (session->hid) { - hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 1); + hidp_process_report(session, HID_INPUT_REPORT, + skb->data, skb->len, 1); BT_DBG("report len %d", skb->len); } } else { diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index ab5241400cf7..8798492a6e99 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -24,6 +24,7 @@ #define __HIDP_H #include <linux/types.h> +#include <linux/hid.h> #include <linux/kref.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/l2cap.h> @@ -179,6 +180,9 @@ struct hidp_session { /* Used in hidp_output_raw_report() */ int output_report_success; /* boolean */ + + /* temporary input buffer */ + u8 input_buf[HID_MAX_BUFFER_SIZE]; }; /* HIDP init defines */ -- cgit From a6254864c08109c66a194612585afc0439005286 Mon Sep 17 00:00:00 2001 From: Duan Jiong <duanj.fnst@cn.fujitsu.com> Date: Mon, 17 Feb 2014 15:23:43 +0800 Subject: ipv4: fix counter in_slow_tot since commit 89aef8921bf("ipv4: Delete routing cache."), the counter in_slow_tot can't work correctly. The counter in_slow_tot increase by one when fib_lookup() return successfully in ip_route_input_slow(), but actually the dst struct maybe not be created and cached, so we can increase in_slow_tot after the dst struct is created. Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6f6dd85bdffc..4c011ec69ed4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1597,6 +1597,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1701,8 +1702,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto no_route; } - RT_CACHE_STAT_INC(in_slow_tot); - if (res.type == RTN_BROADCAST) goto brd_input; @@ -1773,6 +1772,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; -- cgit From ffd5939381c609056b33b7585fb05a77b4c695f3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <dborkman@redhat.com> Date: Mon, 17 Feb 2014 12:11:11 +0100 Subject: net: sctp: fix sctp_connectx abi for ia32 emulation/compat mode SCTP's sctp_connectx() abi breaks for 64bit kernels compiled with 32bit emulation (e.g. ia32 emulation or x86_x32). Due to internal usage of 'struct sctp_getaddrs_old' which includes a struct sockaddr pointer, sizeof(param) check will always fail in kernel as the structure in 64bit kernel space is 4bytes larger than for user binaries compiled in 32bit mode. Thus, applications making use of sctp_connectx() won't be able to run under such circumstances. Introduce a compat interface in the kernel to deal with such situations by using a 'struct compat_sctp_getaddrs_old' structure where user data is copied into it, and then sucessively transformed into a 'struct sctp_getaddrs_old' structure with the help of compat_ptr(). That fixes sctp_connectx() abi without any changes needed in user space, and lets the SCTP test suite pass when compiled in 32bit and run on 64bit kernels. Fixes: f9c67811ebc0 ("sctp: Fix regression introduced by new sctp_connectx api") Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Acked-by: Neil Horman <nhorman@tuxdriver.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sctp/socket.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7075ac847fde..981aaf8b6ace 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -64,6 +64,7 @@ #include <linux/crypto.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/compat.h> #include <net/ip.h> #include <net/icmp.h> @@ -1368,11 +1369,19 @@ static int sctp_setsockopt_connectx(struct sock *sk, /* * New (hopefully final) interface for the API. * We use the sctp_getaddrs_old structure so that use-space library - * can avoid any unnecessary allocations. The only defferent part + * can avoid any unnecessary allocations. The only different part * is that we store the actual length of the address buffer into the - * addrs_num structure member. That way we can re-use the existing + * addrs_num structure member. That way we can re-use the existing * code. */ +#ifdef CONFIG_COMPAT +struct compat_sctp_getaddrs_old { + sctp_assoc_t assoc_id; + s32 addr_num; + compat_uptr_t addrs; /* struct sockaddr * */ +}; +#endif + static int sctp_getsockopt_connectx3(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -1381,16 +1390,30 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, sctp_assoc_t assoc_id = 0; int err = 0; - if (len < sizeof(param)) - return -EINVAL; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + struct compat_sctp_getaddrs_old param32; - if (copy_from_user(¶m, optval, sizeof(param))) - return -EFAULT; + if (len < sizeof(param32)) + return -EINVAL; + if (copy_from_user(¶m32, optval, sizeof(param32))) + return -EFAULT; - err = __sctp_setsockopt_connectx(sk, - (struct sockaddr __user *)param.addrs, - param.addr_num, &assoc_id); + param.assoc_id = param32.assoc_id; + param.addr_num = param32.addr_num; + param.addrs = compat_ptr(param32.addrs); + } else +#endif + { + if (len < sizeof(param)) + return -EINVAL; + if (copy_from_user(¶m, optval, sizeof(param))) + return -EFAULT; + } + err = __sctp_setsockopt_connectx(sk, (struct sockaddr __user *) + param.addrs, param.addr_num, + &assoc_id); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; -- cgit From d7cf0c34af067555737193b6c1aa7abaa677f29c Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Tue, 18 Feb 2014 15:20:51 +0300 Subject: af_packet: remove a stray tab in packet_set_ring() At first glance it looks like there is a missing curly brace but actually the code works the same either way. I have adjusted the indenting but left the code the same. Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Acked-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5dc1168f98a..48a6a93db296 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3804,7 +3804,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, */ if (!tx_ring) init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); - break; + break; default: break; } -- cgit