From 3e12939a2a67fbb4cbd962c3b9bc398c73319766 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 04:01:57 -0700 Subject: inet: Kill FLOWI_FLAG_PRECOW_METRICS. No longer needed. TCP writes metrics, but now in it's own special cache that does not dirty the route metrics. Therefore there is no longer any reason to pre-cow metrics in this way. Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net/inet_sock.h') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index ae17e1352d7e..924d7b98ab60 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -245,8 +245,6 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) flags |= FLOWI_FLAG_ANYSRC; - if (sk->sk_protocol == IPPROTO_TCP) - flags |= FLOWI_FLAG_PRECOW_METRICS; return flags; } -- cgit From 92101b3b2e3178087127709a556b091dae314e9e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Jul 2012 16:29:00 -0700 Subject: ipv4: Prepare for change of rt->rt_iif encoding. Use inet_iif() consistently, and for TCP record the input interface of cached RX dst in inet sock. rt->rt_iif is going to be encoded differently, so that we can legitimately cache input routes in the FIB info more aggressively. When the input interface is "use SKB device index" the rt->rt_iif will be set to zero. This forces us to move the TCP RX dst cache installation into the ipv4 specific code, and as well it should since doing the route caching for ipv6 is pointless at the moment since it is not inspected in the ipv6 input paths yet. Also, remove the unlikely on dst->obsolete, all ipv4 dsts have obsolete set to a non-zero value to force invocation of the check callback. Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + net/dccp/ipv4.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv4/ip_sockglue.c | 5 ++--- net/ipv4/route.c | 2 +- net/ipv4/tcp_input.c | 12 ------------ net/ipv4/tcp_ipv4.c | 24 ++++++++++++++++++------ net/sched/cls_route.c | 2 +- net/sched/em_meta.c | 2 +- net/sctp/protocol.c | 2 +- 10 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/net/inet_sock.h') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 924d7b98ab60..613cfa401672 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -172,6 +172,7 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; + int rx_dst_ifindex; struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 25428d0c50c9..176ecdba4a22 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -481,7 +481,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct rtable *rt; const struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { - .flowi4_oif = skb_rtable(skb)->rt_iif, + .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, .flowi4_tos = RT_CONN_FLAGS(sk), diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f2a06beffbd3..f2eccd531746 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -571,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->rt_iif); + dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index de29f46f68b0..5eea4a811042 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1027,10 +1027,9 @@ e_inval: void ipv4_pktinfo_prepare(struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - const struct rtable *rt = skb_rtable(skb); - if (rt) { - pktinfo->ipi_ifindex = rt->rt_iif; + if (skb_rtable(skb)) { + pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34017be87c85..f6be78119396 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -848,7 +848,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (log_martians && peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", - &ip_hdr(skb)->saddr, rt->rt_iif, + &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &rt->rt_gateway); #endif } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21d7f8f3a7a5..3e07a64ca44e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5391,18 +5391,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; - if (unlikely(dst->obsolete)) { - if (dst->ops->check(dst, 0) == NULL) { - dst_release(dst); - sk->sk_rx_dst = NULL; - } - } - } - if (unlikely(sk->sk_rx_dst == NULL)) - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - /* * Header prediction. * The code loosely follows the one in the famous diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bc5432e3c778..3e30548ac32a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1618,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb); + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (unlikely(sk->sk_rx_dst == NULL)) { + struct inet_sock *icsk = inet_sk(sk); + struct rtable *rt = skb_rtable(skb); + + sk->sk_rx_dst = dst_clone(&rt->dst); + icsk->rx_dst_ifindex = inet_iif(skb); + } if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1700,14 +1714,12 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { struct dst_entry *dst = sk->sk_rx_dst; + struct inet_sock *icsk = inet_sk(sk); if (dst) dst = dst_check(dst, 0); - if (dst) { - struct rtable *rt = (struct rtable *) dst; - - if (rt->rt_iif == dev->ifindex) - skb_dst_set_noref(skb, dst); - } + if (dst && + icsk->rx_dst_ifindex == dev->ifindex) + skb_dst_set_noref(skb, dst); } } } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 36fec4227401..44f405cb9aaf 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -143,7 +143,7 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (head == NULL) goto old_method; - iif = ((struct rtable *)dst)->rt_iif; + iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 4790c696cbce..4ab6e3325573 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -264,7 +264,7 @@ META_COLLECTOR(int_rtiif) if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = skb_rtable(skb)->rt_iif; + dst->value = inet_iif(skb); } /************************************************************************** diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9c90811d1134..1f89c4e69645 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -568,7 +568,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { - return skb_rtable(skb)->rt_iif; + return inet_iif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ -- cgit From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 23:20:37 +0000 Subject: net: ipv4: fix RCU races on dst refcounts commit c6cffba4ffa2 (ipv4: Fix input route performance regression.) added various fatal races with dst refcounts. crashes happen on tcp workloads if routes are added/deleted at the same time. The dst_free() calls from free_fib_info_rcu() are clearly racy. We need instead regular dst refcounting (dst_release()) and make sure dst_release() is aware of RCU grace periods : Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period before dst destruction for cached dst Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero() to make sure we dont increase a zero refcount (On a dst currently waiting an rcu grace period before destruction) rt_cache_route() must take a reference on the new cached route, and release it if was not able to install it. With this patch, my machines survive various benchmarks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 +------ include/net/inet_sock.h | 13 +++++++++++++ net/core/dst.c | 26 +++++++++++++++++++++----- net/decnet/dn_route.c | 6 ++++++ net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/route.c | 16 ++++------------ net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_minisocks.c | 3 +-- 9 files changed, 55 insertions(+), 35 deletions(-) (limited to 'include/net/inet_sock.h') diff --git a/include/net/dst.h b/include/net/dst.h index baf597890064..31a9fd39edb6 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,6 +61,7 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 +#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -382,12 +383,6 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 613cfa401672..e3fd34c83ac9 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,4 +249,17 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (atomic_inc_not_zero(&dst->__refcnt)) { + if (!(dst->flags & DST_RCU_FREE)) + dst->flags |= DST_RCU_FREE; + + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +} + #endif /* _INET_SOCK_H */ diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d29414..d9e33ebe170f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,6 +258,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_rcu_destroy(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -265,10 +274,14 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); + if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { + if (dst->flags & DST_RCU_FREE) { + call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); + } else { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } } @@ -320,11 +333,14 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { + bool hold; + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; + if (unlikely(hold)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 85a3604c87c8..26719779ad8e 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,6 +184,12 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da0cc2e6b250..e55171f184f9 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head) if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) - dst_free(&nexthop_nh->nh_rth_output->dst); + dst_release(&nexthop_nh->nh_rth_output->dst); if (nexthop_nh->nh_rth_input) - dst_free(&nexthop_nh->nh_rth_input->dst); + dst_release(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fc1a81ca79a7..d6eabcfe8a90 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,11 +1199,6 @@ restart: fnhe->fnhe_stamp = jiffies; } -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p = &nh->nh_rth_output; @@ -1213,17 +1208,14 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) orig = *p; + rt->dst.flags |= DST_RCU_FREE; + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - rt_free(orig); + dst_release(&orig->dst); } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + dst_release(&rt->dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a356e1fecf9a..9be30b039ae3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fbd9921253f..7f91e5ac8277 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; + if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + if (unlikely(sk->sk_rx_dst == NULL)) + inet_sk_rx_dst_set(sk, skb); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3f1cc2028edd..232a90c3ec86 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newsk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to -- cgit From 54764bb647b2e847c512acf8d443df965da35000 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 01:08:23 +0000 Subject: ipv4: Restore old dst_free() behavior. commit 404e0a8b6a55 (net: ipv4: fix RCU races on dst refcounts) tried to solve a race but added a problem at device/fib dismantle time : We really want to call dst_free() as soon as possible, even if sockets still have dst in their cache. dst_release() calls in free_fib_info_rcu() are not welcomed. Root of the problem was that now we also cache output routes (in nh_rth_output), we must use call_rcu() instead of call_rcu_bh() in rt_free(), because output route lookups are done in process context. Based on feedback and initial patch from David Miller (adding another call_rcu_bh() call in fib, but it appears it was not the right fix) I left the inet_sk_rx_dst_set() helper and added __rcu attributes to nh_rth_output and nh_rth_input to better document what is going on in this code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 ++++++- include/net/inet_sock.h | 10 +++------- include/net/ip_fib.h | 4 ++-- net/core/dst.c | 26 +++++--------------------- net/decnet/dn_route.c | 6 ------ net/ipv4/fib_semantics.c | 21 +++++++++++++++++---- net/ipv4/route.c | 26 +++++++++++++++++--------- 7 files changed, 50 insertions(+), 50 deletions(-) (limited to 'include/net/inet_sock.h') diff --git a/include/net/dst.h b/include/net/dst.h index 31a9fd39edb6..baf597890064 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,7 +61,6 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 -#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -383,6 +382,12 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e3fd34c83ac9..83b567fe1941 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,13 +253,9 @@ static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb { struct dst_entry *dst = skb_dst(skb); - if (atomic_inc_not_zero(&dst->__refcnt)) { - if (!(dst->flags & DST_RCU_FREE)) - dst->flags |= DST_RCU_FREE; - - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } #endif /* _INET_SOCK_H */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e69c3a47153d..e521a03515b1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -81,8 +81,8 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable *nh_rth_output; - struct rtable *nh_rth_input; + struct rtable __rcu *nh_rth_output; + struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/core/dst.c b/net/core/dst.c index d9e33ebe170f..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,15 +258,6 @@ again: } EXPORT_SYMBOL(dst_destroy); -static void dst_rcu_destroy(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); -} - void dst_release(struct dst_entry *dst) { if (dst) { @@ -274,14 +265,10 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { - if (dst->flags & DST_RCU_FREE) { - call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); - } else { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); } } } @@ -333,14 +320,11 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { - bool hold; - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; - if (unlikely(hold)) { + if (unlikely(dst->flags & DST_NOCACHE)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 26719779ad8e..85a3604c87c8 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,12 +184,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e55171f184f9..625cf185c489 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -161,6 +161,21 @@ static void free_nh_exceptions(struct fib_nh *nh) kfree(hash); } +static void rt_nexthop_free(struct rtable __rcu **rtp) +{ + struct rtable *rt = rcu_dereference_protected(*rtp, 1); + + if (!rt) + return; + + /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); + * because we waited an RCU grace period before calling + * free_fib_info_rcu() + */ + + dst_free(&rt->dst); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -171,10 +186,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - if (nexthop_nh->nh_rth_output) - dst_release(&nexthop_nh->nh_rth_output->dst); - if (nexthop_nh->nh_rth_input) - dst_release(&nexthop_nh->nh_rth_input->dst); + rt_nexthop_free(&nexthop_nh->nh_rth_output); + rt_nexthop_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6eabcfe8a90..2bd107477469 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,23 +1199,31 @@ restart: fnhe->fnhe_stamp = jiffies; } +static inline void rt_free(struct rtable *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { - struct rtable *orig, *prev, **p = &nh->nh_rth_output; + struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output; if (rt_is_input_route(rt)) - p = &nh->nh_rth_input; + p = (struct rtable **)&nh->nh_rth_input; orig = *p; - rt->dst.flags |= DST_RCU_FREE; - dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - dst_release(&orig->dst); + rt_free(orig); } else { - dst_release(&rt->dst); + /* Routes we intend to cache in the FIB nexthop have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; } } @@ -1412,7 +1420,7 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = false; if (res->fi) { if (!itag) { - rth = FIB_RES_NH(*res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -1574,7 +1582,7 @@ local_input: do_cache = false; if (res.fi) { if (!itag) { - rth = FIB_RES_NH(res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -1742,7 +1750,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (fi) { fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); if (!fnhe) { - rth = FIB_RES_NH(*res).nh_rth_output; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; -- cgit