diff options
Diffstat (limited to 'net/core/dst.c')
| -rw-r--r-- | net/core/dst.c | 300 | 
1 files changed, 67 insertions, 233 deletions
| diff --git a/net/core/dst.c b/net/core/dst.c index 960e503b5a52..00aa972ad1a1 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -42,108 +42,6 @@   * to dirty as few cache lines as possible in __dst_free().   * As this is not a very strong hint, we dont force an alignment on SMP.   */ -static struct { -	spinlock_t		lock; -	struct dst_entry	*list; -	unsigned long		timer_inc; -	unsigned long		timer_expires; -} dst_garbage = { -	.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), -	.timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry         *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ -	int    delayed = 0; -	int    work_performed = 0; -	unsigned long expires = ~0L; -	struct dst_entry *dst, *next, head; -	struct dst_entry *last = &head; - -	mutex_lock(&dst_gc_mutex); -	next = dst_busy_list; - -loop: -	while ((dst = next) != NULL) { -		next = dst->next; -		prefetch(&next->next); -		cond_resched(); -		if (likely(atomic_read(&dst->__refcnt))) { -			last->next = dst; -			last = dst; -			delayed++; -			continue; -		} -		work_performed++; - -		dst = dst_destroy(dst); -		if (dst) { -			/* NOHASH and still referenced. Unless it is already -			 * on gc list, invalidate it and add to gc list. -			 * -			 * Note: this is temporary. Actually, NOHASH dst's -			 * must be obsoleted when parent is obsoleted. -			 * But we do not have state "obsoleted, but -			 * referenced by parent", so it is right. -			 */ -			if (dst->obsolete > 0) -				continue; - -			___dst_free(dst); -			dst->next = next; -			next = dst; -		} -	} - -	spin_lock_bh(&dst_garbage.lock); -	next = dst_garbage.list; -	if (next) { -		dst_garbage.list = NULL; -		spin_unlock_bh(&dst_garbage.lock); -		goto loop; -	} -	last->next = NULL; -	dst_busy_list = head.next; -	if (!dst_busy_list) -		dst_garbage.timer_inc = DST_GC_MAX; -	else { -		/* -		 * if we freed less than 1/10 of delayed entries, -		 * we can sleep longer. -		 */ -		if (work_performed <= delayed/10) { -			dst_garbage.timer_expires += dst_garbage.timer_inc; -			if (dst_garbage.timer_expires > DST_GC_MAX) -				dst_garbage.timer_expires = DST_GC_MAX; -			dst_garbage.timer_inc += DST_GC_INC; -		} else { -			dst_garbage.timer_inc = DST_GC_INC; -			dst_garbage.timer_expires = DST_GC_MIN; -		} -		expires = dst_garbage.timer_expires; -		/* -		 * if the next desired timer is more than 4 seconds in the -		 * future then round the timer to whole seconds -		 */ -		if (expires > 4*HZ) -			expires = round_jiffies_relative(expires); -		schedule_delayed_work(&dst_gc_work, expires); -	} - -	spin_unlock_bh(&dst_garbage.lock); -	mutex_unlock(&dst_gc_mutex); -} -  int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)  {  	kfree_skb(skb); @@ -151,13 +49,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)  }  EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = {  	/* This initializer is needed to force linker to place this variable  	 * into const section. Otherwise it might end into bss section.  	 * We really want to avoid false sharing on this variable, and catch  	 * any writes on it.  	 */ -	[RTAX_MAX] = 0xdeadbeef, +	.refcnt = ATOMIC_INIT(1),  };  void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,  	if (dev)  		dev_hold(dev);  	dst->ops = ops; -	dst_init_metrics(dst, dst_default_metrics, true); +	dst_init_metrics(dst, dst_default_metrics.metrics, true);  	dst->expires = 0UL;  	dst->path = dst;  	dst->from = NULL; @@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,  }  EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry *dst) -{ -	/* The first case (dev==NULL) is required, when -	   protocol module is unloaded. -	 */ -	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { -		dst->input = dst_discard; -		dst->output = dst_discard_out; -	} -	dst->obsolete = DST_OBSOLETE_DEAD; -} - -void __dst_free(struct dst_entry *dst) -{ -	spin_lock_bh(&dst_garbage.lock); -	___dst_free(dst); -	dst->next = dst_garbage.list; -	dst_garbage.list = dst; -	if (dst_garbage.timer_inc > DST_GC_INC) { -		dst_garbage.timer_inc = DST_GC_INC; -		dst_garbage.timer_expires = DST_GC_MIN; -		mod_delayed_work(system_wq, &dst_gc_work, -				 dst_garbage.timer_expires); -	} -	spin_unlock_bh(&dst_garbage.lock); -} -EXPORT_SYMBOL(__dst_free); -  struct dst_entry *dst_destroy(struct dst_entry * dst)  {  	struct dst_entry *child;  	smp_rmb(); -again:  	child = dst->child;  	if (!(dst->flags & DST_NOCOUNT)) @@ -269,20 +138,8 @@ again:  		kmem_cache_free(dst->ops->kmem_cachep, dst);  	dst = child; -	if (dst) { -		int nohash = dst->flags & DST_NOHASH; - -		if (atomic_dec_and_test(&dst->__refcnt)) { -			/* We were real parent of this dst, so kill child. */ -			if (nohash) -				goto again; -		} else { -			/* Child is still referenced, return it for freeing. */ -			if (nohash) -				return dst; -			/* Child is still in his hash table */ -		} -	} +	if (dst) +		dst_release_immediate(dst);  	return NULL;  }  EXPORT_SYMBOL(dst_destroy); @@ -292,47 +149,88 @@ static void dst_destroy_rcu(struct rcu_head *head)  	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);  	dst = dst_destroy(dst); -	if (dst) -		__dst_free(dst);  } +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under loopback interface and discard all tx/rx packets + *    on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. + */ +void dst_dev_put(struct dst_entry *dst) +{ +	struct net_device *dev = dst->dev; + +	dst->obsolete = DST_OBSOLETE_DEAD; +	if (dst->ops->ifdown) +		dst->ops->ifdown(dst, dev, true); +	dst->input = dst_discard; +	dst->output = dst_discard_out; +	dst->dev = dev_net(dst->dev)->loopback_dev; +	dev_hold(dst->dev); +	dev_put(dev); +} +EXPORT_SYMBOL(dst_dev_put); +  void dst_release(struct dst_entry *dst)  {  	if (dst) {  		int newrefcnt; -		unsigned short nocache = dst->flags & DST_NOCACHE;  		newrefcnt = atomic_dec_return(&dst->__refcnt);  		if (unlikely(newrefcnt < 0))  			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",  					     __func__, dst, newrefcnt); -		if (!newrefcnt && unlikely(nocache)) +		if (!newrefcnt)  			call_rcu(&dst->rcu_head, dst_destroy_rcu);  	}  }  EXPORT_SYMBOL(dst_release); +void dst_release_immediate(struct dst_entry *dst) +{ +	if (dst) { +		int newrefcnt; + +		newrefcnt = atomic_dec_return(&dst->__refcnt); +		if (unlikely(newrefcnt < 0)) +			net_warn_ratelimited("%s: dst:%p refcnt:%d\n", +					     __func__, dst, newrefcnt); +		if (!newrefcnt) +			dst_destroy(dst); +	} +} +EXPORT_SYMBOL(dst_release_immediate); +  u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)  { -	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); +	struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);  	if (p) { -		u32 *old_p = __DST_METRICS_PTR(old); +		struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);  		unsigned long prev, new; -		memcpy(p, old_p, sizeof(u32) * RTAX_MAX); +		atomic_set(&p->refcnt, 1); +		memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));  		new = (unsigned long) p;  		prev = cmpxchg(&dst->_metrics, old, new);  		if (prev != old) {  			kfree(p); -			p = __DST_METRICS_PTR(prev); +			p = (struct dst_metrics *)__DST_METRICS_PTR(prev);  			if (prev & DST_METRICS_READ_ONLY)  				p = NULL; +		} else if (prev & DST_METRICS_REFCOUNTED) { +			if (atomic_dec_and_test(&old_p->refcnt)) +				kfree(old_p);  		}  	} -	return p; +	BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); +	return (u32 *)p;  }  EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -341,7 +239,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)  {  	unsigned long prev, new; -	new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; +	new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;  	prev = cmpxchg(&dst->_metrics, old, new);  	if (prev == old)  		kfree(__DST_METRICS_PTR(old)); @@ -366,21 +264,25 @@ static int dst_md_discard(struct sk_buff *skb)  	return 0;  } -static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) +static void __metadata_dst_init(struct metadata_dst *md_dst, +				enum metadata_type type, u8 optslen) +  {  	struct dst_entry *dst;  	dst = &md_dst->dst;  	dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, -		 DST_METADATA | DST_NOCACHE | DST_NOCOUNT); +		 DST_METADATA | DST_NOCOUNT);  	dst->input = dst_md_discard;  	dst->output = dst_md_discard_out;  	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); +	md_dst->type = type;  } -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, +					gfp_t flags)  {  	struct metadata_dst *md_dst; @@ -388,7 +290,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)  	if (!md_dst)  		return NULL; -	__metadata_dst_init(md_dst, optslen); +	__metadata_dst_init(md_dst, type, optslen);  	return md_dst;  } @@ -402,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)  	kfree(md_dst);  } -struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +struct metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)  {  	int cpu;  	struct metadata_dst __percpu *md_dst; @@ -413,77 +316,8 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)  		return NULL;  	for_each_possible_cpu(cpu) -		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); +		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);  	return md_dst;  }  EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); - -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. - */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, -		       int unregister) -{ -	if (dst->ops->ifdown) -		dst->ops->ifdown(dst, dev, unregister); - -	if (dev != dst->dev) -		return; - -	if (!unregister) { -		dst->input = dst_discard; -		dst->output = dst_discard_out; -	} else { -		dst->dev = dev_net(dst->dev)->loopback_dev; -		dev_hold(dst->dev); -		dev_put(dev); -	} -} - -static int dst_dev_event(struct notifier_block *this, unsigned long event, -			 void *ptr) -{ -	struct net_device *dev = netdev_notifier_info_to_dev(ptr); -	struct dst_entry *dst, *last = NULL; - -	switch (event) { -	case NETDEV_UNREGISTER_FINAL: -	case NETDEV_DOWN: -		mutex_lock(&dst_gc_mutex); -		for (dst = dst_busy_list; dst; dst = dst->next) { -			last = dst; -			dst_ifdown(dst, dev, event != NETDEV_DOWN); -		} - -		spin_lock_bh(&dst_garbage.lock); -		dst = dst_garbage.list; -		dst_garbage.list = NULL; -		spin_unlock_bh(&dst_garbage.lock); - -		if (last) -			last->next = dst; -		else -			dst_busy_list = dst; -		for (; dst; dst = dst->next) -			dst_ifdown(dst, dev, event != NETDEV_DOWN); -		mutex_unlock(&dst_gc_mutex); -		break; -	} -	return NOTIFY_DONE; -} - -static struct notifier_block dst_dev_notifier = { -	.notifier_call	= dst_dev_event, -	.priority = -10, /* must be called after other network notifiers */ -}; - -void __init dst_subsys_init(void) -{ -	register_netdevice_notifier(&dst_dev_notifier); -} |