diff options
Diffstat (limited to 'lib/rhashtable.c')
| -rw-r--r-- | lib/rhashtable.c | 1034 | 
1 files changed, 363 insertions, 671 deletions
| diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9cc4c4a90d00..4898442b837f 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1,13 +1,13 @@  /*   * Resizable, Scalable, Concurrent Hash Table   * + * Copyright (c) 2015 Herbert Xu <[email protected]>   * Copyright (c) 2014-2015 Thomas Graf <[email protected]>   * Copyright (c) 2008-2014 Patrick McHardy <[email protected]>   * - * Based on the following paper: - * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf - *   * Code partially derived from nft_hash + * Rewritten with rehash code from br_multicast plus single list + * pointer as suggested by Josh Triplett   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -17,6 +17,7 @@  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/log2.h> +#include <linux/sched.h>  #include <linux/slab.h>  #include <linux/vmalloc.h>  #include <linux/mm.h> @@ -26,121 +27,18 @@  #include <linux/err.h>  #define HASH_DEFAULT_SIZE	64UL -#define HASH_MIN_SIZE		4UL +#define HASH_MIN_SIZE		4U  #define BUCKET_LOCKS_PER_CPU   128UL -/* Base bits plus 1 bit for nulls marker */ -#define HASH_RESERVED_SPACE	(RHT_BASE_BITS + 1) - -enum { -	RHT_LOCK_NORMAL, -	RHT_LOCK_NESTED, -}; - -/* The bucket lock is selected based on the hash and protects mutations - * on a group of hash buckets. - * - * A maximum of tbl->size/2 bucket locks is allocated. This ensures that - * a single lock always covers both buckets which may both contains - * entries which link to the same bucket of the old table during resizing. - * This allows to simplify the locking as locking the bucket in both - * tables during resize always guarantee protection. - * - * IMPORTANT: When holding the bucket lock of both the old and new table - * during expansions and shrinking, the old bucket lock must always be - * acquired first. - */ -static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) -{ -	return &tbl->locks[hash & tbl->locks_mask]; -} - -static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) -{ -	return (void *) he - ht->p.head_offset; -} - -static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) -{ -	return hash & (tbl->size - 1); -} - -static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) -{ -	u32 hash; - -	if (unlikely(!ht->p.key_len)) -		hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); -	else -		hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, -				    ht->p.hash_rnd); - -	return hash >> HASH_RESERVED_SPACE; -} - -static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) -{ -	return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE; -} - -static u32 head_hashfn(const struct rhashtable *ht, +static u32 head_hashfn(struct rhashtable *ht,  		       const struct bucket_table *tbl,  		       const struct rhash_head *he)  { -	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); +	return rht_head_hashfn(ht, tbl, he, ht->p);  }  #ifdef CONFIG_PROVE_LOCKING -static void debug_dump_buckets(const struct rhashtable *ht, -			       const struct bucket_table *tbl) -{ -	struct rhash_head *he; -	unsigned int i, hash; - -	for (i = 0; i < tbl->size; i++) { -		pr_warn(" [Bucket %d] ", i); -		rht_for_each_rcu(he, tbl, i) { -			hash = head_hashfn(ht, tbl, he); -			pr_cont("[hash = %#x, lock = %p] ", -				hash, bucket_lock(tbl, hash)); -		} -		pr_cont("\n"); -	} - -} - -static void debug_dump_table(struct rhashtable *ht, -			     const struct bucket_table *tbl, -			     unsigned int hash) -{ -	struct bucket_table *old_tbl, *future_tbl; - -	pr_emerg("BUG: lock for hash %#x in table %p not held\n", -		 hash, tbl); - -	rcu_read_lock(); -	future_tbl = rht_dereference_rcu(ht->future_tbl, ht); -	old_tbl = rht_dereference_rcu(ht->tbl, ht); -	if (future_tbl != old_tbl) { -		pr_warn("Future table %p (size: %zd)\n", -			future_tbl, future_tbl->size); -		debug_dump_buckets(ht, future_tbl); -	} - -	pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size); -	debug_dump_buckets(ht, old_tbl); - -	rcu_read_unlock(); -} -  #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)				\ -	do {								\ -		if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) {	\ -			debug_dump_table(HT, TBL, HASH);		\ -			BUG();						\ -		}							\ -	} while (0)  int lockdep_rht_mutex_is_held(struct rhashtable *ht)  { @@ -150,30 +48,18 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);  int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)  { -	spinlock_t *lock = bucket_lock(tbl, hash); +	spinlock_t *lock = rht_bucket_lock(tbl, hash);  	return (debug_locks) ? lockdep_is_held(lock) : 1;  }  EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);  #else  #define ASSERT_RHT_MUTEX(HT) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)  #endif -static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) -{ -	struct rhash_head __rcu **pprev; - -	for (pprev = &tbl->buckets[n]; -	     !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); -	     pprev = &rht_dereference_bucket(*pprev, tbl, n)->next) -		; - -	return pprev; -} - -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, +			      gfp_t gfp)  {  	unsigned int i, size;  #if defined(CONFIG_PROVE_LOCKING) @@ -190,12 +76,13 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)  	if (sizeof(spinlock_t) != 0) {  #ifdef CONFIG_NUMA -		if (size * sizeof(spinlock_t) > PAGE_SIZE) +		if (size * sizeof(spinlock_t) > PAGE_SIZE && +		    gfp == GFP_KERNEL)  			tbl->locks = vmalloc(size * sizeof(spinlock_t));  		else  #endif  		tbl->locks = kmalloc_array(size, sizeof(spinlock_t), -					   GFP_KERNEL); +					   gfp);  		if (!tbl->locks)  			return -ENOMEM;  		for (i = 0; i < size; i++) @@ -214,155 +101,181 @@ static void bucket_table_free(const struct bucket_table *tbl)  	kvfree(tbl);  } +static void bucket_table_free_rcu(struct rcu_head *head) +{ +	bucket_table_free(container_of(head, struct bucket_table, rcu)); +} +  static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, -					       size_t nbuckets) +					       size_t nbuckets, +					       gfp_t gfp)  { -	struct bucket_table *tbl; +	struct bucket_table *tbl = NULL;  	size_t size;  	int i;  	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); -	tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); -	if (tbl == NULL) +	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || +	    gfp != GFP_KERNEL) +		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); +	if (tbl == NULL && gfp == GFP_KERNEL)  		tbl = vzalloc(size); -  	if (tbl == NULL)  		return NULL;  	tbl->size = nbuckets; -	if (alloc_bucket_locks(ht, tbl) < 0) { +	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {  		bucket_table_free(tbl);  		return NULL;  	} +	INIT_LIST_HEAD(&tbl->walkers); + +	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); +  	for (i = 0; i < nbuckets; i++)  		INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);  	return tbl;  } -/** - * rht_grow_above_75 - returns true if nelems > 0.75 * table-size - * @ht:		hash table - * @new_size:	new table size - */ -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, +						  struct bucket_table *tbl)  { -	/* Expand table when exceeding 75% load */ -	return atomic_read(&ht->nelems) > (new_size / 4 * 3) && -	       (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift); -} -EXPORT_SYMBOL_GPL(rht_grow_above_75); +	struct bucket_table *new_tbl; -/** - * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size - * @ht:		hash table - * @new_size:	new table size - */ -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) -{ -	/* Shrink table beneath 30% load */ -	return atomic_read(&ht->nelems) < (new_size * 3 / 10) && -	       (atomic_read(&ht->shift) > ht->p.min_shift); -} -EXPORT_SYMBOL_GPL(rht_shrink_below_30); +	do { +		new_tbl = tbl; +		tbl = rht_dereference_rcu(tbl->future_tbl, ht); +	} while (tbl); -static void lock_buckets(struct bucket_table *new_tbl, -			 struct bucket_table *old_tbl, unsigned int hash) -	__acquires(old_bucket_lock) -{ -	spin_lock_bh(bucket_lock(old_tbl, hash)); -	if (new_tbl != old_tbl) -		spin_lock_bh_nested(bucket_lock(new_tbl, hash), -				    RHT_LOCK_NESTED); +	return new_tbl;  } -static void unlock_buckets(struct bucket_table *new_tbl, -			   struct bucket_table *old_tbl, unsigned int hash) -	__releases(old_bucket_lock) +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)  { -	if (new_tbl != old_tbl) -		spin_unlock_bh(bucket_lock(new_tbl, hash)); -	spin_unlock_bh(bucket_lock(old_tbl, hash)); +	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); +	struct bucket_table *new_tbl = rhashtable_last_table(ht, +		rht_dereference_rcu(old_tbl->future_tbl, ht)); +	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; +	int err = -ENOENT; +	struct rhash_head *head, *next, *entry; +	spinlock_t *new_bucket_lock; +	unsigned int new_hash; + +	rht_for_each(entry, old_tbl, old_hash) { +		err = 0; +		next = rht_dereference_bucket(entry->next, old_tbl, old_hash); + +		if (rht_is_a_nulls(next)) +			break; + +		pprev = &entry->next; +	} + +	if (err) +		goto out; + +	new_hash = head_hashfn(ht, new_tbl, entry); + +	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + +	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); +	head = rht_dereference_bucket(new_tbl->buckets[new_hash], +				      new_tbl, new_hash); + +	if (rht_is_a_nulls(head)) +		INIT_RHT_NULLS_HEAD(entry->next, ht, new_hash); +	else +		RCU_INIT_POINTER(entry->next, head); + +	rcu_assign_pointer(new_tbl->buckets[new_hash], entry); +	spin_unlock(new_bucket_lock); + +	rcu_assign_pointer(*pprev, next); + +out: +	return err;  } -/** - * Unlink entries on bucket which hash to different bucket. - * - * Returns true if no more work needs to be performed on the bucket. - */ -static bool hashtable_chain_unzip(struct rhashtable *ht, -				  const struct bucket_table *new_tbl, -				  struct bucket_table *old_tbl, -				  size_t old_hash) +static void rhashtable_rehash_chain(struct rhashtable *ht, +				    unsigned int old_hash)  { -	struct rhash_head *he, *p, *next; -	unsigned int new_hash, new_hash2; +	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); +	spinlock_t *old_bucket_lock; -	ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash); +	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); -	/* Old bucket empty, no work needed. */ -	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, -				   old_hash); -	if (rht_is_a_nulls(p)) -		return false; - -	new_hash = head_hashfn(ht, new_tbl, p); -	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); +	spin_lock_bh(old_bucket_lock); +	while (!rhashtable_rehash_one(ht, old_hash)) +		; +	old_tbl->rehash++; +	spin_unlock_bh(old_bucket_lock); +} -	/* Advance the old bucket pointer one or more times until it -	 * reaches a node that doesn't hash to the same bucket as the -	 * previous node p. Call the previous node p; -	 */ -	rht_for_each_continue(he, p->next, old_tbl, old_hash) { -		new_hash2 = head_hashfn(ht, new_tbl, he); -		ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2); +static int rhashtable_rehash_attach(struct rhashtable *ht, +				    struct bucket_table *old_tbl, +				    struct bucket_table *new_tbl) +{ +	/* Protect future_tbl using the first bucket lock. */ +	spin_lock_bh(old_tbl->locks); -		if (new_hash != new_hash2) -			break; -		p = he; +	/* Did somebody beat us to it? */ +	if (rcu_access_pointer(old_tbl->future_tbl)) { +		spin_unlock_bh(old_tbl->locks); +		return -EEXIST;  	} -	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); -	/* Find the subsequent node which does hash to the same -	 * bucket as node P, or NULL if no such node exists. +	/* Make insertions go into the new, empty table right away. Deletions +	 * and lookups will be attempted in both tables until we synchronize.  	 */ -	INIT_RHT_NULLS_HEAD(next, ht, old_hash); -	if (!rht_is_a_nulls(he)) { -		rht_for_each_continue(he, he->next, old_tbl, old_hash) { -			if (head_hashfn(ht, new_tbl, he) == new_hash) { -				next = he; -				break; -			} -		} -	} +	rcu_assign_pointer(old_tbl->future_tbl, new_tbl); -	/* Set p's next pointer to that subsequent node pointer, -	 * bypassing the nodes which do not hash to p's bucket -	 */ -	rcu_assign_pointer(p->next, next); +	/* Ensure the new table is visible to readers. */ +	smp_wmb(); -	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, -				   old_hash); +	spin_unlock_bh(old_tbl->locks); -	return !rht_is_a_nulls(p); +	return 0;  } -static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, -			    unsigned int new_hash, struct rhash_head *entry) +static int rhashtable_rehash_table(struct rhashtable *ht)  { -	ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); +	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); +	struct bucket_table *new_tbl; +	struct rhashtable_walker *walker; +	unsigned int old_hash; + +	new_tbl = rht_dereference(old_tbl->future_tbl, ht); +	if (!new_tbl) +		return 0; + +	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) +		rhashtable_rehash_chain(ht, old_hash); + +	/* Publish the new table pointer. */ +	rcu_assign_pointer(ht->tbl, new_tbl); -	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); +	spin_lock(&ht->lock); +	list_for_each_entry(walker, &old_tbl->walkers, list) +		walker->tbl = NULL; +	spin_unlock(&ht->lock); + +	/* Wait for readers. All new readers will see the new +	 * table, and thus no references to the old table will +	 * remain. +	 */ +	call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + +	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;  }  /**   * rhashtable_expand - Expand hash table while allowing concurrent lookups   * @ht:		the hash table to expand   * - * A secondary bucket array is allocated and the hash entries are migrated - * while keeping them on both lists until the end of the RCU grace period. + * A secondary bucket array is allocated and the hash entries are migrated.   *   * This function may only be called in a context where it is safe to call   * synchronize_rcu(), e.g. not within a rcu_read_lock() section. @@ -373,87 +286,32 @@ static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl,   * It is valid to have concurrent insertions and deletions protected by per   * bucket locks or concurrent RCU protected lookups and traversals.   */ -int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_expand(struct rhashtable *ht)  {  	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); -	struct rhash_head *he; -	unsigned int new_hash, old_hash; -	bool complete = false; +	int err;  	ASSERT_RHT_MUTEX(ht); -	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); +	old_tbl = rhashtable_last_table(ht, old_tbl); + +	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);  	if (new_tbl == NULL)  		return -ENOMEM; -	atomic_inc(&ht->shift); +	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); +	if (err) +		bucket_table_free(new_tbl); -	/* Make insertions go into the new, empty table right away. Deletions -	 * and lookups will be attempted in both tables until we synchronize. -	 * The synchronize_rcu() guarantees for the new table to be picked up -	 * so no new additions go into the old table while we relink. -	 */ -	rcu_assign_pointer(ht->future_tbl, new_tbl); -	synchronize_rcu(); - -	/* For each new bucket, search the corresponding old bucket for the -	 * first entry that hashes to the new bucket, and link the end of -	 * newly formed bucket chain (containing entries added to future -	 * table) to that entry. Since all the entries which will end up in -	 * the new bucket appear in the same old bucket, this constructs an -	 * entirely valid new hash table, but with multiple buckets -	 * "zipped" together into a single imprecise chain. -	 */ -	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { -		old_hash = rht_bucket_index(old_tbl, new_hash); -		lock_buckets(new_tbl, old_tbl, new_hash); -		rht_for_each(he, old_tbl, old_hash) { -			if (head_hashfn(ht, new_tbl, he) == new_hash) { -				link_old_to_new(ht, new_tbl, new_hash, he); -				break; -			} -		} -		unlock_buckets(new_tbl, old_tbl, new_hash); -	} - -	/* Unzip interleaved hash chains */ -	while (!complete && !ht->being_destroyed) { -		/* Wait for readers. All new readers will see the new -		 * table, and thus no references to the old table will -		 * remain. -		 */ -		synchronize_rcu(); - -		/* For each bucket in the old table (each of which -		 * contains items from multiple buckets of the new -		 * table): ... -		 */ -		complete = true; -		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { -			lock_buckets(new_tbl, old_tbl, old_hash); - -			if (hashtable_chain_unzip(ht, new_tbl, old_tbl, -						  old_hash)) -				complete = false; - -			unlock_buckets(new_tbl, old_tbl, old_hash); -		} -	} - -	rcu_assign_pointer(ht->tbl, new_tbl); -	synchronize_rcu(); - -	bucket_table_free(old_tbl); -	return 0; +	return err;  } -EXPORT_SYMBOL_GPL(rhashtable_expand);  /**   * rhashtable_shrink - Shrink hash table while allowing concurrent lookups   * @ht:		the hash table to shrink   * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically.   *   * The caller must ensure that no concurrent resizing occurs by holding   * ht->mutex. @@ -464,403 +322,146 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);   * It is valid to have concurrent insertions and deletions protected by per   * bucket locks or concurrent RCU protected lookups and traversals.   */ -int rhashtable_shrink(struct rhashtable *ht) +static int rhashtable_shrink(struct rhashtable *ht)  { -	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); -	unsigned int new_hash; +	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); +	unsigned int size; +	int err;  	ASSERT_RHT_MUTEX(ht); -	new_tbl = bucket_table_alloc(ht, tbl->size / 2); -	if (new_tbl == NULL) -		return -ENOMEM; - -	rcu_assign_pointer(ht->future_tbl, new_tbl); -	synchronize_rcu(); - -	/* Link the first entry in the old bucket to the end of the -	 * bucket in the new table. As entries are concurrently being -	 * added to the new table, lock down the new bucket. As we -	 * always divide the size in half when shrinking, each bucket -	 * in the new table maps to exactly two buckets in the old -	 * table. -	 */ -	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { -		lock_buckets(new_tbl, tbl, new_hash); - -		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), -				   tbl->buckets[new_hash]); -		ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size); -		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), -				   tbl->buckets[new_hash + new_tbl->size]); +	size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); +	if (size < ht->p.min_size) +		size = ht->p.min_size; -		unlock_buckets(new_tbl, tbl, new_hash); -	} +	if (old_tbl->size <= size) +		return 0; -	/* Publish the new, valid hash table */ -	rcu_assign_pointer(ht->tbl, new_tbl); -	atomic_dec(&ht->shift); +	if (rht_dereference(old_tbl->future_tbl, ht)) +		return -EEXIST; -	/* Wait for readers. No new readers will have references to the -	 * old hash table. -	 */ -	synchronize_rcu(); +	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); +	if (new_tbl == NULL) +		return -ENOMEM; -	bucket_table_free(tbl); +	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); +	if (err) +		bucket_table_free(new_tbl); -	return 0; +	return err;  } -EXPORT_SYMBOL_GPL(rhashtable_shrink);  static void rht_deferred_worker(struct work_struct *work)  {  	struct rhashtable *ht;  	struct bucket_table *tbl; -	struct rhashtable_walker *walker; +	int err = 0;  	ht = container_of(work, struct rhashtable, run_work);  	mutex_lock(&ht->mutex); -	if (ht->being_destroyed) -		goto unlock;  	tbl = rht_dereference(ht->tbl, ht); +	tbl = rhashtable_last_table(ht, tbl); -	list_for_each_entry(walker, &ht->walkers, list) -		walker->resize = true; - -	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) +	if (rht_grow_above_75(ht, tbl))  		rhashtable_expand(ht); -	else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) +	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))  		rhashtable_shrink(ht); -unlock: +	err = rhashtable_rehash_table(ht); +  	mutex_unlock(&ht->mutex); -} -static void rhashtable_wakeup_worker(struct rhashtable *ht) -{ -	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); -	struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); -	size_t size = tbl->size; - -	/* Only adjust the table if no resizing is currently in progress. */ -	if (tbl == new_tbl && -	    ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) || -	     (ht->p.shrink_decision && ht->p.shrink_decision(ht, size)))) +	if (err)  		schedule_work(&ht->run_work);  } -static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, -				struct bucket_table *tbl, u32 hash) +static bool rhashtable_check_elasticity(struct rhashtable *ht, +					struct bucket_table *tbl, +					unsigned int hash)  { +	unsigned int elasticity = ht->elasticity;  	struct rhash_head *head; -	hash = rht_bucket_index(tbl, hash); -	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - -	ASSERT_BUCKET_LOCK(ht, tbl, hash); - -	if (rht_is_a_nulls(head)) -		INIT_RHT_NULLS_HEAD(obj->next, ht, hash); -	else -		RCU_INIT_POINTER(obj->next, head); - -	rcu_assign_pointer(tbl->buckets[hash], obj); +	rht_for_each(head, tbl, hash) +		if (!--elasticity) +			return true; -	atomic_inc(&ht->nelems); - -	rhashtable_wakeup_worker(ht); +	return false;  } -/** - * rhashtable_insert - insert object into hash table - * @ht:		hash table - * @obj:	pointer to hash head inside object - * - * Will take a per bucket spinlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) +int rhashtable_insert_rehash(struct rhashtable *ht)  { -	struct bucket_table *tbl, *old_tbl; -	unsigned hash; - -	rcu_read_lock(); - -	tbl = rht_dereference_rcu(ht->future_tbl, ht); -	old_tbl = rht_dereference_rcu(ht->tbl, ht); -	hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - -	lock_buckets(tbl, old_tbl, hash); -	__rhashtable_insert(ht, obj, tbl, hash); -	unlock_buckets(tbl, old_tbl, hash); - -	rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(rhashtable_insert); - -/** - * rhashtable_remove - remove object from hash table - * @ht:		hash table - * @obj:	pointer to hash head inside object - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * The caller must ensure that no concurrent table mutations occur. It is - * however valid to have concurrent lookups if they are RCU protected. - */ -bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) -{ -	struct bucket_table *tbl, *new_tbl, *old_tbl; -	struct rhash_head __rcu **pprev; -	struct rhash_head *he, *he2; -	unsigned int hash, new_hash; -	bool ret = false; +	struct bucket_table *old_tbl; +	struct bucket_table *new_tbl; +	struct bucket_table *tbl; +	unsigned int size; +	int err; -	rcu_read_lock();  	old_tbl = rht_dereference_rcu(ht->tbl, ht); -	tbl = new_tbl = rht_dereference_rcu(ht->future_tbl, ht); -	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - -	lock_buckets(new_tbl, old_tbl, new_hash); -restart: -	hash = rht_bucket_index(tbl, new_hash); -	pprev = &tbl->buckets[hash]; -	rht_for_each(he, tbl, hash) { -		if (he != obj) { -			pprev = &he->next; -			continue; -		} - -		ASSERT_BUCKET_LOCK(ht, tbl, hash); - -		if (old_tbl->size > new_tbl->size && tbl == old_tbl && -		    !rht_is_a_nulls(obj->next) && -		    head_hashfn(ht, tbl, obj->next) != hash) { -			rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); -		} else if (unlikely(old_tbl->size < new_tbl->size && tbl == new_tbl)) { -			rht_for_each_continue(he2, obj->next, tbl, hash) { -				if (head_hashfn(ht, tbl, he2) == hash) { -					rcu_assign_pointer(*pprev, he2); -					goto found; -				} -			} - -			rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); -		} else { -			rcu_assign_pointer(*pprev, obj->next); -		} - -found: -		ret = true; -		break; -	} +	tbl = rhashtable_last_table(ht, old_tbl); -	/* The entry may be linked in either 'tbl', 'future_tbl', or both. -	 * 'future_tbl' only exists for a short period of time during -	 * resizing. Thus traversing both is fine and the added cost is -	 * very rare. -	 */ -	if (tbl != old_tbl) { -		tbl = old_tbl; -		goto restart; -	} +	size = tbl->size; -	unlock_buckets(new_tbl, old_tbl, new_hash); - -	if (ret) { -		atomic_dec(&ht->nelems); -		rhashtable_wakeup_worker(ht); -	} - -	rcu_read_unlock(); +	if (rht_grow_above_75(ht, tbl)) +		size *= 2; +	/* More than two rehashes (not resizes) detected. */ +	else if (WARN_ON(old_tbl != tbl && old_tbl->size == size)) +		return -EBUSY; -	return ret; -} -EXPORT_SYMBOL_GPL(rhashtable_remove); - -struct rhashtable_compare_arg { -	struct rhashtable *ht; -	const void *key; -}; - -static bool rhashtable_compare(void *ptr, void *arg) -{ -	struct rhashtable_compare_arg *x = arg; -	struct rhashtable *ht = x->ht; - -	return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len); -} - -/** - * rhashtable_lookup - lookup key in hash table - * @ht:		hash table - * @key:	pointer to key - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - */ -void *rhashtable_lookup(struct rhashtable *ht, const void *key) -{ -	struct rhashtable_compare_arg arg = { -		.ht = ht, -		.key = key, -	}; - -	BUG_ON(!ht->p.key_len); - -	return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup); - -/** - * rhashtable_lookup_compare - search hash table with compare function - * @ht:		hash table - * @key:	the pointer to the key - * @compare:	compare function, must return true on match - * @arg:	argument passed on to compare function - * - * Traverses the bucket chain behind the provided hash value and calls the - * specified compare function for each entry. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Returns the first entry on which the compare function returned true. - */ -void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, -				bool (*compare)(void *, void *), void *arg) -{ -	const struct bucket_table *tbl, *old_tbl; -	struct rhash_head *he; -	u32 hash; - -	rcu_read_lock(); - -	old_tbl = rht_dereference_rcu(ht->tbl, ht); -	tbl = rht_dereference_rcu(ht->future_tbl, ht); -	hash = key_hashfn(ht, key, ht->p.key_len); -restart: -	rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { -		if (!compare(rht_obj(ht, he), arg)) -			continue; -		rcu_read_unlock(); -		return rht_obj(ht, he); -	} +	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); +	if (new_tbl == NULL) +		return -ENOMEM; -	if (unlikely(tbl != old_tbl)) { -		tbl = old_tbl; -		goto restart; -	} -	rcu_read_unlock(); +	err = rhashtable_rehash_attach(ht, tbl, new_tbl); +	if (err) { +		bucket_table_free(new_tbl); +		if (err == -EEXIST) +			err = 0; +	} else +		schedule_work(&ht->run_work); -	return NULL; +	return err;  } -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); +EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -/** - * rhashtable_lookup_insert - lookup and insert object into hash table - * @ht:		hash table - * @obj:	pointer to hash head inside object - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +int rhashtable_insert_slow(struct rhashtable *ht, const void *key, +			   struct rhash_head *obj, +			   struct bucket_table *tbl)  { -	struct rhashtable_compare_arg arg = { -		.ht = ht, -		.key = rht_obj(ht, obj) + ht->p.key_offset, -	}; +	struct rhash_head *head; +	unsigned int hash; +	int err; -	BUG_ON(!ht->p.key_len); +	tbl = rhashtable_last_table(ht, tbl); +	hash = head_hashfn(ht, tbl, obj); +	spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); -	return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, -						&arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); +	err = -EEXIST; +	if (key && rhashtable_lookup_fast(ht, key, ht->p)) +		goto exit; -/** - * rhashtable_lookup_compare_insert - search and insert object to hash table - *                                    with compare function - * @ht:		hash table - * @obj:	pointer to hash head inside object - * @compare:	compare function, must return true on match - * @arg:	argument passed on to compare function - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_compare_insert(struct rhashtable *ht, -				      struct rhash_head *obj, -				      bool (*compare)(void *, void *), -				      void *arg) -{ -	struct bucket_table *new_tbl, *old_tbl; -	u32 new_hash; -	bool success = true; +	err = -EAGAIN; +	if (rhashtable_check_elasticity(ht, tbl, hash) || +	    rht_grow_above_100(ht, tbl)) +		goto exit; -	BUG_ON(!ht->p.key_len); +	err = 0; -	rcu_read_lock(); -	old_tbl = rht_dereference_rcu(ht->tbl, ht); -	new_tbl = rht_dereference_rcu(ht->future_tbl, ht); -	new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); +	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); -	lock_buckets(new_tbl, old_tbl, new_hash); +	RCU_INIT_POINTER(obj->next, head); -	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, -				      compare, arg)) { -		success = false; -		goto exit; -	} +	rcu_assign_pointer(tbl->buckets[hash], obj); -	__rhashtable_insert(ht, obj, new_tbl, new_hash); +	atomic_inc(&ht->nelems);  exit: -	unlock_buckets(new_tbl, old_tbl, new_hash); -	rcu_read_unlock(); +	spin_unlock(rht_bucket_lock(tbl, hash)); -	return success; +	return err;  } -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); +EXPORT_SYMBOL_GPL(rhashtable_insert_slow);  /**   * rhashtable_walk_init - Initialise an iterator @@ -895,7 +496,8 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)  		return -ENOMEM;  	mutex_lock(&ht->mutex); -	list_add(&iter->walker->list, &ht->walkers); +	iter->walker->tbl = rht_dereference(ht->tbl, ht); +	list_add(&iter->walker->list, &iter->walker->tbl->walkers);  	mutex_unlock(&ht->mutex);  	return 0; @@ -911,7 +513,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init);  void rhashtable_walk_exit(struct rhashtable_iter *iter)  {  	mutex_lock(&iter->ht->mutex); -	list_del(&iter->walker->list); +	if (iter->walker->tbl) +		list_del(&iter->walker->list);  	mutex_unlock(&iter->ht->mutex);  	kfree(iter->walker);  } @@ -932,13 +535,21 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit);   * by calling rhashtable_walk_next.   */  int rhashtable_walk_start(struct rhashtable_iter *iter) +	__acquires(RCU)  { +	struct rhashtable *ht = iter->ht; + +	mutex_lock(&ht->mutex); + +	if (iter->walker->tbl) +		list_del(&iter->walker->list); +  	rcu_read_lock(); -	if (iter->walker->resize) { -		iter->slot = 0; -		iter->skip = 0; -		iter->walker->resize = false; +	mutex_unlock(&ht->mutex); + +	if (!iter->walker->tbl) { +		iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);  		return -EAGAIN;  	} @@ -960,13 +571,11 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start);   */  void *rhashtable_walk_next(struct rhashtable_iter *iter)  { -	const struct bucket_table *tbl; +	struct bucket_table *tbl = iter->walker->tbl;  	struct rhashtable *ht = iter->ht;  	struct rhash_head *p = iter->p;  	void *obj = NULL; -	tbl = rht_dereference_rcu(ht->tbl, ht); -  	if (p) {  		p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot);  		goto next; @@ -992,17 +601,20 @@ next:  		iter->skip = 0;  	} -	iter->p = NULL; +	/* Ensure we see any new tables. */ +	smp_rmb(); -out: -	if (iter->walker->resize) { -		iter->p = NULL; +	iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); +	if (iter->walker->tbl) {  		iter->slot = 0;  		iter->skip = 0; -		iter->walker->resize = false;  		return ERR_PTR(-EAGAIN);  	} +	iter->p = NULL; + +out: +  	return obj;  }  EXPORT_SYMBOL_GPL(rhashtable_walk_next); @@ -1014,16 +626,39 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next);   * Finish a hash table walk.   */  void rhashtable_walk_stop(struct rhashtable_iter *iter) +	__releases(RCU)  { -	rcu_read_unlock(); +	struct rhashtable *ht; +	struct bucket_table *tbl = iter->walker->tbl; + +	if (!tbl) +		goto out; + +	ht = iter->ht; + +	spin_lock(&ht->lock); +	if (tbl->rehash < tbl->size) +		list_add(&iter->walker->list, &tbl->walkers); +	else +		iter->walker->tbl = NULL; +	spin_unlock(&ht->lock); +  	iter->p = NULL; + +out: +	rcu_read_unlock();  }  EXPORT_SYMBOL_GPL(rhashtable_walk_stop); -static size_t rounded_hashtable_size(struct rhashtable_params *params) +static size_t rounded_hashtable_size(const struct rhashtable_params *params)  {  	return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), -		   1UL << params->min_shift); +		   (unsigned long)params->min_size); +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ +	return jhash2(key, length, seed);  }  /** @@ -1056,7 +691,7 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)   *	struct rhash_head	node;   * };   * - * u32 my_hash_fn(const void *data, u32 seed) + * u32 my_hash_fn(const void *data, u32 len, u32 seed)   * {   *	struct test_obj *obj = data;   * @@ -1069,72 +704,129 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)   *	.obj_hashfn = my_hash_fn,   * };   */ -int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) +int rhashtable_init(struct rhashtable *ht, +		    const struct rhashtable_params *params)  {  	struct bucket_table *tbl;  	size_t size;  	size = HASH_DEFAULT_SIZE; -	if ((params->key_len && !params->hashfn) || -	    (!params->key_len && !params->obj_hashfn)) +	if ((!params->key_len && !params->obj_hashfn) || +	    (params->obj_hashfn && !params->obj_cmpfn))  		return -EINVAL;  	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))  		return -EINVAL; -	params->min_shift = max_t(size_t, params->min_shift, -				  ilog2(HASH_MIN_SIZE)); -  	if (params->nelem_hint)  		size = rounded_hashtable_size(params);  	memset(ht, 0, sizeof(*ht));  	mutex_init(&ht->mutex); +	spin_lock_init(&ht->lock);  	memcpy(&ht->p, params, sizeof(*params)); -	INIT_LIST_HEAD(&ht->walkers); + +	if (params->min_size) +		ht->p.min_size = roundup_pow_of_two(params->min_size); + +	if (params->max_size) +		ht->p.max_size = rounddown_pow_of_two(params->max_size); + +	ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + +	/* The maximum (not average) chain length grows with the +	 * size of the hash table, at a rate of (log N)/(log log N). +	 * The value of 16 is selected so that even if the hash +	 * table grew to 2^32 you would not expect the maximum +	 * chain length to exceed it unless we are under attack +	 * (or extremely unlucky). +	 * +	 * As this limit is only to detect attacks, we don't need +	 * to set it to a lower value as you'd need the chain +	 * length to vastly exceed 16 to have any real effect +	 * on the system. +	 */ +	if (!params->insecure_elasticity) +		ht->elasticity = 16;  	if (params->locks_mul)  		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);  	else  		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; -	tbl = bucket_table_alloc(ht, size); +	ht->key_len = ht->p.key_len; +	if (!params->hashfn) { +		ht->p.hashfn = jhash; + +		if (!(ht->key_len & (sizeof(u32) - 1))) { +			ht->key_len /= sizeof(u32); +			ht->p.hashfn = rhashtable_jhash2; +		} +	} + +	tbl = bucket_table_alloc(ht, size, GFP_KERNEL);  	if (tbl == NULL)  		return -ENOMEM;  	atomic_set(&ht->nelems, 0); -	atomic_set(&ht->shift, ilog2(tbl->size)); -	RCU_INIT_POINTER(ht->tbl, tbl); -	RCU_INIT_POINTER(ht->future_tbl, tbl); -	if (!ht->p.hash_rnd) -		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); +	RCU_INIT_POINTER(ht->tbl, tbl); -	if (ht->p.grow_decision || ht->p.shrink_decision) -		INIT_WORK(&ht->run_work, rht_deferred_worker); +	INIT_WORK(&ht->run_work, rht_deferred_worker);  	return 0;  }  EXPORT_SYMBOL_GPL(rhashtable_init);  /** - * rhashtable_destroy - destroy hash table + * rhashtable_free_and_destroy - free elements and destroy hash table   * @ht:		the hash table to destroy + * @free_fn:	callback to release resources of element + * @arg:	pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array.   * - * Frees the bucket array. This function is not rcu safe, therefore the caller - * has to make sure that no resizing may happen by unpublishing the hashtable - * and waiting for the quiescent cycle before releasing the bucket array. + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel.   */ -void rhashtable_destroy(struct rhashtable *ht) +void rhashtable_free_and_destroy(struct rhashtable *ht, +				 void (*free_fn)(void *ptr, void *arg), +				 void *arg)  { -	ht->being_destroyed = true; +	const struct bucket_table *tbl; +	unsigned int i; -	if (ht->p.grow_decision || ht->p.shrink_decision) -		cancel_work_sync(&ht->run_work); +	cancel_work_sync(&ht->run_work);  	mutex_lock(&ht->mutex); -	bucket_table_free(rht_dereference(ht->tbl, ht)); +	tbl = rht_dereference(ht->tbl, ht); +	if (free_fn) { +		for (i = 0; i < tbl->size; i++) { +			struct rhash_head *pos, *next; + +			for (pos = rht_dereference(tbl->buckets[i], ht), +			     next = !rht_is_a_nulls(pos) ? +					rht_dereference(pos->next, ht) : NULL; +			     !rht_is_a_nulls(pos); +			     pos = next, +			     next = !rht_is_a_nulls(pos) ? +					rht_dereference(pos->next, ht) : NULL) +				free_fn(rht_obj(ht, pos), arg); +		} +	} + +	bucket_table_free(tbl);  	mutex_unlock(&ht->mutex);  } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ +	return rhashtable_free_and_destroy(ht, NULL, NULL); +}  EXPORT_SYMBOL_GPL(rhashtable_destroy); |